noxu_tree/tree.rs
1//! B+tree implementation.
2//!
3//!
4//! Tree implements the B+tree. It provides search, insert, and delete
5//! operations on the tree structure. The tree uses latch-coupling for
6//! concurrent access: when traversing down the tree, the parent latch
7//! is released after the child latch is acquired.
8//!
9//! # Architecture
10//!
11//! The tree has a hierarchical structure:
12//! - Internal Nodes (IN) at levels 2 and above
13//! - Bottom Internal Nodes (BIN) at level 1
14//! - Leaf Nodes (LN) containing actual data
15//!
16//! # Locking Strategy
17//!
18//! - Root latch protects the root pointer itself
19//! - Each node has its own latch for concurrent access
20//! - Search uses latch-coupling: acquire child, release parent
21//! - Modifications may require exclusive latches
22
23use crate::error::TreeError;
24use crate::key::{create_key_prefix, get_key_prefix_length};
25use crate::search_result::SearchResult;
26use noxu_latch::{LatchContext, SharedLatch};
27use noxu_util::{Lsn, NULL_LSN};
28use parking_lot::RwLock;
29use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
30use std::sync::{Arc, Weak};
31
/// Observer through which the tree reports cache-residency events for IN/BIN
/// nodes, mirroring JE's `INList` feeding the evictor's `LRUList`s.
///
/// The tree owns no eviction policy of its own; instead it notifies a
/// registered listener whenever an IN/BIN node enters the resident cache, is
/// accessed, or is removed. The `Evictor` (in `noxu-evictor`) implements this
/// trait, but the dependency is one-way (`noxu-evictor` → `noxu-tree`), so the
/// tree refers to the listener only through this trait object, which avoids a
/// circular crate dependency.
///
/// Implementations must be `Send + Sync` (required by the trait bound).
///
/// JE reference: `IN.fetchTarget` / split / `rebuildINList` call
/// `Evictor.addBack`; node access calls `Evictor.moveBack`; node removal
/// calls `Evictor.remove`.
pub trait InListListener: Send + Sync {
    /// The node identified by `node_id` has just become resident in the cache
    /// (JE `Evictor.addBack`).
    fn note_ins_added(&self, node_id: u64);
    /// The resident node identified by `node_id` was accessed
    /// (JE `Evictor.moveBack` — LRU touch).
    fn note_ins_accessed(&self, node_id: u64);
    /// The node identified by `node_id` was removed from the cache
    /// (JE `Evictor.remove`).
    fn note_ins_removed(&self, node_id: u64);
}
52
// Level and flag constants re-exported here for tree-internal use.
//
// NOTE(review): `EXACT_MATCH` (1 << 16) has the same numeric value as
// `MAIN_LEVEL` (0x10000), and `INSERT_SUCCESS` (1 << 17) the same as
// `DBMAP_LEVEL` (0x20000). Presumably the two pairs live in different value
// spaces (search-result flags vs. node levels) — confirm before mixing them.

/// Level flag bit `0x20000`; presumably marks nodes of the DB-mapping tree —
/// TODO confirm against the users of this constant.
pub const DBMAP_LEVEL: i32 = 0x20000;
/// Level flag bit `0x10000`; presumably marks nodes of a main (user) tree —
/// TODO confirm against the users of this constant.
pub const MAIN_LEVEL: i32 = 0x10000;
/// Mask selecting the low 16 bits of a level value (the part below the
/// `MAIN_LEVEL` / `DBMAP_LEVEL` flag bits).
pub const LEVEL_MASK: i32 = 0x0ffff;
/// Sentinel level `-1`, lower than any level built from the flags above.
pub const MIN_LEVEL: i32 = -1;
/// Level of a BIN: the `MAIN_LEVEL` flag combined with level number 1.
pub const BIN_LEVEL: i32 = MAIN_LEVEL | 1;
/// Flag bit 16; by its name, set on a search result when the key matched
/// exactly — TODO confirm at the call sites.
pub const EXACT_MATCH: i32 = 1 << 16;
/// Flag bit 17; by its name, set on an insert result when the insert
/// succeeded — TODO confirm at the call sites.
pub const INSERT_SUCCESS: i32 = 1 << 17;
61
/// Per-slot fixed memory overhead for a BIN entry, in bytes (DBI-23).
///
/// This is the heap footprint of one `BinEntry` *struct* as it lives inside
/// the BIN's `Vec<BinEntry>` buffer. It does NOT include the variable-length
/// key and data bytes, which are separate heap allocations counted on top of
/// this value.
///
/// Faithful to JE `IN.getEntryInMemorySize` plus the per-slot `entryStates` /
/// LSN-array overhead folded into `IN.computeMemorySize` (IN.java ~4632).
/// JE measures the slot's fixed cost with `Sizeof` on the JVM; Rust has a
/// fixed struct layout, so `size_of::<BinEntry>()` is exact.
///
/// T-2/T-3: the per-slot `key` (`Vec<u8>` header) and `lsn` (`u64`) were
/// hoisted out of `BinEntry` into the node-level `KeyRep`/`LsnRep`, which
/// shrank `size_of::<BinEntry>()`. The packed per-slot LSN-rep cost
/// (`LsnRep::BYTES_PER_LSN_ENTRY`, 4 bytes) is added back here so the
/// incremental live counter still approximates the walked heap. The key bytes
/// are charged separately as `key.len()` at the call site, matching the
/// compact key rep.
///
/// The value is derived rather than hard-coded, so a layout change to
/// `BinEntry` is tracked automatically — see `bin_stub_conformance` for the
/// drift guard.
pub const BIN_ENTRY_OVERHEAD: usize =
    std::mem::size_of::<BinEntry>() + LsnRep::BYTES_PER_LSN_ENTRY;
84
/// Per-slot fixed memory overhead for an IN entry, in bytes (DBI-23).
///
/// This is the heap footprint of one `InEntry` struct inside the IN's
/// `Vec<InEntry>` buffer; key bytes are counted separately. It corresponds to
/// JE `IN.getEntryInMemorySize` for an upper IN plus the per-slot
/// state/LSN/target overhead from `IN.computeMemorySize`.
///
/// Derived from `size_of::<InEntry>()`, so it follows layout changes to
/// `InEntry` automatically.
pub const IN_ENTRY_OVERHEAD: usize = std::mem::size_of::<InEntry>();
92
/// Type alias for the key comparator used by sorted-duplicate databases.
///
/// The comparator takes two full (uncompressed) keys and returns their
/// relative ordering. For sorted-dup databases this is `DupKeyData::compare`,
/// which splits each key into primary + data parts and applies separate
/// comparators to each. For normal databases no comparator is installed
/// (the owning field is `None`) and lexicographic byte comparison is used.
///
/// The closure is shared via `Arc` and must be `Send + Sync`.
///
/// JE reference: `DatabaseImpl.btreeComparator` / `DatabaseImpl.dupComparator`.
pub type KeyComparatorFn =
    Arc<dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering + Send + Sync>;
104
/// Combined search result carrying slot data and the BIN arc, returned by
/// [`Tree::search_with_data`].
///
/// Avoids the double-descent pattern where `Tree::search` checked key
/// existence and a second call re-descended to fetch the actual slot bytes.
/// One descent now serves both purposes (Wave-11-I optimisation).
pub struct SlotFetch {
    /// `true` if an exact key match was found and is not expired.
    pub found: bool,
    /// Data bytes for the slot (`None` when `found` is `false`).
    pub data: Option<Vec<u8>>,
    /// Raw slot LSN as `u64`; zero when `found` is `false`.
    pub lsn: u64,
    /// Slot index within the BIN. Set to the actual BIN slot index when
    /// `found` is `true`; `0` otherwise.
    ///
    /// Used by `CursorImpl` to set `current_index` correctly so that
    /// `retrieve_next` advances to the right slot after a search.
    pub slot_index: usize,
    /// Arc to the BIN that the descent reached. The field is not optional:
    /// every `SlotFetch` carries a BIN, whether or not `found` is `true`.
    pub bin_arc: Arc<RwLock<TreeNode>>,
}
128
/// The B+tree.
///
/// This is the main tree structure that manages the B+tree nodes and
/// provides operations for search, insert, delete, and tree maintenance.
pub struct Tree {
    /// Database ID this tree belongs to.
    database_id: u64,

    /// Maximum entries per node (from config).
    max_entries_per_node: usize,

    /// Root of the tree. `None` if the tree is empty.
    ///
    /// Wrapped in `RwLock` so that `insert`, `delete`, and other mutating
    /// operations can take `&self` (interior mutability), enabling concurrent
    /// access to different BIN nodes without requiring a global `&mut Tree`
    /// borrow. The root pointer itself is only written during root splits
    /// and initial creation; all other access is read-only.
    ///
    /// JE reference: `Tree.root`, protected by the root latch.
    root: RwLock<Option<Arc<RwLock<TreeNode>>>>,

    /// Latch protecting the root reference itself.
    /// Must be held when changing the root pointer.
    root_latch: SharedLatch,

    /// LSN at which the current root IN/BIN was last logged.
    ///
    /// Used by the IN-redo currency check (`recover_root_bin` /
    /// `recover_root_upper_in`) to decide whether a logged root replaces the
    /// in-memory one. Updated whenever a new root is installed via
    /// `set_root_with_lsn` or the IN-redo recover-root path.
    ///
    /// JE `RootUpdater.originalLsn` / `ChildReference.getLsn()` for the root.
    root_log_lsn: RwLock<noxu_util::Lsn>,

    /// Statistics: number of times the root has been split.
    root_splits: AtomicU64,

    /// Statistics: number of latch upgrades from shared to exclusive.
    relatches_required: AtomicU64,

    /// Optional custom key comparator for sorted-duplicate databases.
    ///
    /// When `Some`, all key comparisons in tree traversal (upper IN routing
    /// and BIN entry search/insert/delete) use this comparator instead of
    /// lexicographic byte comparison.
    ///
    /// JE reference: `DatabaseImpl.btreeComparator` / `dupComparator`, stored
    /// on the database and consulted at every `IN.findEntry()` call.
    pub key_comparator: Option<KeyComparatorFn>,

    /// Shared memory counter for the evictor / MemoryBudget.
    ///
    /// Updated on every BIN entry insert (+key+data+overhead) and delete
    /// (-key+overhead) so the evictor sees real cache pressure.
    ///
    /// JE reference: the `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
    /// call in `IN.updateMemorySize()`. In Noxu the counter is an
    /// `Arc<AtomicI64>` shared with the `Arbiter` (and later `MemoryBudget`)
    /// to avoid a circular crate dependency (`noxu-tree` → `noxu-dbi`).
    pub memory_counter: Option<Arc<AtomicI64>>,

    /// Optional listener fed on node add/access/remove, mirroring JE's
    /// `INList` feeding the evictor's `LRUList`s.
    ///
    /// When `None` (the default — used by unit tests with no environment),
    /// the notifications are no-ops. `EnvironmentImpl` installs the
    /// `Evictor` here so production inserts/accesses populate the LRU lists
    /// the evictor drains.
    ///
    /// JE reference: `IN.fetchTarget`/split/`rebuildINList` → `addBack`,
    /// access → `moveBack`, removal → `remove`.
    pub in_list_listener: Option<Arc<dyn InListListener>>,

    /// Optional log manager so an evicted root IN can be re-materialized from
    /// its persisted `root_log_lsn` on the next access (EV-14, piece B).
    ///
    /// JE's `Tree` reaches the log via `database.getEnv().getLogManager()`;
    /// `Tree.getRootINRootAlreadyLatched` calls `root.fetchTarget(...)` which
    /// reads the root IN back from its `ChildReference` LSN when the in-memory
    /// target is null (Tree.java:477-516, ChildReference.fetchTarget). Noxu
    /// has no env back-reference here, so the log manager is installed
    /// directly (the same one-way wiring as `in_list_listener`). When `None`
    /// (unit tests with no environment), an evicted root cannot be re-fetched
    /// — but `evict_root` refuses to evict without a log manager, so the root
    /// is never made non-resident in that configuration.
    pub log_manager: Option<Arc<noxu_log::LogManager>>,

    /// Capacity hint for the recovery redo path.
    ///
    /// When non-zero, the first BIN created by `redo_insert` (the first-key
    /// path) pre-allocates its `entries` Vec with this capacity so that
    /// redo insertions proceed without Vec-resize doublings. The value is
    /// clamped to `max_entries_per_node` at use.
    ///
    /// Set by `hint_redo_capacity` before the redo loop.
    /// Wave 11-K optimisation (Fix 3).
    redo_capacity_hint: usize,

    /// Whether key-prefix compression is enabled for this tree's BINs.
    ///
    /// JE `DatabaseImpl.getKeyPrefixing()` / `DatabaseConfig.setKeyPrefixing()`.
    /// When `false`, `IN.computeKeyPrefix` returns `null` in JE — no prefix
    /// is ever set. Noxu mirrors this: `insert_with_prefix` is skipped in
    /// favour of `insert_raw`, and `recompute_key_prefix` is not called on
    /// BIN halves after a split.
    ///
    /// Default: `false` (matches JE's `DatabaseConfig.KEY_PREFIXING_DEFAULT`).
    ///
    /// Ref: `IN.java computeKeyPrefix` ~line 2456.
    pub key_prefixing: bool,
    /// T-5: maximum post-prefix key length (bytes) for the compact key rep
    /// (`INKeyRep.MaxKeySize`). A node packs all its keys into one fixed-width
    /// byte array when every post-prefix key is `<=` this length; a longer key
    /// inflates the node to the `Default` rep. A value `<= 0` disables the
    /// compact rep entirely.
    ///
    /// Default 16 (`TREE_COMPACT_MAX_KEY_LENGTH` /
    /// `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH`). Wired from
    /// `EnvironmentConfig` via `Tree::set_compact_max_key_length`
    /// (`IN.getCompactMaxKeyLength`, IN.java:4929).
    pub compact_max_key_length: i32,
}
255
/// A node in the tree.
///
/// `TreeNode` wraps either an upper IN or a BIN. Each variant carries a
/// lightweight stub whose fields mirror the persistent IN/BIN structure. The
/// stubs will be replaced with full InNode/Bin types as the implementation
/// matures; the API surface here is intentionally minimal.
#[derive(Debug)]
pub enum TreeNode {
    /// Internal Node (IN) — a non-leaf node in the tree.
    Internal(InNodeStub),

    /// Bottom Internal Node (BIN) — the leaf-level internal node.
    Bottom(BinStub),
}
270
/// Type alias for a resident child pointer: a shared, lock-protected
/// [`TreeNode`].
pub type ChildArc = Arc<RwLock<TreeNode>>;
273
/// T-4: per-node representation of the resident-child-pointer array.
///
/// Faithful to JE `INTargetRep` (`INTargetRep.java`), the abstract array of
/// target pointers to an IN's cached children. These arrays are usually
/// sparse — most upper INs have NO resident children — so JE never stores a
/// full per-slot `Node[]` until many children are actually cached:
///
/// * `None` — `INTargetRep.None`: a shared singleton, 0 child-pointer
///   bytes, used when no children are cached (the common case for upper
///   INs). `get` returns nothing for every slot.
/// * `Sparse` — `INTargetRep.Sparse`: a small parallel `(index, target)[]`
///   for 1..=`SPARSE_MAX_ENTRIES` cached children (JE caps at 4). `get(j)`
///   is a linear scan of the index column.
/// * `Default` — `INTargetRep.Default`: the full `Vec<Option<Arc>>`, one
///   slot per entry, used once more than `SPARSE_MAX_ENTRIES` children are
///   resident.
///
/// A node starts `None` and grows `None → Sparse → Default`. JE does not
/// shrink back when entries are nulled (it only compacts on IN-stripping) to
/// avoid transitionary rep churn; the same policy applies here — `set` never
/// moves to a more compact representation, and `compact()` (called on
/// eviction/stripping) collapses an empty/small `Default`/`Sparse` back
/// toward `None`.
#[derive(Debug)]
pub enum TargetRep {
    /// `INTargetRep.None` — no children cached (shared-singleton semantics).
    None,
    /// `INTargetRep.Sparse` — a few cached children, stored as
    /// `(slot_index, child)` pairs in no particular order.
    /// Invariant: `len() <= SPARSE_MAX_ENTRIES`.
    Sparse(Vec<(u16, ChildArc)>),
    /// `INTargetRep.Default` — full parallel array, one slot per entry;
    /// `None` marks a slot whose child is not resident.
    Default(Vec<Option<ChildArc>>),
}
306
307impl TargetRep {
308 /// `INTargetRep.Sparse.MAX_ENTRIES` (INTargetRep.java) — the maximum
309 /// number of cached children the `Sparse` rep holds before inflating to
310 /// `Default`.
311 pub const SPARSE_MAX_ENTRIES: usize = 4;
312
313 /// `INTargetRep.get(idx)` — the cached child for slot `idx`, or `None`.
314 #[inline]
315 pub fn get(&self, idx: usize) -> Option<&ChildArc> {
316 match self {
317 TargetRep::None => None,
318 TargetRep::Sparse(v) => {
319 v.iter().find(|(i, _)| *i as usize == idx).map(|(_, c)| c)
320 }
321 TargetRep::Default(v) => v.get(idx).and_then(|o| o.as_ref()),
322 }
323 }
324
325 /// `INTargetRep.set(idx, node, parent)` — set (or clear, when `node` is
326 /// `None`) the cached child for slot `idx`, mutating the representation
327 /// upward (`None → Sparse → Default`) as needed.
328 pub fn set(&mut self, idx: usize, node: Option<ChildArc>) {
329 match self {
330 TargetRep::None => {
331 // INTargetRep.None.set: clearing stays None; setting mutates
332 // to a Sparse rep and sets there.
333 if let Some(child) = node {
334 *self = TargetRep::Sparse(vec![(idx as u16, child)]);
335 }
336 }
337 TargetRep::Sparse(v) => {
338 // Update existing slot in place.
339 if let Some(pos) =
340 v.iter().position(|(i, _)| *i as usize == idx)
341 {
342 match node {
343 Some(child) => v[pos].1 = child,
344 None => {
345 v.swap_remove(pos);
346 }
347 }
348 return;
349 }
350 // New child: clearing a non-present slot is a no-op.
351 let Some(child) = node else { return };
352 if v.len() < Self::SPARSE_MAX_ENTRIES {
353 v.push((idx as u16, child));
354 return;
355 }
356 // Full — INTargetRep.Sparse.set mutates to Default.
357 let cap = v.iter().map(|(i, _)| *i as usize).max().unwrap_or(0);
358 let cap = cap.max(idx) + 1;
359 let mut def: Vec<Option<ChildArc>> = vec![None; cap];
360 for (i, c) in v.drain(..) {
361 def[i as usize] = Some(c);
362 }
363 def[idx] = Some(child);
364 *self = TargetRep::Default(def);
365 }
366 TargetRep::Default(v) => {
367 if idx >= v.len() {
368 if node.is_none() {
369 return;
370 }
371 v.resize_with(idx + 1, || None);
372 }
373 v[idx] = node;
374 }
375 }
376 }
377
378 /// `INTargetRep.None`-aware take: remove and return the cached child for
379 /// slot `idx`, leaving the slot empty (JE `IN.setTarget(idx, null)` plus
380 /// returning the old target).
381 pub fn take(&mut self, idx: usize) -> Option<ChildArc> {
382 match self {
383 TargetRep::None => None,
384 TargetRep::Sparse(v) => v
385 .iter()
386 .position(|(i, _)| *i as usize == idx)
387 .map(|pos| v.swap_remove(pos).1),
388 TargetRep::Default(v) => v.get_mut(idx).and_then(|o| o.take()),
389 }
390 }
391
392 /// JE `INArrayRep.copy(from, to, n, parent)` adapted to slice ops: shift
393 /// the child mapping when an entry is INSERTED at `idx` (all children at
394 /// slots `>= idx` move up by one). Mirrors how `Vec::insert` shifts the
395 /// parallel `entries` array.
396 pub fn insert_shift(&mut self, idx: usize) {
397 match self {
398 TargetRep::None => {}
399 TargetRep::Sparse(v) => {
400 for (i, _) in v.iter_mut() {
401 if (*i as usize) >= idx {
402 *i += 1;
403 }
404 }
405 }
406 TargetRep::Default(v) => {
407 if idx <= v.len() {
408 v.insert(idx, None);
409 }
410 }
411 }
412 }
413
414 /// JE `INArrayRep.copy` adapted: shift the child mapping when the entry at
415 /// `idx` is REMOVED (all children at slots `> idx` move down by one; the
416 /// child at `idx` itself is dropped). Mirrors `Vec::remove`.
417 pub fn remove_shift(&mut self, idx: usize) {
418 match self {
419 TargetRep::None => {}
420 TargetRep::Sparse(v) => {
421 v.retain(|(i, _)| *i as usize != idx);
422 for (i, _) in v.iter_mut() {
423 if (*i as usize) > idx {
424 *i -= 1;
425 }
426 }
427 }
428 TargetRep::Default(v) => {
429 if idx < v.len() {
430 v.remove(idx);
431 }
432 }
433 }
434 }
435
436 /// `INTargetRep.compact(parent)` — collapse toward the most compact rep:
437 /// an empty rep becomes `None`; a `Default` with `<= MAX_ENTRIES` children
438 /// becomes `Sparse` (or `None`). Called when an IN is stripped/evicted.
439 pub fn compact(&mut self) {
440 let count = self.resident_count();
441 if count == 0 {
442 *self = TargetRep::None;
443 return;
444 }
445 if count <= Self::SPARSE_MAX_ENTRIES
446 && let TargetRep::Default(v) = self
447 {
448 let sparse: Vec<(u16, ChildArc)> = v
449 .iter()
450 .enumerate()
451 .filter_map(|(i, o)| o.as_ref().map(|c| (i as u16, c.clone())))
452 .collect();
453 *self = TargetRep::Sparse(sparse);
454 }
455 }
456
457 /// Number of resident (non-null) children.
458 pub fn resident_count(&self) -> usize {
459 match self {
460 TargetRep::None => 0,
461 TargetRep::Sparse(v) => v.len(),
462 TargetRep::Default(v) => v.iter().filter(|o| o.is_some()).count(),
463 }
464 }
465
466 /// True if no children are cached (`INTargetRep.None` or empty).
467 pub fn is_empty(&self) -> bool {
468 self.resident_count() == 0
469 }
470
471 /// Iterate every resident child (in unspecified order).
472 pub fn iter_children(&self) -> Box<dyn Iterator<Item = ChildArc> + '_> {
473 match self {
474 TargetRep::None => Box::new(std::iter::empty()),
475 TargetRep::Sparse(v) => Box::new(v.iter().map(|(_, c)| c.clone())),
476 TargetRep::Default(v) => {
477 Box::new(v.iter().filter_map(|o| o.clone()))
478 }
479 }
480 }
481
482 /// `INTargetRep.calculateMemorySize()` — heap bytes of the rep itself
483 /// (excluding the children it points at). `None` is 0 (shared singleton),
484 /// matching `INTargetRep.None.calculateMemorySize() == 0`.
485 pub fn memory_size(&self) -> usize {
486 use std::mem::size_of;
487 match self {
488 TargetRep::None => 0,
489 TargetRep::Sparse(v) => v.capacity() * size_of::<(u16, ChildArc)>(),
490 TargetRep::Default(v) => {
491 v.capacity() * size_of::<Option<ChildArc>>()
492 }
493 }
494 }
495}
496
/// T-3: node-level packed LSN array — `IN.entryLsnByteArray` /
/// `IN.entryLsnLongArray` (IN.java:251-289, getLsn/setLsnInternal
/// IN.java:1752-1935).
///
/// JE stores one LSN per slot. A naive `Lsn` (u64) costs 8 bytes/slot even
/// though most LSNs in a node share a file number and have a file offset that
/// fits in 3 bytes. JE's compact rep is a single `byte[]` with
/// `BYTES_PER_LSN_ENTRY == 4` bytes per slot:
///
/// * `base_file_number` is the lowest file number of any non-NULL LSN in the
///   node;
/// * byte 0 of each slot = `file_number - base_file_number` (0..=127,
///   `Byte.MAX_VALUE`);
/// * bytes 1..4 = the 3-byte little-endian file offset (max
///   `MAX_FILE_OFFSET == 0xff_fffe`).
///
/// The NULL_LSN blocker (Noxu `NULL_LSN == u64::MAX`) is solved EXACTLY as JE
/// does it: NULL is NOT stored as the raw u64; the slot's 3 file-offset bytes
/// are set to `0xff_ffff` (`THREE_BYTE_NEGATIVE_ONE`), a value that
/// `MAX_FILE_OFFSET` can never reach, and `get` maps it back to `NULL_LSN`.
///
/// If a file-number difference exceeds 127 or a file offset exceeds
/// `MAX_FILE_OFFSET`, the rep mutates to `Long` (one `Lsn` per slot), matching
/// JE's `mutateToLongArray` (IN.java:1924). An all-NULL node uses `Empty`
/// (0 bytes), matching the EMPTY_REP/initial-capacity-free state.
#[derive(Debug)]
pub enum LsnRep {
    /// All slots NULL — 0 heap bytes (the `byteArray == null` initial state).
    Empty,
    /// `IN.entryLsnByteArray` — 4 bytes/slot, `base_file_number`-relative.
    /// `bytes.len()` is the slot count times `BYTES_PER_LSN_ENTRY`.
    Compact { base_file_number: u32, bytes: Vec<u8> },
    /// `IN.entryLsnLongArray` — 8 bytes/slot fallback after `mutateToLongArray`.
    Long(Vec<Lsn>),
}
531
532impl LsnRep {
533 /// `IN.BYTES_PER_LSN_ENTRY` (IN.java:151).
534 pub const BYTES_PER_LSN_ENTRY: usize = 4;
535 /// `IN.MAX_FILE_OFFSET` (IN.java:152) — max file offset the 3-byte form holds.
536 const MAX_FILE_OFFSET: u32 = 0x00ff_fffe;
537 /// `IN.THREE_BYTE_NEGATIVE_ONE` (IN.java:153) — the NULL sentinel in the
538 /// 3 file-offset bytes.
539 const THREE_BYTE_NEGATIVE_ONE: u32 = 0x00ff_ffff;
540 /// `Byte.MAX_VALUE` — max file-number difference the 1-byte offset holds.
541 const MAX_FILE_NUMBER_OFFSET: u32 = 127;
542
543 /// A rep sized for `n` slots, all NULL. Returns `Empty` (0 bytes); the
544 /// Compact byte array is lazily allocated by the first non-NULL `set_lsn`
545 /// — `base_file_number` is unknown until then (IN.java:1820, the
546 /// `baseFileNumber == -1` first-entry case).
547 #[inline]
548 pub fn new(_n: usize) -> Self {
549 LsnRep::Empty
550 }
551
552 /// Build a rep from a per-slot `Lsn` slice (used by node construction and
553 /// split, where slots arrive together). Equivalent to `new(lsns.len())`
554 /// followed by `set(i, lsns[i])` for each slot.
555 pub fn from_lsns(lsns: &[Lsn]) -> Self {
556 let mut rep = LsnRep::Empty;
557 let n = lsns.len();
558 for (i, &lsn) in lsns.iter().enumerate() {
559 rep.set(i, lsn, n);
560 }
561 rep
562 }
563
564 /// `IN.getLsn(idx)` (IN.java:1752).
565 pub fn get(&self, idx: usize) -> Lsn {
566 match self {
567 LsnRep::Empty => NULL_LSN,
568 LsnRep::Long(v) => v.get(idx).copied().unwrap_or(NULL_LSN),
569 LsnRep::Compact { base_file_number, bytes } => {
570 let off = idx * Self::BYTES_PER_LSN_ENTRY;
571 if off + Self::BYTES_PER_LSN_ENTRY > bytes.len() {
572 return NULL_LSN;
573 }
574 let file_offset = Self::get_3byte(bytes, off + 1);
575 if file_offset == Self::THREE_BYTE_NEGATIVE_ONE {
576 NULL_LSN
577 } else {
578 let file_number = base_file_number + bytes[off] as u32;
579 Lsn::new(file_number, file_offset)
580 }
581 }
582 }
583 }
584
585 /// `IN.setLsnInternal(idx, value)` (IN.java:1801) — set the LSN of slot
586 /// `idx`, mutating Empty→Compact→Long as necessary. `n` is the node's
587 /// slot count (sizes a freshly-allocated Compact array).
588 pub fn set(&mut self, idx: usize, lsn: Lsn, n: usize) {
589 // Empty: first non-NULL value allocates the Compact array; a NULL set
590 // on an Empty rep is a no-op (all slots already read NULL).
591 if let LsnRep::Empty = self {
592 if lsn.is_null() {
593 return;
594 }
595 let cap = n.max(idx + 1);
596 *self = LsnRep::Compact {
597 base_file_number: lsn.file_number(),
598 bytes: vec![0u8; cap * Self::BYTES_PER_LSN_ENTRY],
599 };
600 // Mark every other slot NULL (3-byte offset = 0xffffff).
601 if let LsnRep::Compact { bytes, .. } = self {
602 for s in 0..cap {
603 if s != idx {
604 Self::put_3byte(
605 bytes,
606 s * Self::BYTES_PER_LSN_ENTRY + 1,
607 Self::THREE_BYTE_NEGATIVE_ONE,
608 );
609 }
610 }
611 }
612 self.set(idx, lsn, n);
613 return;
614 }
615
616 if let LsnRep::Long(v) = self {
617 if idx >= v.len() {
618 v.resize(idx + 1, NULL_LSN);
619 }
620 v[idx] = lsn;
621 return;
622 }
623
624 // Compact path.
625 let LsnRep::Compact { base_file_number, bytes } = self else {
626 unreachable!()
627 };
628 let need = (idx + 1) * Self::BYTES_PER_LSN_ENTRY;
629 if need > bytes.len() {
630 let old = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
631 bytes.resize(need, 0);
632 for s in old..(idx + 1) {
633 Self::put_3byte(
634 bytes,
635 s * Self::BYTES_PER_LSN_ENTRY + 1,
636 Self::THREE_BYTE_NEGATIVE_ONE,
637 );
638 }
639 }
640 let off = idx * Self::BYTES_PER_LSN_ENTRY;
641
642 if lsn.is_null() {
643 // IN.java:1812 — file-number offset 0, file offset -1 (0xffffff).
644 bytes[off] = 0;
645 Self::put_3byte(bytes, off + 1, Self::THREE_BYTE_NEGATIVE_ONE);
646 return;
647 }
648
649 let this_file_number = lsn.file_number();
650 let this_file_offset = lsn.file_offset();
651
652 // Whether to fall back to the Long rep.
653 let mutate = this_file_offset > Self::MAX_FILE_OFFSET || {
654 if this_file_number < *base_file_number {
655 // IN.java:1827 — try to re-base downward; bail if any existing
656 // slot would then exceed the 1-byte file-number offset.
657 !Self::adjust_file_numbers(
658 bytes,
659 *base_file_number,
660 this_file_number,
661 )
662 } else {
663 this_file_number - *base_file_number
664 > Self::MAX_FILE_NUMBER_OFFSET
665 }
666 };
667
668 if mutate {
669 // IN.java:1924 mutateToLongArray.
670 let nelts = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
671 let mut longs = vec![NULL_LSN; nelts.max(idx + 1)];
672 for (s, slot) in longs.iter_mut().enumerate().take(nelts) {
673 *slot = self_get_compact(*base_file_number, bytes, s);
674 }
675 longs[idx] = lsn;
676 *self = LsnRep::Long(longs);
677 return;
678 }
679
680 if this_file_number < *base_file_number {
681 *base_file_number = this_file_number;
682 }
683 bytes[off] = (this_file_number - *base_file_number) as u8;
684 Self::put_3byte(bytes, off + 1, this_file_offset);
685 }
686
687 /// `IN.adjustFileNumbers` (IN.java:1855) — re-base to a lower file number,
688 /// rewriting every existing slot's 1-byte offset. Returns false (and
689 /// leaves `bytes` unchanged) if any slot would overflow the 1-byte offset.
690 fn adjust_file_numbers(
691 bytes: &mut [u8],
692 old_base: u32,
693 new_base: u32,
694 ) -> bool {
695 let stride = Self::BYTES_PER_LSN_ENTRY;
696 // First pass: verify none overflow.
697 let mut i = 0;
698 while i < bytes.len() {
699 if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
700 let cur_fn = old_base + bytes[i] as u32;
701 if cur_fn - new_base > Self::MAX_FILE_NUMBER_OFFSET {
702 return false;
703 }
704 }
705 i += stride;
706 }
707 // Second pass: apply.
708 let mut i = 0;
709 while i < bytes.len() {
710 if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
711 let cur_fn = old_base + bytes[i] as u32;
712 bytes[i] = (cur_fn - new_base) as u8;
713 }
714 i += stride;
715 }
716 true
717 }
718
719 /// `INArrayRep.copy` analogue: shift LSNs when an entry is inserted at
720 /// `idx` (slots `>= idx` move up one). Mirrors `targets.insert_shift`.
721 pub fn insert_shift(&mut self, idx: usize, n: usize) {
722 match self {
723 LsnRep::Empty => {}
724 LsnRep::Long(v) => {
725 if idx <= v.len() {
726 v.insert(idx, NULL_LSN);
727 }
728 }
729 LsnRep::Compact { bytes, .. } => {
730 let stride = Self::BYTES_PER_LSN_ENTRY;
731 let cap = (n.max((bytes.len() / stride) + 1)) * stride;
732 bytes.resize(cap, 0);
733 let at = idx * stride;
734 // Shift the tail up by one slot.
735 bytes.copy_within(at..cap - stride, at + stride);
736 // The new slot reads NULL.
737 Self::put_3byte(bytes, at + 1, Self::THREE_BYTE_NEGATIVE_ONE);
738 }
739 }
740 }
741
742 /// `INArrayRep.copy` analogue: shift LSNs when entry `idx` is removed
743 /// (slots `> idx` move down one). Mirrors `targets.remove_shift`.
744 pub fn remove_shift(&mut self, idx: usize) {
745 match self {
746 LsnRep::Empty => {}
747 LsnRep::Long(v) => {
748 if idx < v.len() {
749 v.remove(idx);
750 }
751 }
752 LsnRep::Compact { bytes, .. } => {
753 let stride = Self::BYTES_PER_LSN_ENTRY;
754 let at = idx * stride;
755 if at + stride <= bytes.len() {
756 bytes.copy_within(at + stride.., at);
757 let newlen = bytes.len() - stride;
758 bytes.truncate(newlen);
759 }
760 }
761 }
762 }
763
764 /// `IN.computeLsnOverhead` analogue: heap bytes of the rep itself.
765 pub fn memory_size(&self) -> usize {
766 use std::mem::size_of;
767 match self {
768 LsnRep::Empty => 0,
769 LsnRep::Compact { bytes, .. } => bytes.capacity(),
770 LsnRep::Long(v) => v.capacity() * size_of::<Lsn>(),
771 }
772 }
773
774 fn put_3byte(bytes: &mut [u8], offset: usize, value: u32) {
775 bytes[offset] = (value & 0xFF) as u8;
776 bytes[offset + 1] = ((value >> 8) & 0xFF) as u8;
777 bytes[offset + 2] = ((value >> 16) & 0xFF) as u8;
778 }
779
780 fn get_3byte(bytes: &[u8], offset: usize) -> u32 {
781 (bytes[offset] as u32)
782 | ((bytes[offset + 1] as u32) << 8)
783 | ((bytes[offset + 2] as u32) << 16)
784 }
785}
786
787/// Helper used by `LsnRep::set` during `mutateToLongArray` to read an existing
788/// Compact slot without borrowing `self` (which is mid-mutation).
789fn self_get_compact(base_file_number: u32, bytes: &[u8], idx: usize) -> Lsn {
790 let off = idx * LsnRep::BYTES_PER_LSN_ENTRY;
791 let file_offset = LsnRep::get_3byte(bytes, off + 1);
792 if file_offset == LsnRep::THREE_BYTE_NEGATIVE_ONE {
793 NULL_LSN
794 } else {
795 Lsn::new(base_file_number + bytes[off] as u32, file_offset)
796 }
797}
798
/// Default maximum (post-prefix) key length, in bytes, for the compact key
/// rep: `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH` (INKeyRep.java) and the
/// `TREE_COMPACT_MAX_KEY_LENGTH` config default.
// The lint is silenced because the identifier deliberately keeps the JE class
// name `INKeyRep` as its prefix rather than SCREAMING_SNAKE_CASE.
#[allow(non_upper_case_globals)]
pub const INKeyRep_DEFAULT_MAX_KEY_LENGTH: i32 = 16;
803
/// T-2: node-level key array — `INKeyRep.{Default,MaxKeySize}` (INKeyRep.java).
///
/// The per-slot key that used to live in `BinEntry`/`InEntry` as a `Vec<u8>`
/// (24-byte header + a separate heap allocation per key) is hoisted here as a
/// node-level rep. When every (post-prefix) key in the node is `<=`
/// `TREE_COMPACT_MAX_KEY_LENGTH` (default 16) the keys pack into ONE
/// fixed-width byte buffer (`MaxKeySize`): `slot_width` bytes per slot, with a
/// parallel `lengths` vector tracking the actual length of each key. A key
/// longer than the threshold inflates the whole node to the `Default` rep
/// (one `Vec<u8>` per slot), matching JE's `Default.compact` /
/// `MaxKeySize.expandToDefaultRep`.
///
/// As in JE, this stores the key suffix that remains after the common prefix
/// has been stripped by key prefixing, so the compact rep holds the smaller
/// post-prefix bytes.
#[derive(Debug, Clone)]
pub enum KeyRep {
    /// `INKeyRep.Default` — one owned key per slot (any length).
    Default(Vec<Vec<u8>>),
    /// `INKeyRep.MaxKeySize` — all keys packed into one fixed-width buffer.
    /// `buf.len() == slot_width * lengths.len()`; slot `i` occupies
    /// `buf[i*slot_width .. i*slot_width + lengths[i]]`.
    Compact { buf: Vec<u8>, slot_width: usize, lengths: Vec<u16> },
}
827
828impl KeyRep {
829 /// An empty `Default` rep.
830 #[inline]
831 pub fn new() -> Self {
832 KeyRep::Default(Vec::new())
833 }
834
835 /// Build a `Default` rep from owned keys (callers may later `compact`).
836 #[inline]
837 pub fn from_keys(keys: Vec<Vec<u8>>) -> Self {
838 KeyRep::Default(keys)
839 }
840
841 /// Number of slots.
842 #[inline]
843 pub fn len(&self) -> usize {
844 match self {
845 KeyRep::Default(v) => v.len(),
846 KeyRep::Compact { lengths, .. } => lengths.len(),
847 }
848 }
849
850 #[inline]
851 pub fn is_empty(&self) -> bool {
852 self.len() == 0
853 }
854
855 /// `INKeyRep.get(idx)` / `getKey` — borrow the (post-prefix) key at slot
856 /// `idx` without allocating.
857 #[inline]
858 pub fn get(&self, idx: usize) -> &[u8] {
859 match self {
860 KeyRep::Default(v) => v[idx].as_slice(),
861 KeyRep::Compact { buf, slot_width, lengths } => {
862 let off = idx * slot_width;
863 &buf[off..off + lengths[idx] as usize]
864 }
865 }
866 }
867
868 /// Set the key at slot `idx`. A key longer than a Compact rep's
869 /// `slot_width` inflates the rep to `Default` first
870 /// (`MaxKeySize.expandToDefaultRep`).
871 pub fn set(&mut self, idx: usize, key: Vec<u8>) {
872 match self {
873 KeyRep::Default(v) => v[idx] = key,
874 KeyRep::Compact { slot_width, .. } if key.len() > *slot_width => {
875 self.inflate_to_default();
876 self.set(idx, key);
877 }
878 KeyRep::Compact { buf, slot_width, lengths } => {
879 let off = idx * *slot_width;
880 buf[off..off + key.len()].copy_from_slice(&key);
881 lengths[idx] = key.len() as u16;
882 }
883 }
884 }
885
886 /// Insert a key at slot `idx`, shifting later slots up (mirrors
887 /// `Vec::insert` + `INArrayRep.copy`).
888 pub fn insert(&mut self, idx: usize, key: Vec<u8>) {
889 match self {
890 KeyRep::Default(v) => v.insert(idx, key),
891 KeyRep::Compact { slot_width, .. } if key.len() > *slot_width => {
892 self.inflate_to_default();
893 self.insert(idx, key);
894 }
895 KeyRep::Compact { buf, slot_width, lengths } => {
896 let sw = *slot_width;
897 let at = idx * sw;
898 buf.splice(at..at, std::iter::repeat_n(0u8, sw));
899 buf[at..at + key.len()].copy_from_slice(&key);
900 lengths.insert(idx, key.len() as u16);
901 }
902 }
903 }
904
905 /// Remove the key at slot `idx`, shifting later slots down.
906 pub fn remove(&mut self, idx: usize) -> Vec<u8> {
907 match self {
908 KeyRep::Default(v) => v.remove(idx),
909 KeyRep::Compact { buf, slot_width, lengths } => {
910 let sw = *slot_width;
911 let len = lengths[idx] as usize;
912 let at = idx * sw;
913 let out = buf[at..at + len].to_vec();
914 buf.drain(at..at + sw);
915 lengths.remove(idx);
916 out
917 }
918 }
919 }
920
921 /// `INKeyRep.MaxKeySize.expandToDefaultRep` — mutate a Compact rep to a
922 /// Default rep (one owned `Vec<u8>` per slot).
923 fn inflate_to_default(&mut self) {
924 if let KeyRep::Compact { .. } = self {
925 let keys: Vec<Vec<u8>> =
926 (0..self.len()).map(|i| self.get(i).to_vec()).collect();
927 *self = KeyRep::Default(keys);
928 }
929 }
930
931 /// `INKeyRep.Default.compact(parent)` (INKeyRep.java) — if every key in a
932 /// `Default` rep fits `compact_max_key_length`, pack them into a
933 /// `MaxKeySize` (`Compact`) rep. `compact_max_key_length <= 0` disables
934 /// compaction. No-op when already Compact.
935 pub fn compact(&mut self, compact_max_key_length: i32) {
936 if compact_max_key_length <= 0 {
937 return;
938 }
939 let KeyRep::Default(keys) = self else {
940 return; // already Compact
941 };
942 if keys.is_empty() {
943 return;
944 }
945 let max_len = keys.iter().map(|k| k.len()).max().unwrap_or(0);
946 if max_len > compact_max_key_length as usize {
947 return; // a key exceeds the threshold — stay Default
948 }
949 let slot_width = max_len.max(1);
950 let mut buf = vec![0u8; slot_width * keys.len()];
951 let mut lengths = Vec::with_capacity(keys.len());
952 for (i, k) in keys.iter().enumerate() {
953 let off = i * slot_width;
954 buf[off..off + k.len()].copy_from_slice(k);
955 lengths.push(k.len() as u16);
956 }
957 *self = KeyRep::Compact { buf, slot_width, lengths };
958 }
959
960 /// True when key-byte memory is accounted for inside this rep (Compact),
961 /// vs per-slot `Vec` allocations (Default).
962 /// `INKeyRep.accountsForKeyByteMemUsage`.
963 #[inline]
964 pub fn is_compact(&self) -> bool {
965 matches!(self, KeyRep::Compact { .. })
966 }
967
968 /// Heap bytes of the rep itself (`INKeyRep.calculateMemorySize` +
969 /// key-byte accounting). For Default this is the `Vec<Vec<u8>>` header
970 /// plus each key's heap allocation; for Compact it is the single buffer
971 /// plus the lengths vector.
972 pub fn memory_size(&self) -> usize {
973 use std::mem::size_of;
974 match self {
975 KeyRep::Default(v) => {
976 v.capacity() * size_of::<Vec<u8>>()
977 + v.iter().map(|k| k.capacity()).sum::<usize>()
978 }
979 KeyRep::Compact { buf, lengths, .. } => {
980 buf.capacity() + lengths.capacity() * size_of::<u16>()
981 }
982 }
983 }
984}
985
986impl Default for KeyRep {
987 fn default() -> Self {
988 KeyRep::new()
989 }
990}
991
992/// Lightweight upper-IN representation used by the tree traversal layer.
993///
994/// `IN`: carries the dirty flag (IN_DIRTY_BIT), the LRU
995/// generation counter, and a weak back-pointer to the parent so that
996/// dirty state can be propagated upward.
997#[derive(Debug)]
998pub struct InNodeStub {
999 /// Node ID.
1000 pub node_id: u64,
1001 /// Level in tree.
1002 pub level: i32,
1003 /// Child entries (key, lsn).
1004 pub entries: Vec<InEntry>,
1005 /// T-4: per-node resident-child-pointer representation.
1006 ///
1007 /// `IN.entryTargets` (`INTargetRep`). The cached child pointer is no
1008 /// longer a per-`InEntry` `Option<Arc>` (which cost a pointer-sized slot
1009 /// even when no child was resident); it lives here as a compact
1010 /// node-level rep that starts `None` (0 child-pointer bytes — most upper
1011 /// INs have no resident children), grows to `Sparse` for a few cached
1012 /// children, and inflates to `Default` (the full parallel array) once
1013 /// many children are resident. See `INTargetRep.{None,Sparse,Default}`.
1014 pub targets: TargetRep,
1015 /// Dirty flag — set whenever this node is modified.
1016 /// `IN.dirty` (IN_DIRTY_BIT).
1017 pub dirty: bool,
1018 /// LRU generation counter for the evictor.
1019 /// `IN.generation`.
1020 pub generation: u64,
1021 /// Weak back-pointer to parent IN.
1022 /// Enables dirty-propagation and latch-coupling validation.
1023 /// `IN.parent` reference used during splits and logging.
1024 pub parent: Option<Weak<RwLock<TreeNode>>>,
1025 /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`). The per-slot
1026 /// `lsn` (8 bytes) that used to live in `InEntry` is hoisted here as a
1027 /// `base_file_number`-relative 4-byte-per-slot rep, falling back to a
1028 /// `u64`-per-slot `Long` rep only when a node's LSN range exceeds the
1029 /// compact form. Access via `get_lsn(slot)` / `set_lsn(slot, lsn)`.
1030 pub lsn_rep: LsnRep,
1031}
1032
/// One slot of an IN node. Only the slot key is stored per entry.
///
/// T-4: the resident-child pointer (`Option<Arc>`) was hoisted out of this
/// struct into the node-level `InNodeStub.targets` (`INTargetRep`). Reach the
/// child of slot `i` through `InNodeStub::get_child(i)`, `set_child`, and
/// the related accessors.
///
/// T-3: the 8-byte per-slot `lsn` was likewise hoisted into the node-level
/// `InNodeStub.lsn_rep` (`IN.entryLsnByteArray`). Reach the LSN of slot `i`
/// through `InNodeStub::get_lsn(i)` / `set_lsn(i, lsn)`.
#[derive(Debug, Clone)]
pub struct InEntry {
    /// Key for this entry.
    pub key: Vec<u8>,
}
1047
1048/// Lightweight BIN representation used by the tree traversal layer.
1049///
1050/// `BIN` (which extends `IN`): carries the dirty flag, LRU
1051/// generation counter, and a weak back-pointer to the parent IN.
1052///
1053/// # Key Prefix Compression
1054///
1055/// BINs support key prefix compression. When
1056/// `key_prefix` is non-empty the `key` field of every `BinEntry` stores only
1057/// the *suffix* — the bytes after stripping the common leading bytes. The
1058/// full key is reconstructed by prepending `key_prefix` to the stored suffix.
1059///
1060/// This is transparent to callers through the `get_full_key` / `find_entry`
1061/// helpers on `BinStub`. The prefix is recomputed after every insert and
1062/// after a split via `recompute_key_prefix`.
1063#[derive(Debug)]
1064pub struct BinStub {
1065 /// Node ID.
1066 pub node_id: u64,
1067 /// Level (always BIN_LEVEL).
1068 pub level: i32,
1069 /// Entries. When `key_prefix` is non-empty the `key` field in each entry
1070 /// is the *suffix* of the full key (leading `key_prefix` bytes stripped).
1071 /// `IN.entryKeys` (suffix-only storage when prefixing is on).
1072 pub entries: Vec<BinEntry>,
1073 /// Common prefix shared by every key in this BIN.
1074 /// Empty slice means no prefix compression is active.
1075 /// `IN.keyPrefix`.
1076 pub key_prefix: Vec<u8>,
1077 /// Dirty flag — set whenever this BIN is modified.
1078 /// `IN.dirty` (IN_DIRTY_BIT).
1079 pub dirty: bool,
1080 /// BIN-delta flag — true when this BIN contains only dirty (delta) slots
1081 /// rather than a complete set of entries.
1082 /// `IN.IN_DELTA_BIT` (the IN_DELTA_BIT flag inside `flags`).
1083 pub is_delta: bool,
1084 /// LSN at which this BIN was last logged as a full (non-delta) BIN.
1085 ///
1086 /// Used by the checkpoint path to construct `BINDeltaLogEntry.prev_full_lsn`
1087 /// and to compare against `prev_delta_lsn` when deciding whether to write
1088 /// a delta or a full BIN.
1089 ///
1090 /// `BIN.lastFullLsn`.
1091 pub last_full_lsn: Lsn,
1092 /// LSN at which this BIN was last logged as a BIN-delta.
1093 ///
1094 /// Written as `prev_delta_lsn` into the next `BINDeltaLogEntry` so the
1095 /// cleaner's utilization tracker can mark the superseded delta obsolete.
1096 /// Reset to `NULL_LSN` whenever a full BIN is written.
1097 ///
1098 /// `BIN.lastDeltaVersion` / `BIN.getLastDeltaLsn()`.
1099 pub last_delta_lsn: Lsn,
1100 /// LRU generation counter for the evictor.
1101 /// `IN.generation`.
1102 pub generation: u64,
1103 /// Weak back-pointer to parent IN.
1104 /// Enables dirty-propagation and latch-coupling validation.
1105 pub parent: Option<Weak<RwLock<TreeNode>>>,
1106 /// If true, `BinEntry.expiration_time` values in this BIN are packed hours
1107 /// since epoch; if false, they are packed seconds since epoch.
1108 ///
1109 /// Default: `true` (hours, matching TTL resolution).
1110 ///
1111 /// `BIN.expirationInHours`.
1112 pub expiration_in_hours: bool,
1113 /// Number of cursors currently positioned on this BIN.
1114 ///
1115 /// The evictor skips BINs with a non-zero cursor count to avoid evicting
1116 /// a node that a cursor is actively traversing. CursorImpl increments
1117 /// this when positioning on a BIN and decrements it on reposition/close.
1118 ///
1119 /// `IN.cursorSet.size()` used by `Evictor.selectIN()`.
1120 pub cursor_count: i32,
1121 /// When true, the NEXT log of this BIN must be a full BIN, not a delta.
1122 ///
1123 /// Set after a dirty slot is removed (a delta would silently lose that
1124 /// removal) and cleared after a full BIN is written. This is the
1125 /// delta-chain bound: it forces a periodic full BIN so a delta never
1126 /// references stale state.
1127 ///
1128 /// `IN.prohibitNextDelta` / `IN.setProhibitNextDelta` (IN.java:5013) /
1129 /// `IN.getProhibitNextDelta`.
1130 pub prohibit_next_delta: bool,
1131 /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`). The per-slot
1132 /// `lsn` (8 bytes) that used to live in `BinEntry` is hoisted here as a
1133 /// `base_file_number`-relative 4-byte-per-slot rep. Access via
1134 /// `get_lsn(slot)` / `set_lsn(slot, lsn)`.
1135 pub lsn_rep: LsnRep,
1136 /// T-2: per-node key array (`INKeyRep.{Default,MaxKeySize}`). The per-slot
1137 /// `key` (`Vec<u8>`, 24-byte header + heap alloc) that used to live in
1138 /// `BinEntry` is hoisted here. Stores the post-prefix SUFFIX (key
1139 /// prefixing strips the common prefix first). Packs into one fixed-width
1140 /// buffer (`Compact`) when every suffix is `<= compact_max_key_length`,
1141 /// else one `Vec<u8>` per slot (`Default`). `keys.len()` is kept in lock
1142 /// step with `entries.len()`. Access via `get_key(slot)` /
1143 /// `get_full_key(slot)`.
1144 pub keys: KeyRep,
1145 /// T-5: the node's compact-key threshold (`IN.getCompactMaxKeyLength`),
1146 /// copied from the owning `Tree` at construction so `apply_new_prefix` can
1147 /// decide whether the suffixes now fit `MaxKeySize`. Default 16.
1148 pub compact_max_key_length: i32,
1149}
1150
/// Per-slot state of a BIN node (the slot key and LSN are stored at node
/// level, not here).
///
/// T-3: the 8-byte per-slot `lsn` was hoisted into the node-level
/// `BinStub.lsn_rep` (`IN.entryLsnByteArray`). Reach the LSN of slot `i`
/// through `BinStub::get_lsn(i)` / `set_lsn(i, lsn)`.
#[derive(Debug, Clone)]
pub struct BinEntry {
    /// Optional embedded data (for small records) or cached LN.
    pub data: Option<Vec<u8>>,
    /// True when this slot has been marked known-deleted (analogous to the
    /// KNOWN_DELETED_BIT in `IN.entryStates`). Such a slot is eligible for
    /// removal by `compress_bin()`.
    pub known_deleted: bool,
    /// True when this slot has been modified since the last full BIN log
    /// write.
    ///
    /// `IN.entryStates[i] & IN_DIRTY_BIT`. The checkpoint path uses it to
    /// decide whether to write a BIN-delta (few dirty slots) or a full BIN
    /// (many dirty slots).
    pub dirty: bool,
    /// Packed expiration time (0 = no expiration).
    ///
    /// The unit is chosen by the owning `BinStub.expiration_in_hours`: hours
    /// since the Unix epoch when it is true, seconds since the Unix epoch
    /// otherwise.
    ///
    /// `IN.entryExpiration`.
    pub expiration_time: u32,
}
1178
1179impl InNodeStub {
1180 /// `IN.getTarget(idx)` — the resident child cached for slot `idx`, cloned
1181 /// (a strong `Arc`), or `None` if the child is not cached. Routes through
1182 /// the node-level `INTargetRep` (T-4).
1183 #[inline]
1184 pub fn get_child(&self, idx: usize) -> Option<ChildArc> {
1185 self.targets.get(idx).cloned()
1186 }
1187
1188 /// Borrow the resident child for slot `idx` without cloning.
1189 #[inline]
1190 pub fn child_ref(&self, idx: usize) -> Option<&ChildArc> {
1191 self.targets.get(idx)
1192 }
1193
1194 /// True if slot `idx` has no resident (cached) child.
1195 /// `IN.getTarget(idx) == null`.
1196 #[inline]
1197 pub fn child_is_none(&self, idx: usize) -> bool {
1198 self.targets.get(idx).is_none()
1199 }
1200
1201 /// `IN.setTarget(idx, node)` — set (or clear) the cached child for slot
1202 /// `idx`, mutating the `INTargetRep` upward as needed.
1203 #[inline]
1204 pub fn set_child(&mut self, idx: usize, node: Option<ChildArc>) {
1205 self.targets.set(idx, node);
1206 }
1207
1208 /// `IN.detachNode` helper — remove and return the cached child for slot
1209 /// `idx`, leaving the slot's key/LSN intact for re-fetch.
1210 #[inline]
1211 pub fn take_child(&mut self, idx: usize) -> Option<ChildArc> {
1212 self.targets.take(idx)
1213 }
1214
1215 /// `IN.getLsn(idx)` (IN.java:1752) — the LSN of slot `idx` via the
1216 /// node-level packed `LsnRep` (T-3).
1217 #[inline]
1218 pub fn get_lsn(&self, idx: usize) -> Lsn {
1219 self.lsn_rep.get(idx)
1220 }
1221
1222 /// `IN.setLsn(idx, lsn)` (IN.java:1773) — set the LSN of slot `idx` via
1223 /// the node-level packed `LsnRep` (T-3).
1224 #[inline]
1225 pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
1226 let n = self.entries.len();
1227 self.lsn_rep.set(idx, lsn, n);
1228 }
1229
1230 /// Insert an entry at `idx`, shifting the child mapping to stay aligned
1231 /// (`INArrayRep.copy`), then set the new slot's cached child. Mirrors the
1232 /// old `entries.insert(idx, InEntry{ child: ..})` in one call.
1233 pub fn insert_entry(
1234 &mut self,
1235 idx: usize,
1236 key: Vec<u8>,
1237 lsn: Lsn,
1238 child: Option<ChildArc>,
1239 ) {
1240 self.entries.insert(idx, InEntry { key });
1241 let n = self.entries.len();
1242 self.lsn_rep.insert_shift(idx, n);
1243 self.lsn_rep.set(idx, lsn, n);
1244 self.targets.insert_shift(idx);
1245 if child.is_some() {
1246 self.targets.set(idx, child);
1247 }
1248 }
1249
1250 /// Remove the entry at `idx`, shifting the child mapping to stay aligned
1251 /// (`INArrayRep.copy`). Returns the removed `InEntry` (key).
1252 pub fn remove_entry(&mut self, idx: usize) -> InEntry {
1253 let e = self.entries.remove(idx);
1254 self.lsn_rep.remove_shift(idx);
1255 self.targets.remove_shift(idx);
1256 e
1257 }
1258
1259 /// All resident children (cloned `Arc`s), in unspecified order.
1260 /// Replaces `entries.iter().filter_map(|e| e.child.clone())`.
1261 pub fn resident_children(&self) -> Vec<ChildArc> {
1262 self.targets.iter_children().collect()
1263 }
1264
1265 /// `(slot_index, child)` of the first resident child, if any.
1266 pub fn first_resident_child(&self) -> Option<(usize, ChildArc)> {
1267 (0..self.entries.len())
1268 .find_map(|i| self.targets.get(i).map(|c| (i, c.clone())))
1269 }
1270}
1271
1272impl BinStub {
1273 /// `IN.getLsn(idx)` (IN.java:1752) — the LSN of slot `idx` via the
1274 /// node-level packed `LsnRep` (T-3).
1275 #[inline]
1276 pub fn get_lsn(&self, idx: usize) -> Lsn {
1277 self.lsn_rep.get(idx)
1278 }
1279
1280 /// `IN.setLsn(idx, lsn)` (IN.java:1773) — set the LSN of slot `idx` via
1281 /// the node-level packed `LsnRep` (T-3).
1282 #[inline]
1283 pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
1284 let n = self.entries.len();
1285 self.lsn_rep.set(idx, lsn, n);
1286 }
1287
1288 /// TREE-F1: the single user-facing liveness predicate for a BIN slot.
1289 ///
1290 /// A slot is LIVE for reads/scans iff it is neither `known_deleted` nor
1291 /// TTL-expired. This mirrors the two ways JE makes a slot read as ABSENT:
1292 /// * `IN.findEntry` (IN.java:3197) returns -1 for a `known_deleted`
1293 /// exact match;
1294 /// * `CursorImpl.isProbablyExpired` / `lockAndGetCurrent`
1295 /// (CursorImpl.java:2062-2064) skip `isEntryKnownDeleted` (and
1296 /// expired) slots while stepping.
1297 ///
1298 /// KD slots legitimately exist in live BINs during BIN-delta
1299 /// reconstitution until the compressor reclaims them; the maintenance
1300 /// paths (compressor / recovery undo) iterate them on purpose and do NOT
1301 /// use this predicate.
1302 #[inline]
1303 pub fn slot_is_live(&self, idx: usize) -> bool {
1304 match self.entries.get(idx) {
1305 Some(e) => {
1306 !(e.known_deleted
1307 || (e.expiration_time != 0
1308 && noxu_util::ttl::is_expired(
1309 e.expiration_time,
1310 self.expiration_in_hours,
1311 )))
1312 }
1313 None => false,
1314 }
1315 }
1316
1317 // ========================================================================
1318 // Key prefix compression helpers
1319 // IN.computeKeyPrefix / IN.recalcSuffixes / IN.getKey
1320 // ========================================================================
1321
1322 /// Strips embedded LN data from non-dirty slots, freeing the heap
1323 /// allocations of the per-slot value bytes while keeping the slot keys
1324 /// and LSNs addressable. Used by the evictor's PartialEvict path: a
1325 /// hot BIN is kept in cache so its descent path stays warm, but the LN
1326 /// data is dropped to make room for hotter content. Subsequent reads
1327 /// re-fetch the data from the log via the slot LSN.
1328 ///
1329 /// Skips slots that are still dirty (their data has not been written
1330 /// to the log yet, so dropping the in-memory copy would lose the
1331 /// update). Returns the number of bytes freed (sum of the lengths
1332 /// of the dropped `Vec<u8>` data fields).
1333 ///
1334 /// Returns 0 if the BIN has any open cursors (the cursor may be
1335 /// reading the data right now).
1336 pub fn strip_lns(&mut self) -> usize {
1337 if self.cursor_count > 0 {
1338 return 0;
1339 }
1340 let mut freed = 0usize;
1341 for idx in 0..self.entries.len() {
1342 // JE BIN.evictLNs / LN.isEvictable (LN.java:263 returns true): an
1343 // LN's in-memory value can be stripped whenever it is recoverable
1344 // from the log — i.e. the slot has a valid (logged) LSN — REGARDLESS
1345 // of the dirty bit. The dirty bit governs whether the BIN's
1346 // *structure* needs re-logging at the next checkpoint (BIN-delta vs
1347 // full BIN), NOT whether the LN *value* is durable: a transactional
1348 // commit logs the LN, so the slot's LSN points at the durable copy
1349 // even while the slot is still dirty. Gating the strip on `!dirty`
1350 // (the previous behaviour) meant a freshly-written, not-yet-
1351 // checkpointed record — the common case under a write/recently-read
1352 // workload — could never be stripped, so eviction reclaimed almost
1353 // nothing under pressure (EVICTOR-RECLAIM-1). A slot with a NULL/
1354 // transient LSN (a deferred-write LN never logged) is NOT
1355 // strippable — its only copy is the in-memory value.
1356 if self.get_lsn(idx) == NULL_LSN {
1357 continue;
1358 }
1359 if let Some(data) = self.entries[idx].data.take() {
1360 freed = freed.saturating_add(data.len());
1361 }
1362 }
1363 freed
1364 }
1365
1366 /// Reconstruct the full key for slot `idx` by prepending the BIN's
1367 /// current prefix to the stored suffix.
1368 ///
1369 /// `IN.getKey(int idx)`.
1370 pub fn get_full_key(&self, idx: usize) -> Option<Vec<u8>> {
1371 if idx >= self.keys.len() {
1372 return None;
1373 }
1374 let suffix = self.keys.get(idx); // T-2
1375 if self.key_prefix.is_empty() {
1376 Some(suffix.to_vec())
1377 } else {
1378 let mut full =
1379 Vec::with_capacity(self.key_prefix.len() + suffix.len());
1380 full.extend_from_slice(&self.key_prefix);
1381 full.extend_from_slice(suffix);
1382 Some(full)
1383 }
1384 }
1385
1386 /// Borrow the stored (post-prefix) suffix at slot `idx` (`INKeyRep.get`).
1387 #[inline]
1388 pub fn get_key(&self, idx: usize) -> &[u8] {
1389 self.keys.get(idx)
1390 }
1391
1392 /// T-2: insert a new slot at `idx` keeping the parallel `entries`, `keys`,
1393 /// and `lsn_rep` arrays in lock step. `suffix` is the post-prefix key.
1394 fn insert_slot(
1395 &mut self,
1396 idx: usize,
1397 suffix: Vec<u8>,
1398 lsn: Lsn,
1399 data: Option<Vec<u8>>,
1400 ) {
1401 self.entries.insert(
1402 idx,
1403 BinEntry {
1404 data,
1405 known_deleted: false,
1406 dirty: true,
1407 expiration_time: 0,
1408 },
1409 );
1410 self.keys.insert(idx, suffix); // T-2
1411 let n = self.entries.len();
1412 self.lsn_rep.insert_shift(idx, n); // T-3
1413 self.lsn_rep.set(idx, lsn, n);
1414 }
1415
1416 /// Decompress a stored suffix back to a full key.
1417 ///
1418 /// `IN.getKey` used from outside: prepend `key_prefix` to
1419 /// `suffix`. If `key_prefix` is empty the suffix *is* the full key.
1420 pub fn decompress_key(&self, suffix: &[u8]) -> Vec<u8> {
1421 if self.key_prefix.is_empty() {
1422 suffix.to_vec()
1423 } else {
1424 let mut full =
1425 Vec::with_capacity(self.key_prefix.len() + suffix.len());
1426 full.extend_from_slice(&self.key_prefix);
1427 full.extend_from_slice(suffix);
1428 full
1429 }
1430 }
1431
1432 /// Strip the current prefix from a full key to obtain the stored suffix.
1433 ///
1434 /// `IN.computeKeySuffix(byte[] prefix, byte[] key)`.
1435 ///
1436 /// # Panics
1437 /// Panics (debug only) if `full_key` does not start with `key_prefix`.
1438 pub fn compress_key(&self, full_key: &[u8]) -> Vec<u8> {
1439 let plen = self.key_prefix.len();
1440 if plen == 0 {
1441 full_key.to_vec()
1442 } else {
1443 debug_assert!(
1444 full_key.starts_with(&self.key_prefix),
1445 "compress_key: key does not start with current prefix"
1446 );
1447 full_key[plen..].to_vec()
1448 }
1449 }
1450
1451 /// Compute the longest common prefix of all full keys currently in this
1452 /// BIN, optionally excluding the entry at `exclude_idx` (used during
1453 /// insertions to ignore the slot that is about to be replaced).
1454 ///
1455 /// Returns an empty `Vec` if the BIN has fewer than 2 entries or if the
1456 /// keys share no common leading bytes.
1457 ///
1458 /// `IN.computeKeyPrefix(int excludeIdx)`.
1459 pub fn compute_key_prefix(&self, exclude_idx: Option<usize>) -> Vec<u8> {
1460 // Need at least 2 entries to find a common prefix.
1461 let n = self.keys.len();
1462 if n < 2 {
1463 return Vec::new();
1464 }
1465
1466 // Pick the first non-excluded index as the seed.
1467 let first_idx = match exclude_idx {
1468 Some(0) => 1,
1469 _ => 0,
1470 };
1471
1472 // The current prefix_len is taken from the seed full key.
1473 let seed_full = match self.get_full_key(first_idx) {
1474 Some(k) => k,
1475 None => return Vec::new(),
1476 };
1477 let mut prefix_len = seed_full.len();
1478
1479 // Compare every other non-excluded entry against the running prefix.
1480 // Iterate all entries (byteOrdered disabled in too).
1481 for i in (first_idx + 1)..n {
1482 if let Some(ex) = exclude_idx
1483 && i == ex
1484 {
1485 continue;
1486 }
1487 let full_key = match self.get_full_key(i) {
1488 Some(k) => k,
1489 None => continue,
1490 };
1491 let new_len =
1492 get_key_prefix_length(&seed_full[..prefix_len], &full_key);
1493 if new_len < prefix_len {
1494 prefix_len = new_len;
1495 }
1496 if prefix_len == 0 {
1497 return Vec::new();
1498 }
1499 }
1500
1501 seed_full[..prefix_len].to_vec()
1502 }
1503
1504 /// Recompute the key prefix from scratch and re-encode every stored suffix.
1505 ///
1506 /// Call this after bulk inserts, splits, or merges.
1507 ///
1508 /// `IN.recalcKeyPrefix()` → `IN.recalcSuffixes(newPrefix, …)`.
1509 pub fn recompute_key_prefix(&mut self) {
1510 let new_prefix = self.compute_key_prefix(None);
1511 self.apply_new_prefix(new_prefix);
1512 }
1513
1514 /// Apply `new_prefix` as the BIN's key prefix, re-encoding all stored
1515 /// suffixes from the old prefix into the new one.
1516 ///
1517 /// This is the Rust.
1518 fn apply_new_prefix(&mut self, new_prefix: Vec<u8>) {
1519 // Reconstruct all full keys (using old prefix), then re-encode with
1520 // the new prefix.
1521 let full_keys: Vec<Vec<u8>> = (0..self.keys.len())
1522 .map(|i| self.get_full_key(i).unwrap_or_default())
1523 .collect();
1524
1525 self.key_prefix = new_prefix;
1526
1527 // T-2: re-encode every suffix into the key rep, then re-attempt
1528 // compaction (a smaller prefix may make all suffixes fit MaxKeySize).
1529 for (i, full_key) in full_keys.into_iter().enumerate() {
1530 let suffix = self.compress_key(&full_key);
1531 self.keys.set(i, suffix);
1532 }
1533 self.keys.compact(self.compact_max_key_length);
1534 }
1535
1536 /// Binary-search this BIN for `full_key` (a full, uncompressed key).
1537 ///
1538 /// The stored suffixes are compared after stripping the current prefix
1539 /// from `full_key`, so the search is done entirely in suffix-space — no
1540 /// heap allocation needed in the happy path.
1541 ///
1542 /// Returns `(idx, exact)` where:
1543 /// - `idx` is the slot index (or insertion point when `exact == false`).
1544 /// - `exact` is `true` when an exact match was found.
1545 ///
1546 /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
1547 pub fn find_entry_compressed(&self, full_key: &[u8]) -> (usize, bool) {
1548 let plen = self.key_prefix.len();
1549 // Check that the key shares the current prefix; if not it cannot be
1550 // present and we return the appropriate insertion point.
1551 if plen > 0
1552 && (full_key.len() < plen
1553 || &full_key[..plen] != self.key_prefix.as_slice())
1554 {
1555 // The key does not share the current prefix.
1556 // Determine insertion point using full-key comparison.
1557 let pos = self.key_partition_point(|s| {
1558 self.decompress_key(s).as_slice() < full_key
1559 });
1560 return (pos, false);
1561 }
1562 let suffix = &full_key[plen..];
1563 // T-2: binary search over the node-level key rep (suffix space).
1564 match self.key_binary_search(suffix) {
1565 Ok(idx) => (idx, true),
1566 Err(idx) => (idx, false),
1567 }
1568 }
1569
1570 /// Binary search the key rep for `suffix` (suffix space, unsigned bytes).
1571 /// Mirrors `Vec::binary_search_by(|e| e.key.cmp(suffix))` over the
1572 /// node-level `KeyRep` (T-2).
1573 #[inline]
1574 fn key_binary_search(&self, suffix: &[u8]) -> Result<usize, usize> {
1575 let mut lo = 0usize;
1576 let mut hi = self.keys.len();
1577 while lo < hi {
1578 let mid = lo + (hi - lo) / 2;
1579 match self.keys.get(mid).cmp(suffix) {
1580 std::cmp::Ordering::Less => lo = mid + 1,
1581 std::cmp::Ordering::Greater => hi = mid,
1582 std::cmp::Ordering::Equal => return Ok(mid),
1583 }
1584 }
1585 Err(lo)
1586 }
1587
1588 /// `slice::partition_point` over the node-level key rep suffixes (T-2):
1589 /// the index of the first slot for which `pred(suffix)` is false.
1590 #[inline]
1591 fn key_partition_point(
1592 &self,
1593 mut pred: impl FnMut(&[u8]) -> bool,
1594 ) -> usize {
1595 let mut lo = 0usize;
1596 let mut hi = self.keys.len();
1597 while lo < hi {
1598 let mid = lo + (hi - lo) / 2;
1599 if pred(self.keys.get(mid)) {
1600 lo = mid + 1;
1601 } else {
1602 hi = mid;
1603 }
1604 }
1605 lo
1606 }
1607
1608 /// Insert or update a full (uncompressed) key in this BIN.
1609 ///
1610 /// After insertion the key prefix is recomputed; if the prefix changes all
1611 /// stored suffixes are re-encoded.
1612 ///
1613 /// Returns `(slot_index, is_new_insert)`.
1614 ///
1615 /// `IN.setKey` / BIN insert path.
1616 pub fn insert_with_prefix(
1617 &mut self,
1618 full_key: Vec<u8>,
1619 lsn: Lsn,
1620 data: Option<Vec<u8>>,
1621 ) -> (usize, bool) {
1622 // Is the current prefix still compatible with this key?
1623 let plen = self.key_prefix.len();
1624 let new_len = if plen > 0 {
1625 get_key_prefix_length(&self.key_prefix, &full_key)
1626 } else {
1627 0
1628 };
1629
1630 // If the new key shrinks the prefix we must re-encode everything first.
1631 if plen > 0 && new_len < plen {
1632 // Compute new prefix considering the incoming key and
1633 // all existing full keys. We pass `None` for exclude_idx because
1634 // the slot for this key does not yet exist.
1635 let mut candidate = self.compute_key_prefix(None);
1636 // Also constrain by the new key itself.
1637 if !candidate.is_empty() {
1638 let cl = get_key_prefix_length(&candidate, &full_key);
1639 candidate.truncate(cl);
1640 } else {
1641 // No existing prefix; try to build one from the new key
1642 // against the existing full keys.
1643 if !self.entries.is_empty()
1644 && let Some(first_full) = self.get_full_key(0)
1645 {
1646 candidate = create_key_prefix(&first_full, &full_key)
1647 .unwrap_or_default();
1648 for i in 1..self.entries.len() {
1649 if candidate.is_empty() {
1650 break;
1651 }
1652 if let Some(fk) = self.get_full_key(i) {
1653 let l = get_key_prefix_length(&candidate, &fk);
1654 candidate.truncate(l);
1655 }
1656 }
1657 }
1658 }
1659 self.apply_new_prefix(candidate);
1660 }
1661
1662 // Compress the new key under the (possibly updated) prefix.
1663 let suffix = self.compress_key(&full_key);
1664
1665 match self.key_binary_search(&suffix) {
1666 Ok(idx) => {
1667 // Key exists — update in place.
1668 self.set_lsn(idx, lsn); // T-3
1669 self.entries[idx].data = data;
1670 // Mark slot dirty: this slot changed since the last full BIN log.
1671 // `IN.setDirtyEntry(idx)`.
1672 self.entries[idx].dirty = true;
1673 (idx, false)
1674 }
1675 Err(idx) => {
1676 // New key — insert in sorted position.
1677 // New slots start dirty: they have never been logged in any BIN.
1678 // `IN.setDirtyEntry(idx)` called after `insertEntry`.
1679 self.insert_slot(idx, suffix, lsn, data);
1680 // After insertion, if there is no prefix yet, try to establish one.
1681 if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1682 self.recompute_key_prefix();
1683 }
1684 (idx, true)
1685 }
1686 }
1687 }
1688
1689 /// Slice-based variant of [`BinStub::insert_with_prefix`] for the recovery redo path.
1690 ///
1691 /// Accepts `key` and `data` as `&[u8]` slices instead of owned `Vec<u8>`,
1692 /// eliminating the intermediate `Vec<u8>` that `redo_ln` would otherwise
1693 /// allocate before crossing the BIN boundary. The compressed suffix and
1694 /// the data bytes are each copied into the `BinEntry` exactly once.
1695 ///
1696 /// Semantics are identical to `insert_with_prefix`:
1697 /// - Updates the slot in place when the key already exists.
1698 /// - Inserts a new sorted entry when absent, recomputing the key prefix.
1699 ///
1700 /// Wave 11-K optimisation (Fix 1).
1701 pub fn insert_with_prefix_slice(
1702 &mut self,
1703 full_key: &[u8],
1704 lsn: Lsn,
1705 data: Option<&[u8]>,
1706 ) -> (usize, bool) {
1707 let plen = self.key_prefix.len();
1708 let new_len = if plen > 0 {
1709 get_key_prefix_length(&self.key_prefix, full_key)
1710 } else {
1711 0
1712 };
1713
1714 if plen > 0 && new_len < plen {
1715 let mut candidate = self.compute_key_prefix(None);
1716 if !candidate.is_empty() {
1717 let cl = get_key_prefix_length(&candidate, full_key);
1718 candidate.truncate(cl);
1719 } else {
1720 if !self.entries.is_empty()
1721 && let Some(first_full) = self.get_full_key(0)
1722 {
1723 candidate = create_key_prefix(&first_full, full_key)
1724 .unwrap_or_default();
1725 for i in 1..self.entries.len() {
1726 if candidate.is_empty() {
1727 break;
1728 }
1729 if let Some(fk) = self.get_full_key(i) {
1730 let l = get_key_prefix_length(&candidate, &fk);
1731 candidate.truncate(l);
1732 }
1733 }
1734 }
1735 }
1736 self.apply_new_prefix(candidate);
1737 }
1738
1739 let suffix = self.compress_key(full_key);
1740
1741 match self.key_binary_search(&suffix) {
1742 Ok(idx) => {
1743 self.set_lsn(idx, lsn); // T-3
1744 self.entries[idx].data = data.map(|d| d.to_vec());
1745 self.entries[idx].dirty = true;
1746 (idx, false)
1747 }
1748 Err(idx) => {
1749 self.insert_slot(idx, suffix, lsn, data.map(|d| d.to_vec()));
1750 if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1751 self.recompute_key_prefix();
1752 }
1753 (idx, true)
1754 }
1755 }
1756 }
1757
1758 /// Returns the number of slots that are marked dirty.
1759 ///
1760 /// `BIN.getNumDirtyEntries()`.
1761 pub fn dirty_count(&self) -> usize {
1762 self.entries.iter().filter(|e| e.dirty).count()
1763 }
1764
1765 /// Decide whether to log this BIN as a delta (true) or a full BIN (false).
1766 ///
1767 /// Faithful port of JE `BIN.shouldLogDelta()` (BIN.java:1892). The
1768 /// decision is COUNT-based (number of would-be delta slots vs a percent of
1769 /// `nEntries`), NOT a dirty-fraction-vs-hardcoded-0.25 heuristic:
1770 ///
1771 /// ```text
1772 /// if (isBINDelta()) { return true; } // already a delta
1773 /// if (isDeltaProhibited()) return false; // prohibit / no prior full
1774 /// numDeltas = getNDeltas();
1775 /// if (numDeltas <= 0) return false; // empty delta is invalid
1776 /// deltaLimit = (getNEntries() * binDeltaPercent) / 100; // INTEGER math
1777 /// return numDeltas <= deltaLimit;
1778 /// ```
1779 ///
1780 /// `numDeltas` (JE `getNDeltas`) is the count of slots that would appear in
1781 /// the delta — i.e. the dirty slots since the last full BIN — which here is
1782 /// `dirty_count()`. `binDeltaPercent` is the CONFIGURABLE `TREE_BIN_DELTA`
1783 /// param (JE `DatabaseImpl.getBinDeltaPercent()`, default 25), threaded in
1784 /// by the checkpointer — NOT a hardcoded constant.
1785 ///
1786 /// `isDeltaProhibited()` (BIN.java:1867) is
1787 /// `getProhibitNextDelta() || isDeferredWriteMode() || lastFullLsn == NULL`.
1788 /// Deferred-write mode is not modelled in the runtime stub; the other two
1789 /// terms are.
1790 ///
1791 /// JE ref: `BIN.shouldLogDelta` (BIN.java:1892), `BIN.isDeltaProhibited`
1792 /// (BIN.java:1867).
1793 pub fn should_log_delta(&self, bin_delta_percent: i32) -> bool {
1794 // Already a delta: re-log as a delta. JE asserts !prohibitNextDelta
1795 // and lastFullLsn != NULL here.
1796 if self.is_delta {
1797 return self.last_full_lsn != NULL_LSN && !self.prohibit_next_delta;
1798 }
1799
1800 // isDeltaProhibited(): cheapest checks first.
1801 if self.prohibit_next_delta || self.last_full_lsn == NULL_LSN {
1802 return false;
1803 }
1804
1805 // numDeltas = getNDeltas(): the dirty slots that would be in the delta.
1806 let num_deltas = self.dirty_count() as i32;
1807
1808 // A delta with zero items is not valid.
1809 if num_deltas <= 0 {
1810 return false;
1811 }
1812
1813 // Configured BinDeltaPercent limit — INTEGER math, exactly as JE.
1814 let delta_limit = (self.entries.len() as i32 * bin_delta_percent) / 100;
1815 num_deltas <= delta_limit
1816 }
1817
1818 /// Comparator-aware binary search: finds `full_key` using `cmp`.
1819 ///
1820 /// Unlike `find_entry_compressed` (which uses suffix-based lexicographic
1821 /// comparison), this decompresses each entry's key to its full form and
1822 /// applies the provided comparator — required for sorted-dup databases
1823 /// where lexicographic suffix comparison would give wrong results when
1824 /// different-length primary keys are in the same BIN.
1825 ///
1826 /// Returns `(idx, exact)`. Does NOT do prefix compression.
1827 ///
1828 /// `IN.findEntry` with btreeComparator active.
1829 pub fn find_entry_cmp(
1830 &self,
1831 full_key: &[u8],
1832 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1833 ) -> (usize, bool) {
1834 // Hot path: avoid per-comparison Vec<u8> allocation.
1835 // When key_prefix is empty the stored suffix IS the full key, so we
1836 // pass the suffix slice directly. When prefix is non-empty we build a
1837 // temporary concatenation only once per comparison using a small
1838 // stack-local Vec that is dropped immediately after the call — this
1839 // still allocates but is limited to O(key_len) bytes per call and
1840 // avoids retaining any heap state between comparisons.
1841 if self.key_prefix.is_empty() {
1842 match self.key_binary_search_by(|s| cmp(s, full_key)) {
1843 Ok(idx) => (idx, true),
1844 Err(idx) => (idx, false),
1845 }
1846 } else {
1847 let prefix = self.key_prefix.as_slice();
1848 match self.key_binary_search_by(|s| {
1849 let mut fk = Vec::with_capacity(prefix.len() + s.len());
1850 fk.extend_from_slice(prefix);
1851 fk.extend_from_slice(s);
1852 cmp(&fk, full_key)
1853 }) {
1854 Ok(idx) => (idx, true),
1855 Err(idx) => (idx, false),
1856 }
1857 }
1858 }
1859
1860 /// Comparator-driven binary search over the node-level key rep (T-2).
1861 /// `cmp(stored_suffix)` returns how the stored slot compares to the
1862 /// search key.
1863 #[inline]
1864 fn key_binary_search_by(
1865 &self,
1866 mut cmp: impl FnMut(&[u8]) -> std::cmp::Ordering,
1867 ) -> Result<usize, usize> {
1868 let mut lo = 0usize;
1869 let mut hi = self.keys.len();
1870 while lo < hi {
1871 let mid = lo + (hi - lo) / 2;
1872 match cmp(self.keys.get(mid)) {
1873 std::cmp::Ordering::Less => lo = mid + 1,
1874 std::cmp::Ordering::Greater => hi = mid,
1875 std::cmp::Ordering::Equal => return Ok(mid),
1876 }
1877 }
1878 Err(lo)
1879 }
1880
1881 /// Returns the LSN of the slot matching `full_key`, if one exists.
1882 ///
1883 /// Used by the recovery LN-redo apply to enforce JE's currency check
1884 /// (`RecoveryManager.redo()` line ~2512): a logged LN is applied only
1885 /// when `logrecLsn > treeLsn`. Returns `None` when the key is absent
1886 /// (always apply). Uses the same lookup variant the matching insert
1887 /// path uses so the comparison is over the right slot.
1888 pub fn redo_slot_lsn(
1889 &self,
1890 full_key: &[u8],
1891 cmp: Option<&dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering>,
1892 key_prefixing: bool,
1893 ) -> Option<Lsn> {
1894 let (idx, found) = match cmp {
1895 Some(c) => self.find_entry_cmp(full_key, c),
1896 None if key_prefixing => self.find_entry_compressed(full_key),
1897 None => {
1898 // insert_raw path: full keys stored verbatim.
1899 match self.key_binary_search(full_key) {
1900 Ok(idx) => (idx, true),
1901 Err(idx) => (idx, false),
1902 }
1903 }
1904 };
1905 if found { Some(self.get_lsn(idx)) } else { None }
1906 }
1907
1908 /// Raw insert (no prefix compression) for databases with
1909 /// `key_prefixing = false`.
1910 ///
1911 /// JE `IN.computeKeyPrefix` returns `null` when
1912 /// `databaseImpl.getKeyPrefixing()` is `false`, so no prefix is ever
1913 /// set on those BINs. Noxu was previously ignoring the flag and always
1914 /// calling `insert_with_prefix`; this method provides the faithful path.
1915 ///
1916 /// The key is stored verbatim (no suffix stripping). An existing
1917 /// `key_prefix` on the BIN is left untouched; callers must ensure it is
1918 /// empty (split_child already guarantees this for new BINs when
1919 /// `key_prefixing = false`).
1920 ///
1921 /// Returns `(slot_index, is_new_insert)`.
1922 ///
1923 /// Ref: `IN.java computeKeyPrefix` ~line 2456,
1924 /// `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
1925 pub fn insert_raw(
1926 &mut self,
1927 full_key: Vec<u8>,
1928 lsn: Lsn,
1929 data: Option<Vec<u8>>,
1930 ) -> (usize, bool) {
1931 // Binary search on the stored (full) keys.
1932 // When key_prefix is empty entries store full keys directly; for
1933 // key_prefixing=false DBs the prefix is always empty.
1934 match self.key_binary_search(full_key.as_slice()) {
1935 Ok(idx) => {
1936 self.set_lsn(idx, lsn); // T-3
1937 self.entries[idx].data = data;
1938 self.entries[idx].dirty = true;
1939 (idx, false)
1940 }
1941 Err(idx) => {
1942 self.insert_slot(idx, full_key, lsn, data);
1943 (idx, true)
1944 }
1945 }
1946 }
1947
1948 /// Comparator-aware insert: inserts `full_key` into the BIN using `cmp`.
1949 ///
1950 /// Prefix compression is DISABLED: the key is stored as-is. This is
1951 /// intentional for sorted-dup databases where the custom comparator
1952 /// requires full-key access at every comparison.
1953 ///
1954 /// Returns `(slot_index, is_new_insert)`.
1955 ///
1956 pub fn insert_cmp(
1957 &mut self,
1958 full_key: Vec<u8>,
1959 lsn: Lsn,
1960 data: Option<Vec<u8>>,
1961 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1962 ) -> (usize, bool) {
1963 if self.key_prefix.is_empty() {
1964 match self.key_binary_search_by(|s| cmp(s, &full_key)) {
1965 Ok(idx) => {
1966 self.set_lsn(idx, lsn); // T-3
1967 self.entries[idx].data = data;
1968 self.entries[idx].dirty = true;
1969 (idx, false)
1970 }
1971 Err(idx) => {
1972 self.insert_slot(idx, full_key, lsn, data);
1973 (idx, true)
1974 }
1975 }
1976 } else {
1977 let prefix = self.key_prefix.clone();
1978 match self.key_binary_search_by(|s| {
1979 let mut fk = Vec::with_capacity(prefix.len() + s.len());
1980 fk.extend_from_slice(&prefix);
1981 fk.extend_from_slice(s);
1982 cmp(&fk, &full_key)
1983 }) {
1984 Ok(idx) => {
1985 // Key exists — update in place.
1986 self.set_lsn(idx, lsn); // T-3
1987 self.entries[idx].data = data;
1988 self.entries[idx].dirty = true;
1989 (idx, false)
1990 }
1991 Err(idx) => {
1992 // New key — insert at sorted position (no prefix compression).
1993 self.insert_slot(idx, full_key, lsn, data);
1994 (idx, true)
1995 }
1996 }
1997 }
1998 }
1999
2000 /// Comparator-aware delete: removes `full_key` from the BIN using `cmp`.
2001 ///
2002 /// Returns `true` if the entry was found and removed.
2003 pub fn delete_cmp(
2004 &mut self,
2005 full_key: &[u8],
2006 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
2007 ) -> bool {
2008 let result = if self.key_prefix.is_empty() {
2009 self.key_binary_search_by(|s| cmp(s, full_key))
2010 } else {
2011 let prefix = self.key_prefix.clone();
2012 self.key_binary_search_by(|s| {
2013 let mut fk = Vec::with_capacity(prefix.len() + s.len());
2014 fk.extend_from_slice(&prefix);
2015 fk.extend_from_slice(s);
2016 cmp(&fk, full_key)
2017 })
2018 };
2019 match result {
2020 Ok(idx) => {
2021 self.entries.remove(idx);
2022 self.keys.remove(idx); // T-2
2023 self.lsn_rep.remove_shift(idx); // T-3
2024 self.dirty = true;
2025 true
2026 }
2027 Err(_) => false,
2028 }
2029 }
2030
2031 /// Serialise ALL entries (full BIN write).
2032 ///
2033 /// Format (per slot): key_len(u32BE) | key | lsn(u64BE) |
2034 /// has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2035 ///
2036 /// Prepended by: node_id(u64BE) | num_entries(u32BE).
2037 ///
2038 /// `BIN.writeToLog()` (non-delta path).
2039 pub fn serialize_full(&self) -> Vec<u8> {
2040 let mut buf = Vec::new();
2041 buf.extend_from_slice(&self.node_id.to_be_bytes());
2042 buf.extend_from_slice(&(self.entries.len() as u32).to_be_bytes());
2043 for i in 0..self.entries.len() {
2044 let full_key = self.get_full_key(i).unwrap_or_default();
2045 buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2046 buf.extend_from_slice(&full_key);
2047 let lsn = self.get_lsn(i); // T-3
2048 let e = &self.entries[i];
2049 buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2050 if let Some(d) = &e.data {
2051 buf.push(1u8);
2052 buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2053 buf.extend_from_slice(d);
2054 } else {
2055 buf.push(0u8);
2056 }
2057 buf.push(e.known_deleted as u8);
2058 }
2059 buf
2060 }
2061
2062 /// Serialise only dirty slots (BIN-delta write).
2063 ///
2064 /// Format (per dirty slot): slot_idx(u32BE) | key_len(u32BE) | key |
2065 /// lsn(u64BE) | has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2066 ///
2067 /// Prepended by: node_id(u64BE) | num_dirty(u32BE).
2068 ///
2069 /// `BIN.writeToLog()` (delta path).
2070 pub fn serialize_delta(&self) -> Vec<u8> {
2071 let dirty: Vec<usize> = (0..self.entries.len())
2072 .filter(|&i| self.entries[i].dirty)
2073 .collect();
2074 let mut buf = Vec::new();
2075 buf.extend_from_slice(&self.node_id.to_be_bytes());
2076 buf.extend_from_slice(&(dirty.len() as u32).to_be_bytes());
2077 for idx in dirty {
2078 buf.extend_from_slice(&(idx as u32).to_be_bytes());
2079 let full_key = self.get_full_key(idx).unwrap_or_default();
2080 buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2081 buf.extend_from_slice(&full_key);
2082 let lsn = self.get_lsn(idx); // T-3
2083 let e = &self.entries[idx];
2084 buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2085 if let Some(d) = &e.data {
2086 buf.push(1u8);
2087 buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2088 buf.extend_from_slice(d);
2089 } else {
2090 buf.push(0u8);
2091 }
2092 buf.push(e.known_deleted as u8);
2093 }
2094 buf
2095 }
2096
2097 /// Deserialise a full BIN from the bytes produced by `serialize_full()`.
2098 ///
2099 /// Returns a `BinStub` with all entries populated and all slots marked
2100 /// clean (they are already on disk at `last_full_lsn`). Returns `None`
2101 /// if the byte slice is malformed.
2102 ///
2103 /// `INLogEntry.readEntry()` / `IN.readFromLog()` (non-delta).
2104 pub fn deserialize_full(bytes: &[u8]) -> Option<BinStub> {
2105 if bytes.len() < 12 {
2106 return None;
2107 }
2108 let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
2109 let num_entries =
2110 u32::from_be_bytes(bytes[8..12].try_into().ok()?) as usize;
2111 let mut pos = 12usize;
2112 let mut entries = Vec::with_capacity(num_entries);
2113 let mut lsns: Vec<Lsn> = Vec::with_capacity(num_entries);
2114 let mut keys: Vec<Vec<u8>> = Vec::with_capacity(num_entries); // T-2
2115 for _ in 0..num_entries {
2116 // key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len(u32BE) | data] | known_deleted(u8)
2117 if pos + 4 > bytes.len() {
2118 return None;
2119 }
2120 let key_len =
2121 u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2122 as usize;
2123 pos += 4;
2124 if pos + key_len > bytes.len() {
2125 return None;
2126 }
2127 let key = bytes[pos..pos + key_len].to_vec();
2128 pos += key_len;
2129 if pos + 8 > bytes.len() {
2130 return None;
2131 }
2132 let lsn = Lsn::from_u64(u64::from_be_bytes(
2133 bytes[pos..pos + 8].try_into().ok()?,
2134 ));
2135 pos += 8;
2136 if pos + 1 > bytes.len() {
2137 return None;
2138 }
2139 let has_data = bytes[pos] != 0;
2140 pos += 1;
2141 let data = if has_data {
2142 if pos + 4 > bytes.len() {
2143 return None;
2144 }
2145 let data_len =
2146 u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2147 as usize;
2148 pos += 4;
2149 if pos + data_len > bytes.len() {
2150 return None;
2151 }
2152 let d = bytes[pos..pos + data_len].to_vec();
2153 pos += data_len;
2154 Some(d)
2155 } else {
2156 None
2157 };
2158 if pos + 1 > bytes.len() {
2159 return None;
2160 }
2161 let known_deleted = bytes[pos] != 0;
2162 pos += 1;
2163 entries.push(BinEntry {
2164 data,
2165 known_deleted,
2166 dirty: false, // freshly loaded from log — clean
2167 expiration_time: 0,
2168 });
2169 keys.push(key); // T-2 (full keys; recompute_key_prefix compresses)
2170 lsns.push(lsn); // T-3
2171 }
2172 // Keys stored in the serialized format are full (uncompressed) keys.
2173 // Re-establish the key prefix after loading so that memory use and
2174 // search performance match an in-memory BIN.
2175 // `IN.readFromLog()` → key prefix is part of the wire
2176 // format in the; in Noxu we store full keys and recompute on load.
2177 let mut bin = BinStub {
2178 node_id,
2179 level: BIN_LEVEL,
2180 entries,
2181 key_prefix: Vec::new(),
2182 dirty: false,
2183 is_delta: false,
2184 last_full_lsn: NULL_LSN, // caller sets this to the logged LSN
2185 last_delta_lsn: NULL_LSN,
2186 generation: 0,
2187 parent: None,
2188 expiration_in_hours: true,
2189 cursor_count: 0,
2190 prohibit_next_delta: false,
2191 lsn_rep: LsnRep::from_lsns(&lsns), // T-3
2192 keys: KeyRep::from_keys(keys), // T-2 (full keys, no prefix yet)
2193 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
2194 };
2195 // Recompute key prefix from the full keys just loaded.
2196 // `IN.recalcKeyPrefix()` called after materializing from log.
2197 if bin.entries.len() >= 2 {
2198 bin.recompute_key_prefix();
2199 } else {
2200 // Even a single-slot BIN should attempt compaction.
2201 bin.keys.compact(bin.compact_max_key_length);
2202 }
2203 Some(bin)
2204 }
2205
2206 /// Deserialise a BIN delta from the bytes produced by `serialize_delta()`.
2207 ///
2208 /// **DO NOT USE for BIN reconstruction.** This helper writes full
2209 /// (uncompressed) keys directly into slots without recomputing the BIN
2210 /// key prefix, so on a prefix-compressed BIN it corrupts the slot keys and
2211 /// breaks the sorted-suffix invariant. It is NOT wired into any live path.
2212 /// The correct delta-reconstruction path is
2213 /// `mutate_to_full_bin` → `apply_delta_to_bin` → `insert_with_prefix`,
2214 /// which recomputes the prefix. This function is retained only for the
2215 /// raw byte-format round-trip and must not be used to reconstitute a BIN.
2216 /// Tracked for removal — see the v3.x review synthesis (storage C-2).
2217 ///
2218 /// Returns `None` if `delta_bytes` is malformed.
2219 pub fn apply_delta(base: &mut BinStub, delta_bytes: &[u8]) -> Option<()> {
2220 if delta_bytes.len() < 12 {
2221 return None;
2222 }
2223 // node_id(u64BE) — must match base
2224 let _node_id = u64::from_be_bytes(delta_bytes[0..8].try_into().ok()?);
2225 let num_dirty =
2226 u32::from_be_bytes(delta_bytes[8..12].try_into().ok()?) as usize;
2227 let mut pos = 12usize;
2228 for _ in 0..num_dirty {
2229 // slot_idx(u32BE) | key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len | data] | known_deleted(u8)
2230 if pos + 4 > delta_bytes.len() {
2231 return None;
2232 }
2233 let slot_idx =
2234 u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2235 as usize;
2236 pos += 4;
2237 if pos + 4 > delta_bytes.len() {
2238 return None;
2239 }
2240 let key_len =
2241 u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2242 as usize;
2243 pos += 4;
2244 if pos + key_len > delta_bytes.len() {
2245 return None;
2246 }
2247 let key = delta_bytes[pos..pos + key_len].to_vec();
2248 pos += key_len;
2249 if pos + 8 > delta_bytes.len() {
2250 return None;
2251 }
2252 let lsn = Lsn::from_u64(u64::from_be_bytes(
2253 delta_bytes[pos..pos + 8].try_into().ok()?,
2254 ));
2255 pos += 8;
2256 if pos + 1 > delta_bytes.len() {
2257 return None;
2258 }
2259 let has_data = delta_bytes[pos] != 0;
2260 pos += 1;
2261 let data = if has_data {
2262 if pos + 4 > delta_bytes.len() {
2263 return None;
2264 }
2265 let data_len = u32::from_be_bytes(
2266 delta_bytes[pos..pos + 4].try_into().ok()?,
2267 ) as usize;
2268 pos += 4;
2269 if pos + data_len > delta_bytes.len() {
2270 return None;
2271 }
2272 let d = delta_bytes[pos..pos + data_len].to_vec();
2273 pos += data_len;
2274 Some(d)
2275 } else {
2276 None
2277 };
2278 if pos + 1 > delta_bytes.len() {
2279 return None;
2280 }
2281 let known_deleted = delta_bytes[pos] != 0;
2282 pos += 1;
2283
2284 // Apply to base: update existing slot or insert new one.
2285 if slot_idx < base.entries.len() {
2286 base.keys.set(slot_idx, key); // T-2
2287 base.set_lsn(slot_idx, lsn); // T-3
2288 base.entries[slot_idx].data = data;
2289 base.entries[slot_idx].known_deleted = known_deleted;
2290 base.entries[slot_idx].dirty = false;
2291 } else {
2292 // Slot index beyond current length — append.
2293 base.entries.push(BinEntry {
2294 data,
2295 known_deleted,
2296 dirty: false,
2297 expiration_time: 0,
2298 });
2299 let n = base.entries.len();
2300 base.keys.insert(n - 1, key); // T-2
2301 base.lsn_rep.set(n - 1, lsn, n); // T-3
2302 }
2303 }
2304 Some(())
2305 }
2306
2307 /// Clear per-slot dirty flags and record `logged_at` as the LSN at which
2308 /// this BIN was last fully logged.
2309 ///
2310 /// Called by the checkpoint path after a successful full-BIN log write.
2311 /// `BIN.afterLog()` / `BIN.setLastFullLsn()`.
2312 pub fn clear_dirty_after_full_log(&mut self, logged_at: Lsn) {
2313 for e in &mut self.entries {
2314 e.dirty = false;
2315 }
2316 self.last_full_lsn = logged_at;
2317 self.dirty = false;
2318 // A full BIN captures all current state, so the delta-chain bound is
2319 // cleared: the next log may once again be a delta.
2320 // JE `IN.afterLog` clears the prohibit flag after a full log
2321 // (IN.java:5557 `bin.setProhibitNextDelta(false)`).
2322 self.prohibit_next_delta = false;
2323 }
2324
2325 /// Clear per-slot dirty flags after a successful delta log write.
2326 ///
2327 /// `last_full_lsn` is NOT updated — the full LSN only changes after a
2328 /// full BIN write.
2329 /// `BIN.afterLog()` (delta path).
2330 pub fn clear_dirty_after_delta_log(&mut self) {
2331 for e in &mut self.entries {
2332 e.dirty = false;
2333 }
2334 self.dirty = false;
2335 }
2336}
2337
2338impl TreeNode {
2339 /// Returns true if this is a BIN (bottom internal node).
2340 pub fn is_bin(&self) -> bool {
2341 matches!(self, TreeNode::Bottom(_))
2342 }
2343
2344 /// Returns the level of this node.
2345 pub fn level(&self) -> i32 {
2346 match self {
2347 TreeNode::Internal(n) => n.level,
2348 TreeNode::Bottom(b) => b.level,
2349 }
2350 }
2351
2352 /// Returns the node id of this node.
2353 pub fn node_id(&self) -> u64 {
2354 match self {
2355 TreeNode::Internal(n) => n.node_id,
2356 TreeNode::Bottom(b) => b.node_id,
2357 }
2358 }
2359
2360 /// Faithful in-memory heap footprint of this node, in bytes.
2361 ///
2362 /// JE `IN.getBudgetedMemorySize()` (IN.java) returns the running
2363 /// `inMemorySize` that `MemoryBudget` tracks for the node: the fixed
2364 /// IN/BIN struct overhead plus, per slot, the fixed entry overhead and the
2365 /// variable key (and embedded-LN data for BINs) bytes. This is the single
2366 /// source of truth for both the live tree accounting and the evictor's
2367 /// detach credit (EV-13) — keeping it on `TreeNode` avoids the formula
2368 /// drifting between `noxu-tree` and `noxu-evictor`.
2369 ///
2370 /// Rust has a fixed struct layout (unlike JE's `Sizeof`-measured JVM
2371 /// constants) so `size_of` is exact for the fixed overheads; the variable
2372 /// part mirrors JE's per-slot `entryKeys`/embedded-data accounting.
2373 pub fn budgeted_memory_size(&self) -> u64 {
2374 use std::mem::size_of;
2375 match self {
2376 TreeNode::Bottom(b) => {
2377 (size_of::<BinStub>()
2378 + b.entries.len() * size_of::<BinEntry>()
2379 + b.key_prefix.len()
2380 + b.keys.memory_size() // T-2: node-level key rep bytes
2381 + b.lsn_rep.memory_size() // T-3: node-level LSN rep bytes
2382 + b.entries
2383 .iter()
2384 .map(|e| {
2385 e.data.as_ref().map(|d| d.len()).unwrap_or(0)
2386 })
2387 .sum::<usize>()) as u64
2388 }
2389 TreeNode::Internal(n) => {
2390 (size_of::<InNodeStub>()
2391 + n.entries.len() * size_of::<InEntry>()
2392 + n.targets.memory_size()
2393 + n.entries.iter().map(|e| e.key.len()).sum::<usize>())
2394 as u64
2395 }
2396 }
2397 }
2398
2399 /// Binary search for a key in this node.
2400 ///
2401 /// For BIN nodes the search is prefix-aware: if the BIN has a key prefix,
2402 /// `key` (a full, uncompressed key) is compared against stored suffixes
2403 /// after stripping the prefix.
2404 /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
2405 ///
2406 /// Returns index with EXACT_MATCH flag set if exact match found.
2407 /// If exact is false, returns insertion point.
2408 pub fn find_entry(&self, key: &[u8], _indicator: bool, exact: bool) -> i32 {
2409 match self {
2410 TreeNode::Internal(n) => {
2411 let result = n
2412 .entries
2413 .binary_search_by(|entry| entry.key.as_slice().cmp(key));
2414 match result {
2415 Ok(idx) => (idx as i32) | EXACT_MATCH,
2416 Err(idx) => {
2417 if exact {
2418 -1
2419 } else {
2420 // Floor (not insertion point): the child slot to
2421 // descend into is the largest entry ≤ key. Slot 0
2422 // is the leftmost child, so a key below every
2423 // separator floors to 0. (St-H5: previously
2424 // returned the insertion point `idx`, which routes
2425 // one child too far right.)
2426 (idx as i32 - 1).max(0)
2427 }
2428 }
2429 }
2430 }
2431 TreeNode::Bottom(b) => {
2432 // Use prefix-aware search: the stored key is a suffix when
2433 // key_prefix is non-empty.
2434 let (idx, found) = b.find_entry_compressed(key);
2435 if found {
2436 (idx as i32) | EXACT_MATCH
2437 } else if exact {
2438 -1
2439 } else {
2440 idx as i32
2441 }
2442 }
2443 }
2444 }
2445
2446 /// Gets the number of entries in this node.
2447 pub fn get_n_entries(&self) -> usize {
2448 match self {
2449 TreeNode::Internal(n) => n.entries.len(),
2450 TreeNode::Bottom(b) => b.entries.len(),
2451 }
2452 }
2453
2454 // ========================================================================
2455 // Dirty flag
2456 // ========================================================================
2457
2458 /// Returns true if this node has been modified since last checkpoint.
2459 ///
2460 /// `IN.getDirty()`.
2461 pub fn is_dirty(&self) -> bool {
2462 match self {
2463 TreeNode::Internal(n) => n.dirty,
2464 TreeNode::Bottom(b) => b.dirty,
2465 }
2466 }
2467
2468 /// Sets or clears the dirty flag on this node.
2469 ///
2470 /// `IN.setDirty(boolean dirty)`.
2471 pub fn set_dirty(&mut self, dirty: bool) {
2472 match self {
2473 TreeNode::Internal(n) => n.dirty = dirty,
2474 TreeNode::Bottom(b) => b.dirty = dirty,
2475 }
2476 }
2477
2478 // ========================================================================
2479 // LRU generation
2480 // ========================================================================
2481
2482 /// Returns the LRU generation counter.
2483 ///
2484 /// `IN.getGeneration()`.
2485 pub fn get_generation(&self) -> u64 {
2486 match self {
2487 TreeNode::Internal(n) => n.generation,
2488 TreeNode::Bottom(b) => b.generation,
2489 }
2490 }
2491
2492 /// Sets the LRU generation counter.
2493 ///
2494 /// `IN.setGeneration(long gen)`.
2495 pub fn set_generation(&mut self, r#gen: u64) {
2496 match self {
2497 TreeNode::Internal(n) => n.generation = r#gen,
2498 TreeNode::Bottom(b) => b.generation = r#gen,
2499 }
2500 }
2501
2502 // ========================================================================
2503 // Parent pointer
2504 // ========================================================================
2505
2506 /// Returns a clone of the weak parent pointer, if any.
2507 pub fn get_parent(&self) -> Option<Weak<RwLock<TreeNode>>> {
2508 match self {
2509 TreeNode::Internal(n) => n.parent.clone(),
2510 TreeNode::Bottom(b) => b.parent.clone(),
2511 }
2512 }
2513
2514 /// Sets the weak parent pointer on this node.
2515 pub fn set_parent(&mut self, parent: Option<Weak<RwLock<TreeNode>>>) {
2516 match self {
2517 TreeNode::Internal(n) => n.parent = parent,
2518 TreeNode::Bottom(b) => b.parent = parent,
2519 }
2520 }
2521
2522 // ========================================================================
2523 // Log serialization
2524 // ========================================================================
2525
2526 /// Estimates the serialized byte size of this node for log/checkpoint use.
2527 ///
2528 /// `IN.getLogSize()` — Noxu-native serialization format.
2529 ///
2530 /// Format (big-endian):
2531 /// - node_id : 8 bytes
2532 /// - level : 4 bytes
2533 /// - n_entries : 4 bytes
2534 /// - dirty : 1 byte
2535 /// - For each entry:
2536 /// - key_len : 2 bytes
2537 /// - key : key_len bytes
2538 /// - lsn : 8 bytes
2539 pub fn log_size(&self) -> usize {
2540 // Fixed header: node_id(8) + level(4) + n_entries(4) + dirty(1)
2541 let mut size: usize = 8 + 4 + 4 + 1;
2542 match self {
2543 TreeNode::Internal(n) => {
2544 for entry in &n.entries {
2545 size += 2 + entry.key.len() + 8; // key_len + key + lsn
2546 }
2547 }
2548 TreeNode::Bottom(b) => {
2549 for i in 0..b.entries.len() {
2550 size += 2 + b.get_key(i).len() + 8; // key_len + key + lsn
2551 }
2552 }
2553 }
2554 size
2555 }
2556
2557 /// Serializes this node to bytes for log writing.
2558 ///
2559 /// `IN.writeToLog(ByteBuffer logBuffer)` — Noxu-native
2560 /// format matching `log_size()`.
2561 pub fn write_to_bytes(&self) -> Vec<u8> {
2562 let mut buf = Vec::with_capacity(self.log_size());
2563 match self {
2564 TreeNode::Internal(n) => {
2565 buf.extend_from_slice(&n.node_id.to_be_bytes());
2566 buf.extend_from_slice(&n.level.to_be_bytes());
2567 buf.extend_from_slice(&(n.entries.len() as u32).to_be_bytes());
2568 buf.push(n.dirty as u8);
2569 for (i, entry) in n.entries.iter().enumerate() {
2570 buf.extend_from_slice(
2571 &(entry.key.len() as u16).to_be_bytes(),
2572 );
2573 buf.extend_from_slice(&entry.key);
2574 buf.extend_from_slice(&n.get_lsn(i).as_u64().to_be_bytes());
2575 }
2576 }
2577 TreeNode::Bottom(b) => {
2578 buf.extend_from_slice(&b.node_id.to_be_bytes());
2579 buf.extend_from_slice(&b.level.to_be_bytes());
2580 buf.extend_from_slice(&(b.entries.len() as u32).to_be_bytes());
2581 buf.push(b.dirty as u8);
2582 for i in 0..b.entries.len() {
2583 let key = b.get_key(i);
2584 buf.extend_from_slice(&(key.len() as u16).to_be_bytes());
2585 buf.extend_from_slice(key);
2586 buf.extend_from_slice(&b.get_lsn(i).as_u64().to_be_bytes());
2587 }
2588 }
2589 }
2590 buf
2591 }
2592}
2593
/// Internal helper used during splits to carry entries of either node kind.
///
/// `BinStub` and `InNodeStub` store different entry types, so we need a
/// common wrapper to pass split slices around without code duplication.
///
/// Every variant holds parallel vectors: index `i` in each vector describes
/// the same slot. Note that the position of the LSN vector differs between
/// the two variants (third for `Internal`, second for `Bottom`).
enum SplitEntries {
    /// Upper-IN entries plus the parallel resident-child pointers (one per
    /// entry; `None` when the child is not cached) and the parallel per-slot
    /// LSNs (T-3: LSNs travel with their slots on a split, just like JE
    /// `IN.split` copies `entryLsnByteArray`/`entryLsnLongArray`).
    ///
    /// Field order: `(entries, children, lsns)`. The slot key is read from
    /// the `InEntry` itself.
    Internal(Vec<InEntry>, Vec<Option<ChildArc>>, Vec<Lsn>),
    /// BIN entries (metadata only) plus the parallel per-slot LSNs and the
    /// parallel FULL keys (T-2: keys live in the node-level `KeyRep`, not in
    /// `BinEntry`, so they travel as a separate `Vec<Vec<u8>>` of full keys
    /// through the split — the new BINs recompute their prefix from these).
    ///
    /// Field order: `(entries, lsns, full_keys)`.
    Bottom(Vec<BinEntry>, Vec<Lsn>, Vec<Vec<u8>>),
}
2610
2611impl SplitEntries {
2612 /// Returns the number of entries.
2613 fn len(&self) -> usize {
2614 match self {
2615 SplitEntries::Internal(v, _, _) => v.len(),
2616 SplitEntries::Bottom(v, _, _) => v.len(),
2617 }
2618 }
2619
2620 /// Returns the key at `index` as a slice.
2621 fn get_key(&self, index: usize) -> &[u8] {
2622 match self {
2623 SplitEntries::Internal(v, _, _) => v[index].key.as_slice(),
2624 SplitEntries::Bottom(_, _, k) => k[index].as_slice(),
2625 }
2626 }
2627
2628 /// Returns a sub-range `[lo, hi)` as a new `SplitEntries`.
2629 fn slice(&self, lo: usize, hi: usize) -> Self {
2630 match self {
2631 SplitEntries::Internal(v, c, l) => SplitEntries::Internal(
2632 v[lo..hi].to_vec(),
2633 c[lo..hi].to_vec(),
2634 l[lo..hi].to_vec(),
2635 ),
2636 SplitEntries::Bottom(v, l, k) => SplitEntries::Bottom(
2637 v[lo..hi].to_vec(),
2638 l[lo..hi].to_vec(),
2639 k[lo..hi].to_vec(),
2640 ),
2641 }
2642 }
2643}
2644
/// Tri-state outcome from one attempt at
/// `Tree::get_adjacent_bin_attempt`.
///
/// Distinguishes "the tree genuinely has no BIN in the requested
/// direction" (→ propagate as end-of-iteration) from "the path we
/// captured was invalidated by a concurrent split" (→ caller
/// retries from root). This distinction is necessary because the cursor
/// translates a `None` from `get_adjacent_bin` into
/// `OperationStatus::NotFound`, which is indistinguishable from a
/// real end-of-tree.
#[derive(Debug)]
enum AdjacentBinOutcome {
    /// A BIN was found in the requested direction. T-3: each slot carries its
    /// `Lsn` alongside the `BinEntry` (the LSN lives in the node's packed
    /// `LsnRep`, not in `BinEntry`, so the scan snapshot pairs them).
    ///
    /// NOTE(review): the trailing `Vec<u8>` of each tuple is presumably the
    /// slot's full key (keys are likewise stored outside `BinEntry`) —
    /// confirm against `get_adjacent_bin_attempt`.
    Found(Vec<(BinEntry, Lsn, Vec<u8>)>),
    /// The tree genuinely has no BIN in the requested direction.
    NoAdjacent,
    /// A concurrent split invalidated our captured path; the
    /// caller should retry from root.
    SplitRaceRetry,
}
2667
/// Split hint for the `splitSpecial` heuristic.
///
/// JE `Tree.forceSplit` tracks `allLeftSideDescent` / `allRightSideDescent`
/// (true if **every** routing decision during the top-down descent followed
/// the leftmost / rightmost child). At split time, when one of those flags
/// is set, `IN.splitSpecial` forces the split index to 1 (left side) or
/// `nEntries - 1` (right side) instead of `nEntries / 2`.
///
/// Effect: for sequential-append workloads the left BIN stays near-full
/// after every split (only one entry migrates to the new sibling), cutting
/// the split count roughly in half and reducing write amplification.
///
/// The hint is a plain `Copy` value with no payload, so it is cheap to
/// carry through the descent and compare with `==`.
///
/// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SplitHint {
    /// Normal midpoint split (`n_entries / 2`).
    Normal,
    /// Key was at position 0 on every level of descent.
    /// → `split_index = 1` so left node keeps all but the first entry.
    AllLeft,
    /// Key was at the rightmost position on every level of descent.
    /// → `split_index = n_entries - 1` so left node keeps almost everything.
    AllRight,
}
2692
2693impl Tree {
2694 /// Creates a new empty tree.
2695 ///
2696 /// Constructor.
2697 pub fn new(database_id: u64, max_entries_per_node: usize) -> Self {
2698 Tree {
2699 database_id,
2700 max_entries_per_node,
2701 root: RwLock::new(None),
2702 root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2703 root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2704 root_splits: AtomicU64::new(0),
2705 relatches_required: AtomicU64::new(0),
2706 key_comparator: None,
2707 memory_counter: None,
2708 in_list_listener: None,
2709 log_manager: None,
2710 redo_capacity_hint: 0,
2711 key_prefixing: false, // JE default: KEY_PREFIXING_DEFAULT = false
2712 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2713 }
2714 }
2715
2716 /// Installs a shared memory counter for evictor / MemoryBudget feedback.
2717 ///
2718 /// → `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
2719 ///. The counter is updated on every BIN entry insert/delete.
2720 pub fn set_memory_counter(&mut self, counter: Arc<AtomicI64>) {
2721 self.memory_counter = Some(counter);
2722 }
2723
2724 /// Installs the [`InListListener`] (the evictor) so node add/access/remove
2725 /// feed the LRU lists. JE: `INList` registration that feeds
2726 /// `Evictor.addBack`/`moveBack`/`remove`.
2727 pub fn set_in_list_listener(&mut self, listener: Arc<dyn InListListener>) {
2728 self.in_list_listener = Some(listener);
2729 }
2730
2731 /// Installs the [`noxu_log::LogManager`] so an evicted root IN can be
2732 /// re-materialized from its persisted LSN on the next access (EV-14).
2733 ///
2734 /// JE: the tree reaches the log through `database.getEnv().getLogManager()`
2735 /// for `ChildReference.fetchTarget`. Noxu installs it directly.
2736 pub fn set_log_manager(&mut self, lm: Arc<noxu_log::LogManager>) {
2737 self.log_manager = Some(lm);
2738 }
2739
2740 /// Drops this tree's `Arc<LogManager>` reference (EV-14 teardown).
2741 ///
2742 /// The env's `Drop` calls this on every tree it owns so the
2743 /// `Tree -> Arc<LogManager> -> Arc<FileManager>` chain cannot keep the
2744 /// FileManager (and its on-disk exclusive lock) alive past environment
2745 /// close. After this the tree can no longer re-fetch an evicted root
2746 /// from the log — which is correct, because the environment is shutting
2747 /// down and the tree is about to be dropped.
2748 pub fn clear_log_manager(&mut self) {
2749 self.log_manager = None;
2750 }
2751
/// T-5: set the compact-key threshold (`TREE_COMPACT_MAX_KEY_LENGTH` /
/// `IN.getCompactMaxKeyLength`). New BINs created by this tree inherit it;
/// `<= 0` disables the compact key rep. Default 16.
///
/// This setter only stores `len`; it does not touch any node that already
/// exists. The constructors initialise the field from
/// `INKeyRep_DEFAULT_MAX_KEY_LENGTH`.
pub fn set_compact_max_key_length(&mut self, len: i32) {
    self.compact_max_key_length = len;
}
2758
2759 /// Notify the listener that a node became resident (JE `Evictor.addBack`).
2760 #[inline]
2761 fn note_added(&self, node_id: u64) {
2762 if let Some(l) = &self.in_list_listener {
2763 l.note_ins_added(node_id);
2764 }
2765 }
2766
2767 /// Notify the listener that a resident node was accessed
2768 /// (JE `Evictor.moveBack` — LRU touch).
2769 #[inline]
2770 fn note_accessed(&self, node_id: u64) {
2771 if let Some(l) = &self.in_list_listener {
2772 l.note_ins_accessed(node_id);
2773 }
2774 }
2775
2776 /// Notify the listener that a node was removed (JE `Evictor.remove`).
2777 #[inline]
2778 fn note_removed(&self, node_id: u64) {
2779 if let Some(l) = &self.in_list_listener {
2780 l.note_ins_removed(node_id);
2781 }
2782 }
2783
2784 /// Creates a new empty tree with a custom key comparator.
2785 ///
2786 /// Used for sorted-duplicate databases where keys are two-part
2787 /// composite keys that require a custom ordering function.
2788 ///
2789 /// Constructor with `btreeComparator` parameter.
2790 pub fn new_with_comparator(
2791 database_id: u64,
2792 max_entries_per_node: usize,
2793 comparator: KeyComparatorFn,
2794 ) -> Self {
2795 Tree {
2796 database_id,
2797 max_entries_per_node,
2798 root: RwLock::new(None),
2799 root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2800 root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2801 root_splits: AtomicU64::new(0),
2802 relatches_required: AtomicU64::new(0),
2803 key_comparator: Some(comparator),
2804 memory_counter: None,
2805 in_list_listener: None,
2806 log_manager: None,
2807 redo_capacity_hint: 0,
2808 key_prefixing: false,
2809 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2810 }
2811 }
2812
/// Sets the key-prefixing flag.
///
/// When `true`, BIN key-prefix compression is enabled: shared leading
/// bytes are factored out of each slot's key. When `false` (the
/// default), keys are stored verbatim — matching JE
/// `DatabaseConfig.setKeyPrefixing(false)` / `IN.computeKeyPrefix`
/// returning `null`.
///
/// This setter only records the flag on the tree; it does not rewrite any
/// node here.
///
/// Ref: `IN.java computeKeyPrefix` ~line 2456.
pub fn set_key_prefixing(&mut self, enabled: bool) {
    self.key_prefixing = enabled;
}
2825
2826 /// Sets the key comparator, replacing any existing one.
2827 pub fn set_comparator(&mut self, comparator: KeyComparatorFn) {
2828 self.key_comparator = Some(comparator);
2829 }
2830
/// Store a capacity hint used by `redo_insert` when it creates the first
/// BIN for this tree (the first-key path).
///
/// The first BIN's `entries` Vec is pre-allocated with
/// `capacity.min(max_entries_per_node)` slots, eliminating the
/// Vec-resize doubling cycle (1 → 2 → 4 → … → cap) that would
/// otherwise occur during the redo loop.
///
/// Call once before the redo loop. Has no effect on `insert` (the
/// normal, non-recovery path).
///
/// This method only records the value; the pre-allocation described above
/// happens in `redo_insert`. A value of 0 means "no hint" (see
/// `get_redo_capacity_hint`).
///
/// Wave 11-K optimisation (Fix 3).
pub fn hint_redo_capacity(&mut self, capacity: usize) {
    self.redo_capacity_hint = capacity;
}
2846
/// Returns the current redo capacity hint (0 = no hint set).
///
/// This is the value last stored by `hint_redo_capacity`, or the
/// constructor's initial 0.
pub fn get_redo_capacity_hint(&self) -> usize {
    self.redo_capacity_hint
}
2851
2852 /// Takes the key comparator out of this tree (leaving None).
2853 pub fn take_comparator(&mut self) -> Option<KeyComparatorFn> {
2854 self.key_comparator.take()
2855 }
2856
/// Returns a reference to the key comparator, if configured.
///
/// Used by `CursorImpl::find_bin_for_key` (R4 fix) so the cursor's own
/// IN-level descent uses the same comparator-aware floor slot as the
/// tree's own search paths. Mirrors JE `DatabaseImpl.getKeyComparator()`.
///
/// Returns `None` when the tree uses plain byte-wise ordering.
pub fn get_comparator(&self) -> Option<&KeyComparatorFn> {
    self.key_comparator.as_ref()
}
2865
2866 /// Returns the key comparator if set, or performs lexicographic comparison.
2867 #[inline]
2868 fn key_cmp(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering {
2869 match &self.key_comparator {
2870 Some(cmp) => cmp(a, b),
2871 None => a.cmp(b),
2872 }
2873 }
2874
2875 /// Floor child slot index for descending an internal node: the largest
2876 /// slot whose key is ≤ `key`. Slot 0 carries a virtual −∞ key (always
2877 /// qualifies); `entries[1..]` are sorted ascending, so this binary-searches
2878 /// the partition point instead of an O(n) linear walk (St-H4). Uses
2879 /// `key_cmp` so a configured custom comparator is honoured on every descent
2880 /// path. Returns 0 for an empty/single-slot node.
2881 fn upper_in_floor_index(&self, entries: &[InEntry], key: &[u8]) -> usize {
2882 if entries.len() <= 1 {
2883 return 0;
2884 }
2885 entries[1..].partition_point(|e| {
2886 self.key_cmp(e.key.as_slice(), key) != std::cmp::Ordering::Greater
2887 })
2888 }
2889
2890 /// Returns true if the tree has no root (is empty).
2891 pub fn is_empty(&self) -> bool {
2892 self.root.read().is_none()
2893 }
2894
2895 /// Sets the root of the tree.
2896 ///
2897 /// Must hold root_latch exclusively before calling.
2898 pub fn set_root(&self, node: TreeNode) {
2899 *self.root.write() = Some(Arc::new(RwLock::new(node)));
2900 }
2901
2902 /// Returns the root Arc, if any.
2903 ///
2904 /// Returns a cloned `Arc` rather than a reference so the caller does not
2905 /// hold the inner `RwLock` guard.
2906 ///
2907 /// EV-14: when the in-memory root has been evicted (`evict_root`) but a
2908 /// persisted version exists (`root_log_lsn` set), this re-materializes it
2909 /// from the log before returning — the faithful equivalent of JE
2910 /// `Tree.getRootIN` always calling `root.fetchTarget(...)`. Returns
2911 /// `None` only for a genuinely empty tree (no resident root and no
2912 /// persisted root LSN).
2913 pub fn get_root(&self) -> Option<Arc<RwLock<TreeNode>>> {
2914 if let Some(r) = self.root.read().clone() {
2915 return Some(r);
2916 }
2917 // Root not resident: re-fetch it from `root_log_lsn` if one exists
2918 // (a no-op returning None when the tree was never populated).
2919 self.fetch_root_from_log()
2920 }
2921
/// Returns the database ID.
///
/// This is the `database_id` the tree was constructed with.
pub fn get_database_id(&self) -> u64 {
    self.database_id
}
2926
2927 /// Count the total number of live (non-deleted) entries across all BINs.
2928 ///
2929 /// Used by `DatabaseImpl::set_recovered_tree()` to initialise the
2930 /// per-database `entry_count` AtomicU64 after recovery replays the log.
2931 pub fn count_entries(&self) -> u64 {
2932 let mut total = 0u64;
2933 if let Some(root) = self.get_root() {
2934 Self::count_entries_recursive(&root, &mut total);
2935 }
2936 total
2937 }
2938
2939 /// DBI-14: collect every live `(full_key, data, lsn)` triple in physical
2940 /// (left-to-right) order. Used by `resort_under_comparator` to rebuild a
2941 /// tree whose slots were laid out in byte order (e.g. by recovery redo,
2942 /// which has no access to the application comparator) under the real
2943 /// configured comparator.
2944 fn collect_all_entries(&self) -> Vec<(Vec<u8>, Vec<u8>, Lsn)> {
2945 let mut out = Vec::new();
2946 if let Some(root) = self.get_root() {
2947 Self::collect_all_entries_recursive(&root, &mut out);
2948 }
2949 out
2950 }
2951
2952 fn collect_all_entries_recursive(
2953 node_arc: &Arc<RwLock<TreeNode>>,
2954 out: &mut Vec<(Vec<u8>, Vec<u8>, Lsn)>,
2955 ) {
2956 let guard = node_arc.read();
2957 match &*guard {
2958 TreeNode::Bottom(b) => {
2959 for i in 0..b.entries.len() {
2960 if b.entries[i].known_deleted {
2961 continue;
2962 }
2963 if let Some(fk) = b.get_full_key(i) {
2964 let data =
2965 b.entries[i].data.clone().unwrap_or_default();
2966 out.push((fk, data, b.get_lsn(i)));
2967 }
2968 }
2969 }
2970 TreeNode::Internal(n) => {
2971 let children: Vec<Arc<RwLock<TreeNode>>> =
2972 n.resident_children();
2973 drop(guard);
2974 for child in &children {
2975 Self::collect_all_entries_recursive(child, out);
2976 }
2977 }
2978 }
2979 }
2980
2981 /// DBI-14: rebuild this tree so that its on-disk byte-ordered slot layout
2982 /// is re-sorted under the currently-configured key comparator.
2983 ///
2984 /// Recovery redo (`redo_insert`) has no access to the application's
2985 /// comparator function — only the persisted identity — so it lays keys
2986 /// out in unsigned-byte order. After `set_recovered_tree` attaches the
2987 /// real comparator, the slots must be re-sorted, or comparator-driven
2988 /// searches would binary-search a tree ordered by the wrong relation.
2989 ///
2990 /// No-op when no comparator is configured (byte order already matches the
2991 /// recovered layout) or when the tree is empty. Mirrors the effect of
2992 /// JE reconstructing the comparator at open and the tree always having
2993 /// been built under it.
2994 pub fn resort_under_comparator(&self) {
2995 if self.key_comparator.is_none() {
2996 return;
2997 }
2998 let entries = self.collect_all_entries();
2999 if entries.is_empty() {
3000 return;
3001 }
3002 // Drop the current root; re-insert every entry through the normal
3003 // comparator-aware insert path so the new layout obeys the comparator.
3004 *self.root.write() = None;
3005 *self.root_log_lsn.write() = noxu_util::NULL_LSN;
3006 for (key, data, lsn) in entries {
3007 // Best-effort: a failed re-insert would be a tree-structure bug;
3008 // surface it loudly in debug builds.
3009 let r = self.insert(key, data, lsn);
3010 debug_assert!(
3011 r.is_ok(),
3012 "resort_under_comparator: re-insert failed: {r:?}"
3013 );
3014 }
3015 }
3016
3017 fn count_entries_recursive(
3018 node_arc: &Arc<RwLock<TreeNode>>,
3019 total: &mut u64,
3020 ) {
3021 let guard = node_arc.read();
3022 match &*guard {
3023 TreeNode::Bottom(b) => {
3024 // Count only live (non-known_deleted) entries.
3025 *total += b.entries.iter().filter(|e| !e.known_deleted).count()
3026 as u64;
3027 }
3028 TreeNode::Internal(n) => {
3029 let children: Vec<Arc<RwLock<TreeNode>>> =
3030 n.resident_children();
3031 drop(guard);
3032 for child in children {
3033 Self::count_entries_recursive(&child, total);
3034 }
3035 }
3036 }
3037 }
3038
3039 /// Sum the real in-memory heap footprint of every resident node in the
3040 /// tree (DBI-23 oracle / reconciliation), in bytes.
3041 ///
3042 /// Walks all resident IN/BIN nodes and adds each node's
3043 /// `budgeted_memory_size` (JE `IN.getBudgetedMemorySize`). This is the
3044 /// authoritative "real heap" figure the incrementally-maintained
3045 /// `memory_counter` is meant to approximate; an engine can call it to
3046 /// reconcile counter drift, and the DBI-23 test uses it as the oracle the
3047 /// live counter must stay within tolerance of.
3048 pub fn total_budgeted_memory(&self) -> u64 {
3049 let mut total = 0u64;
3050 if let Some(root) = self.get_root() {
3051 Self::total_budgeted_memory_recursive(&root, &mut total);
3052 }
3053 total
3054 }
3055
3056 fn total_budgeted_memory_recursive(
3057 node_arc: &Arc<RwLock<TreeNode>>,
3058 total: &mut u64,
3059 ) {
3060 let guard = node_arc.read();
3061 *total += guard.budgeted_memory_size();
3062 if let TreeNode::Internal(n) = &*guard {
3063 let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
3064 drop(guard);
3065 for child in children {
3066 Self::total_budgeted_memory_recursive(&child, total);
3067 }
3068 }
3069 }
3070
/// Search for a BIN that should contain the given key.
///
/// This is the core tree traversal operation. It walks from root to BIN
/// using latch-coupling (acquire child latch, then release parent latch).
///
/// Descends the tree until a BIN is reached, following the child pointer
/// at the slot whose key is the largest key <= the search key (the "LTE"
/// rule). Slot 0 in every upper IN carries a virtual key (-infinity) so any
/// search key routes through it when all real keys are larger.
///
/// Returns a SearchResult indicating where the key is or should be.
/// Returns None if the tree is empty, if an upper IN on the path has no
/// entries, or if a non-resident child cannot be obtained through
/// `child_at_or_fetch`.
pub fn search(&self, key: &[u8]) -> Option<SearchResult> {
    let root = self.get_root()?;

    // Hand-over-hand latch coupling for the descent. At each level we
    // hold a `parking_lot::ArcRwLockReadGuard` on the current node;
    // before dropping it, we acquire the child's read guard via
    // `Arc::read_arc`. This keeps a continuous chain of read locks
    // along the descent path so that no concurrent `split_child(parent,
    // …)` can run on a node we are about to enter — `split_child` takes
    // `parent.write()` to install the new sibling, and that write
    // blocks while we hold `parent.read()`. Without this, the prior
    // pattern (capture child Arc, drop parent guard, then take child
    // read lock) left a window in which a split could relocate the
    // child entries: a search for a key that should have ended up in
    // the new sibling would instead reach the (now left-half) child
    // and return a false `NotFound`.
    //
    // `read_arc()` returns `ArcRwLockReadGuard<RawRwLock, TreeNode>`
    // — a guard that owns its own Arc reference, so it has no
    // borrow lifetime and can be held across loop iterations and
    // assignment.
    let mut guard: parking_lot::ArcRwLockReadGuard<
        parking_lot::RawRwLock,
        TreeNode,
    > = root.read_arc();

    loop {
        if guard.is_bin() {
            // JE: IN.fetchTarget / CursorImpl access moves the reached
            // BIN toward the hot end of the evictor's LRU list
            // (Evictor.moveBack). A freshly split BIN that has not yet
            // been registered is added here (moveBack is add-if-absent).
            if let TreeNode::Bottom(bin) = &*guard {
                self.note_accessed(bin.node_id);
            }
            // Reached a BIN: final key lookup within the same guard.
            // Use indicate_if_duplicate=true so an exact match sets
            // EXACT_MATCH in the return value. Guard against -1 (not
            // found): -1i32 has all bits set, so the naive
            // `index & EXACT_MATCH != 0` check would incorrectly report
            // an exact match for a missing key.
            //
            // `raw_idx` is the slot index handed to the SearchResult: the
            // comparator path passes the index straight through, the
            // other paths strip the flag bits with `& 0xFFFF`.
            let (found, raw_idx) = match &*guard {
                TreeNode::Bottom(bin) => match &self.key_comparator {
                    // Custom ordering: the comparator-aware lookup
                    // returns the index and the exactness separately.
                    Some(cmp) => {
                        let (idx, exact) =
                            bin.find_entry_cmp(key, cmp.as_ref());
                        (exact, idx as i32)
                    }
                    // Byte ordering: decode the flag-carrying i32.
                    None => {
                        let index = guard.find_entry(key, true, true);
                        let exact =
                            index >= 0 && (index & EXACT_MATCH != 0);
                        (exact, index & 0xFFFF)
                    }
                },
                // `is_bin()` was true but the node is not the `Bottom`
                // variant: fall back to the generic lookup.
                _ => {
                    let index = guard.find_entry(key, true, true);
                    let exact = index >= 0 && (index & EXACT_MATCH != 0);
                    (exact, index & 0xFFFF)
                }
            };
            // CursorImpl.isProbablyExpired(): if an exact match
            // was found, check whether the entry's TTL has already elapsed.
            // If it has, treat the slot as not found so callers skip it.
            //
            // TREE-F1: also treat a known_deleted slot as ABSENT on an
            // exact lookup, mirroring the tail of IN.findEntry
            // (IN.java:3197): `if (ret >= 0 && exact &&
            // isEntryKnownDeleted(ret & 0xffff)) return -1;`. KD slots
            // legitimately exist in live BINs during BIN-delta
            // reconstitution until the compressor reclaims them.
            //
            // NOTE(review): the slot is re-derived with a 0x7FFF mask here
            // while `raw_idx` was built with 0xFFFF; the two agree only for
            // indices below 0x8000 — confirm node capacity stays under that.
            let found = if found {
                if let TreeNode::Bottom(bin) = &*guard {
                    let idx = (raw_idx & 0x7FFF) as usize;
                    bin.slot_is_live(idx)
                } else {
                    found
                }
            } else {
                found
            };
            return Some(SearchResult::with_values(found, raw_idx, false));
        }

        // Upper IN: find the child slot with the largest key <= search
        // key, and capture the child Arc WHILE HOLDING the guard.
        // Slot 0 has a virtual key that compares as -infinity.
        //
        // The parent's Arc is cloned up front so it can still be passed to
        // `child_at_or_fetch` after the read guard has been dropped.
        let parent_arc =
            parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
        // Despite its name, `next_arc` holds the child SLOT INDEX of a
        // non-resident child; the resident case `continue`s before this
        // binding is ever assigned.
        let next_arc = match &*guard {
            TreeNode::Internal(n) => {
                if n.entries.is_empty() {
                    return None;
                }
                // Floor slot: the largest slot whose key is <= `key`.
                // Slot 0 always qualifies because its key is the virtual
                // -infinity key.
                let idx = self.upper_in_floor_index(&n.entries, key);
                match n.get_child(idx) {
                    // Resident child: keep the hand-over-hand fast path.
                    Some(c) => {
                        let next_guard = c.read_arc();
                        drop(guard);
                        guard = next_guard;
                        continue;
                    }
                    // EV-14/EV-13: child evicted — re-fetch it from its
                    // slot LSN (JE ChildReference.fetchTarget). Must
                    // drop the parent read guard to upgrade to a write
                    // latch inside child_at_or_fetch.
                    None => idx,
                }
            }
            TreeNode::Bottom(_) => {
                unreachable!("is_bin() returned false above")
            }
        };
        // Slow path: release the parent, fetch the child, then latch it.
        drop(guard);
        let child = self.child_at_or_fetch(&parent_arc, next_arc)?;
        guard = child.read_arc();
    }
}
3206
3207 /// Combined search-and-fetch: descend once to the BIN and return the
3208 /// slot's data together with a reference to the BIN arc.
3209 ///
3210 /// Replaces the previous three-descent sequence on the `Database::get`
3211 /// hot path:
3212 /// 1. `Tree::search` — existence check only.
3213 /// 2. `CursorImpl::get_data_from_tree` — re-descended to fetch data.
3214 /// 3. `CursorImpl::find_bin_for_key` — re-descended for BIN pinning.
3215 ///
3216 /// One descent now does all three jobs. At the BIN level it uses the
3217 /// existing binary-search helper `find_entry_compressed` instead of the
3218 /// O(n) `iter().find()` used by `get_data_from_tree`.
3219 ///
3220 /// Returns `None` only when the tree is empty. Otherwise returns
3221 /// `Some(SlotFetch)` — callers must inspect `SlotFetch::found` to
3222 /// determine whether the key was present. The BIN read-guard is released
3223 /// before this method returns so callers may safely call `lock_ln`
3224 /// (which may block) without holding any tree latch.
3225 ///
3226 /// Wave-11-I — see the 2026 review.
3227 pub fn search_with_data(&self, key: &[u8]) -> Option<SlotFetch> {
3228 let root = self.get_root()?;
3229 let mut guard: parking_lot::ArcRwLockReadGuard<
3230 parking_lot::RawRwLock,
3231 TreeNode,
3232 > = root.read_arc();
3233
3234 loop {
3235 if guard.is_bin() {
3236 // Capture the BIN Arc before inspecting entries.
3237 let bin_arc =
3238 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3239
3240 let (found, data, lsn, slot_index) = match &*guard {
3241 TreeNode::Bottom(bin) => {
3242 let (idx, exact) = match &self.key_comparator {
3243 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3244 None => bin.find_entry_compressed(key),
3245 };
3246 if exact {
3247 // TREE-F1: a slot is reported as found only when
3248 // live (not known_deleted, not TTL-expired) — the
3249 // same predicate used by Tree::search and the
3250 // cursor scan. Mirrors IN.findEntry (IN.java:3197)
3251 // and CursorImpl.isProbablyExpired.
3252 if bin.slot_is_live(idx) {
3253 let lsn = bin.get_lsn(idx); // T-3
3254 let e = &bin.entries[idx];
3255 (true, e.data.clone(), lsn.as_u64(), idx)
3256 } else {
3257 (false, None, 0u64, 0)
3258 }
3259 } else {
3260 (false, None, 0u64, 0)
3261 }
3262 }
3263 _ => (false, None, 0u64, 0),
3264 };
3265 // Release the BIN read guard before returning so the caller
3266 // can call lock_ln (which may block) without holding a latch.
3267 drop(guard);
3268 return Some(SlotFetch {
3269 found,
3270 data,
3271 lsn,
3272 slot_index,
3273 bin_arc,
3274 });
3275 }
3276
3277 // Upper IN: same hand-over-hand descent as `Tree::search`.
3278 let parent_arc =
3279 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3280 let next_idx = match &*guard {
3281 TreeNode::Internal(n) => {
3282 if n.entries.is_empty() {
3283 return None;
3284 }
3285 // Slot 0 = virtual −∞; walk forward while entry.key ≤ key.
3286 let idx = self.upper_in_floor_index(&n.entries, key);
3287 match n.get_child(idx) {
3288 Some(c) => {
3289 let next_guard = c.read_arc();
3290 drop(guard);
3291 guard = next_guard;
3292 continue;
3293 }
3294 // EV-14/EV-13: re-fetch an evicted child from its LSN.
3295 None => idx,
3296 }
3297 }
3298 TreeNode::Bottom(_) => {
3299 unreachable!("is_bin() returned false above")
3300 }
3301 };
3302 drop(guard);
3303 let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3304 guard = child.read_arc();
3305 }
3306 }
3307
3308 /// Sets the expiration time (in absolute hours since Unix epoch) for an
3309 /// existing key's BIN slot.
3310 ///
3311 /// Returns `true` if the key was found and updated, `false` otherwise.
3312 ///
3313 /// Used by `Database::put_with_options()` to apply per-record TTL.
3314 /// `IN.entryExpiration` / `BIN.expirationInHours` path.
3315 pub fn update_key_expiration(
3316 &self,
3317 key: &[u8],
3318 expiration_hours: u32,
3319 ) -> bool {
3320 let root = match self.get_root() {
3321 Some(r) => r,
3322 None => return false,
3323 };
3324 // Hand-over-hand latch coupling for the descent. At the BIN we
3325 // need a write lock; we drop our read lock first and take the
3326 // write lock under the protection of the *outer* parent's read
3327 // lock (held by the previous loop iteration's guard). For the
3328 // first iteration there is no outer parent, but no `split_child`
3329 // can run on the root itself in that single-level case because
3330 // root splits go through `split_root_if_needed` which holds
3331 // `self.root.write()`. So the worst case is that the root is
3332 // promoted from a single BIN to a level-2 IN between our read
3333 // detect and our write — handled by the `is_bin` re-check
3334 // inside the write lock.
3335 //
3336 // We retry the descent up to a small bound to absorb the rare
3337 // case where a concurrent split moved this key into the new
3338 // sibling between the read-chain release and the write-lock
3339 // acquisition. Without the retry, the sole caller
3340 // (`Database::put_with_options`) would silently lose the TTL
3341 // for the affected key. Three attempts is generous: each
3342 // retry only races a single split and splits are infrequent.
3343 for _ in 0..3 {
3344 let mut guard: parking_lot::ArcRwLockReadGuard<
3345 parking_lot::RawRwLock,
3346 TreeNode,
3347 > = root.read_arc();
3348 let bin_arc;
3349 loop {
3350 if guard.is_bin() {
3351 bin_arc =
3352 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3353 drop(guard);
3354 break;
3355 }
3356 let next_arc = match &*guard {
3357 TreeNode::Internal(n) => {
3358 if n.entries.is_empty() {
3359 return false;
3360 }
3361 let idx = self.upper_in_floor_index(&n.entries, key);
3362 match n.get_child(idx) {
3363 Some(c) => c,
3364 None => return false,
3365 }
3366 }
3367 TreeNode::Bottom(_) => unreachable!(),
3368 };
3369 let next_guard = next_arc.read_arc();
3370 drop(guard);
3371 guard = next_guard;
3372 }
3373
3374 // Now take the write lock on the BIN we descended to.
3375 let mut wguard = bin_arc.write();
3376 if let TreeNode::Bottom(bin) = &mut *wguard {
3377 let slot = if let Some(cmp) = &self.key_comparator {
3378 let (idx, exact) = bin.find_entry_cmp(key, cmp.as_ref());
3379 if exact { Some(idx) } else { None }
3380 } else {
3381 let (idx, exact) = bin.find_entry_compressed(key);
3382 if exact { Some(idx) } else { None }
3383 };
3384 if let Some(slot_idx) = slot
3385 && let Some(entry) = bin.entries.get_mut(slot_idx)
3386 {
3387 entry.expiration_time = expiration_hours;
3388 bin.expiration_in_hours = true;
3389 bin.dirty = true;
3390 return true;
3391 }
3392 }
3393 // Key not in this BIN — either it was never present or a
3394 // concurrent split moved it. Retry the descent; at most a
3395 // few iterations are needed to follow the key into its new
3396 // BIN.
3397 }
3398 false
3399 }
3400
3401 /// Returns the key and data of the first BIN entry at or after `key`.
3402 ///
3403 /// Descends with the tree's key comparator (same path as `search()`), then
3404 /// within the BIN finds the first slot whose stored key >= `key` using the
3405 /// comparator. Returns `None` if every entry in the tree is < `key`.
3406 ///
3407 /// Used by sorted-duplicate cursor `search(Set)` to position at the first
3408 /// (key, data) pair whose two-part key >= `lower_bound(primary_key)`.
3409 ///
3410 /// → BIN scan path.
3411 pub fn first_entry_at_or_after(
3412 &self,
3413 key: &[u8],
3414 ) -> Option<(Vec<u8>, Vec<u8>, u64)> {
3415 // Hand-over-hand latch coupling — see Tree::search for the
3416 // detailed rationale on why this closes a reader-vs-splitter
3417 // race window.
3418 let mut guard: parking_lot::ArcRwLockReadGuard<
3419 parking_lot::RawRwLock,
3420 TreeNode,
3421 > = self.get_root()?.read_arc();
3422
3423 loop {
3424 if guard.is_bin() {
3425 let result = match &*guard {
3426 TreeNode::Bottom(bin) => {
3427 let (mut idx, _exact) = match &self.key_comparator {
3428 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3429 None => bin.find_entry_compressed(key),
3430 };
3431 // TREE-F1: skip non-live slots (known_deleted /
3432 // TTL-expired) at/after the floor index, mirroring the
3433 // cursor getNext skip (CursorImpl.java:2062-2064).
3434 while idx < bin.entries.len() && !bin.slot_is_live(idx)
3435 {
3436 idx += 1;
3437 }
3438 if idx < bin.entries.len() {
3439 let full_key =
3440 bin.get_full_key(idx).unwrap_or_default();
3441 let data = bin.entries[idx]
3442 .data
3443 .clone()
3444 .unwrap_or_default();
3445 let lsn = bin.get_lsn(idx).as_u64(); // T-3
3446 Some((full_key, data, lsn))
3447 } else {
3448 None
3449 }
3450 }
3451 _ => None,
3452 };
3453 return result;
3454 }
3455
3456 // Upper IN: same descent as search().
3457 let parent_arc =
3458 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3459 let next_idx = match &*guard {
3460 TreeNode::Internal(n) => {
3461 if n.entries.is_empty() {
3462 return None;
3463 }
3464 let idx = self.upper_in_floor_index(&n.entries, key);
3465 match n.get_child(idx) {
3466 Some(c) => {
3467 let next_guard = c.read_arc();
3468 drop(guard);
3469 guard = next_guard;
3470 continue;
3471 }
3472 None => idx, // EV-14/EV-13: re-fetch below.
3473 }
3474 }
3475 TreeNode::Bottom(_) => unreachable!(),
3476 };
3477 drop(guard);
3478 let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3479 guard = child.read_arc();
3480 }
3481 }
3482
3483 /// Like [`Tree::first_entry_at_or_after`] but also returns the BIN node
3484 /// (so callers may pin it) and the entry's slot index inside that
3485 /// BIN.
3486 ///
3487 /// Wave 11-N (Bug 2): `CursorImpl::search_dup` previously stored
3488 /// `current_index = 0` after a sorted-dup `Search`, which broke the
3489 /// fast-path of `retrieve_next` (and the slow path's
3490 /// `next_index = current_index + 1` arithmetic) for any primary
3491 /// that was not the first slot of its BIN. This helper hands back
3492 /// the real index so the cursor can be positioned correctly.
3493 ///
3494 /// CC-2 fix: uses the same `read_arc()` hand-over-hand latch coupling
3495 /// as every other descent method (`search`, `first_entry_at_or_after`,
3496 /// `get_first_node`, `get_adjacent_bin_attempt`). The original
3497 /// implementation did `arc.read().is_bin()` (lock acquired and released)
3498 /// then a SECOND `arc.read()` on the next line — a gap in which a
3499 /// concurrent split can promote the node (BIN→upper IN) or move the
3500 /// sought key to a new sibling, yielding a false "not found" for an
3501 /// existing key. Mirrors JE `Tree.searchSubTree` / `Tree.search`
3502 /// which hold the latch across the `is_bin()` test and the subsequent
3503 /// entry lookup.
3504 pub fn first_entry_at_or_after_with_index(
3505 &self,
3506 key: &[u8],
3507 ) -> Option<(
3508 Vec<u8>,
3509 Vec<u8>,
3510 usize,
3511 u64,
3512 std::sync::Arc<crate::NodeRwLock<TreeNode>>,
3513 )> {
3514 // Hand-over-hand latch coupling — identical strategy to
3515 // first_entry_at_or_after; the guard is held continuously across
3516 // is_bin() and the subsequent entry lookup so no split can
3517 // restructure the path between the two observations.
3518 let mut guard: parking_lot::ArcRwLockReadGuard<
3519 parking_lot::RawRwLock,
3520 TreeNode,
3521 > = self.get_root()?.read_arc();
3522 loop {
3523 if guard.is_bin() {
3524 if let TreeNode::Bottom(bin) = &*guard {
3525 let (idx, _exact) = match &self.key_comparator {
3526 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3527 None => bin.find_entry_compressed(key),
3528 };
3529 // TREE-F1: skip non-live slots (known_deleted /
3530 // TTL-expired) at/after the floor index
3531 // (CursorImpl.java:2062-2064).
3532 let mut idx = idx;
3533 while idx < bin.entries.len() && !bin.slot_is_live(idx) {
3534 idx += 1;
3535 }
3536 if idx < bin.entries.len() {
3537 let full_key =
3538 bin.get_full_key(idx).unwrap_or_default();
3539 let data =
3540 bin.entries[idx].data.clone().unwrap_or_default();
3541 let lsn = bin.get_lsn(idx).as_u64(); // T-3
3542 // Obtain the Arc for the BIN node the guard came from.
3543 // `ArcRwLockReadGuard::rwlock()` returns the backing Arc.
3544 let bin_arc =
3545 parking_lot::ArcRwLockReadGuard::rwlock(&guard)
3546 .clone();
3547 return Some((full_key, data, idx, lsn, bin_arc));
3548 } else {
3549 return None;
3550 }
3551 }
3552 return None;
3553 }
3554
3555 // Upper IN: descend as in first_entry_at_or_after / search.
3556 let parent_arc =
3557 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3558 let next_idx = match &*guard {
3559 TreeNode::Internal(n) => {
3560 if n.entries.is_empty() {
3561 return None;
3562 }
3563 let idx = self.upper_in_floor_index(&n.entries, key);
3564 match n.get_child(idx) {
3565 Some(c) => {
3566 let next_guard = c.read_arc();
3567 drop(guard);
3568 guard = next_guard;
3569 continue;
3570 }
3571 None => idx, // EV-14/EV-13: re-fetch below.
3572 }
3573 }
3574 TreeNode::Bottom(_) => unreachable!(),
3575 };
3576 drop(guard);
3577 let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3578 guard = child.read_arc();
3579 }
3580 }
3581
3582 /// Insert a key/data pair into the tree.
3583 ///
3584 /// . Handles the root-is-null case by
3585 /// creating a two-level tree (upper IN + BIN) per initialisation path,
3586 /// then delegates to `insert_recursive` which performs preemptive splitting
3587 /// as it descends.
3588 ///
3589 /// Returns Ok(true) if this was a new insert, Ok(false) if it was an update.
3590 pub fn insert(
3591 &self,
3592 key: Vec<u8>,
3593 data: Vec<u8>,
3594 lsn: Lsn,
3595 ) -> Result<bool, TreeError> {
3596 // Save sizes before potentially moving key/data — needed for memory tracking.
3597 let key_len = key.len();
3598 let data_len = data.len();
3599
3600 // First-key path. We MUST hold the write lock while testing
3601 // root.is_none() and replacing the root, otherwise N threads can all
3602 // observe an empty tree, each build a fresh single-entry root, and
3603 // the last writer's `*self.root.write() = Some(...)` silently
3604 // discards the others' inserts. (Reproducer:
3605 // xa_protocol_test::test_concurrent_independent_xids — 8 threads
3606 // each inserting their own key into an empty tree lost ~30% of
3607 // inserts before this lock change.)
3608 {
3609 let mut root_guard = self.root.write();
3610 if root_guard.is_none() {
3611 let bin_node_id = generate_node_id();
3612 let root_node_id = generate_node_id();
3613 let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3614 node_id: bin_node_id,
3615 level: BIN_LEVEL,
3616 entries: vec![BinEntry {
3617 data: Some(data),
3618 known_deleted: false,
3619 dirty: false,
3620 expiration_time: 0,
3621 }],
3622 key_prefix: Vec::new(), // single entry — no common prefix yet
3623 dirty: true,
3624 is_delta: false,
3625 last_full_lsn: NULL_LSN,
3626 last_delta_lsn: NULL_LSN,
3627 generation: 0,
3628 parent: None, // set below after root_in is created
3629 // St-H6: use true to match the engine-wide invariant that
3630 // every BIN which may hold TTL entries uses hours granularity
3631 // (JE BIN.java default; matches tree.rs:980 and read_from_log).
3632 expiration_in_hours: true,
3633 cursor_count: 0,
3634 prohibit_next_delta: false,
3635 lsn_rep: LsnRep::from_lsns(&[lsn]),
3636 keys: KeyRep::from_keys(vec![key]), // T-2
3637 compact_max_key_length: self.compact_max_key_length,
3638 })));
3639
3640 // Upper IN at level 2; slot 0 uses an empty key (virtual root key).
3641 let root_arc =
3642 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3643 node_id: root_node_id,
3644 level: MAIN_LEVEL | 2,
3645 entries: vec![InEntry {
3646 key: vec![], // virtual key for slot 0 in upper IN
3647 }],
3648 // T-4: the single resident child at slot 0.
3649 targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3650 dirty: true,
3651 generation: 0,
3652 parent: None,
3653 lsn_rep: LsnRep::from_lsns(&[lsn]),
3654 })));
3655
3656 // Wire the BIN's parent pointer back to the root IN.
3657 {
3658 let mut g = bin.write();
3659 g.set_parent(Some(Arc::downgrade(&root_arc)));
3660 }
3661
3662 *root_guard = Some(root_arc);
3663
3664 // JE: IN.fetchTarget / initial tree build registers the new
3665 // resident nodes with the evictor (Evictor.addBack).
3666 self.note_added(root_node_id);
3667 self.note_added(bin_node_id);
3668
3669 // Count the first entry.
3670 if let Some(counter) = &self.memory_counter {
3671 let delta =
3672 (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3673 counter.fetch_add(delta, Ordering::Relaxed);
3674 }
3675 return Ok(true);
3676 }
3677 // Another thread initialized the root while we were waiting for
3678 // the write lock; fall through and insert into the existing tree.
3679 }
3680
3681 // Check whether the root itself needs to be split before descending.
3682 // Tree.searchSplitsAllowed(): if rootIN.needsSplitting()
3683 // call splitRoot first.
3684 self.split_root_if_needed(lsn)?;
3685
3686 // Recursively insert, splitting children proactively as we descend
3687 // (forceSplit / searchSplitsAllowed pattern).
3688 let root_arc = self.get_root().unwrap();
3689 let result = Self::insert_recursive(
3690 &root_arc,
3691 key,
3692 data,
3693 lsn,
3694 self.max_entries_per_node,
3695 self.key_comparator.as_ref(),
3696 self.key_prefixing,
3697 self.in_list_listener.as_ref(),
3698 )?;
3699
3700 // Update the memory counter for new inserts.
3701 // IN.updateMemorySize(delta) → MemoryBudget.updateTreeMemoryUsage(delta).
3702 // LN_OVERHEAD = 48 bytes (approximate fixed overhead per entry).
3703 if result && let Some(counter) = &self.memory_counter {
3704 let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3705 counter.fetch_add(delta, Ordering::Relaxed);
3706 }
3707
3708 Ok(result)
3709 }
3710
3711 /// Recovery-redo variant of [`Tree::insert`] that accepts `&[u8]` slices.
3712 ///
3713 /// Eliminates the two intermediate `Vec<u8>` allocations that the normal
3714 /// insert path requires at the `redo_ln` call site (one for the key, one
3715 /// for the data). The compressed key suffix and the data bytes are each
3716 /// materialised into their `BinEntry` slots exactly once.
3717 ///
3718 /// Semantics are identical to `insert`:
3719 /// - Updates the existing slot when the key is already present.
3720 /// - Inserts a new sorted entry when the key is absent.
3721 /// - Triggers the same root-split and proactive-split logic.
3722 ///
3723 /// `data` should be the raw value bytes, or an empty slice for a
3724 /// deletion (which should not normally arrive here during redo, but is
3725 /// handled gracefully).
3726 ///
3727 /// Wave 11-K optimisation (Fix 1).
3728 pub fn redo_insert(
3729 &self,
3730 key: &[u8],
3731 data: &[u8],
3732 lsn: Lsn,
3733 ) -> Result<bool, TreeError> {
3734 let key_len = key.len();
3735 let data_len = data.len();
3736 let data_opt: Option<&[u8]> =
3737 if data.is_empty() { None } else { Some(data) };
3738
3739 // First-key path: initialise a two-level tree from scratch.
3740 {
3741 let mut root_guard = self.root.write();
3742 if root_guard.is_none() {
3743 // Pre-allocate the BIN's entries Vec using the redo capacity
3744 // hint (Fix 3). Without the hint the first BIN starts at
3745 // capacity 1 and doubles on each insert; with the hint it
3746 // starts at min(hint, max_entries) entries, eliminating
3747 // ~log2(max_entries) Vec-resize doublings.
3748 let initial_cap = if self.redo_capacity_hint > 0 {
3749 self.redo_capacity_hint.min(self.max_entries_per_node)
3750 } else {
3751 1
3752 };
3753 let mut initial_entries = Vec::with_capacity(initial_cap);
3754 initial_entries.push(BinEntry {
3755 data: data_opt.map(|d| d.to_vec()),
3756 known_deleted: false,
3757 dirty: false,
3758 expiration_time: 0,
3759 });
3760 let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3761 node_id: generate_node_id(),
3762 level: BIN_LEVEL,
3763 entries: initial_entries,
3764 key_prefix: Vec::new(),
3765 dirty: true,
3766 is_delta: false,
3767 last_full_lsn: NULL_LSN,
3768 last_delta_lsn: NULL_LSN,
3769 generation: 0,
3770 parent: None,
3771 // St-H6: use true to match the engine-wide hours-only
3772 // invariant (JE BIN.java default; matches tree.rs:980).
3773 expiration_in_hours: true,
3774 cursor_count: 0,
3775 prohibit_next_delta: false,
3776 lsn_rep: LsnRep::from_lsns(&[lsn]),
3777 keys: KeyRep::from_keys(vec![key.to_vec()]), // T-2
3778 compact_max_key_length: self.compact_max_key_length,
3779 })));
3780
3781 let root_arc =
3782 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3783 node_id: generate_node_id(),
3784 level: MAIN_LEVEL | 2,
3785 entries: vec![InEntry { key: vec![] }],
3786 // T-4: the single resident child at slot 0.
3787 targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3788 dirty: true,
3789 generation: 0,
3790 parent: None,
3791 lsn_rep: LsnRep::from_lsns(&[lsn]),
3792 })));
3793
3794 {
3795 let mut g = bin.write();
3796 g.set_parent(Some(Arc::downgrade(&root_arc)));
3797 }
3798
3799 *root_guard = Some(root_arc);
3800
3801 if let Some(counter) = &self.memory_counter {
3802 let delta =
3803 (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3804 counter.fetch_add(delta, Ordering::Relaxed);
3805 }
3806 return Ok(true);
3807 }
3808 }
3809
3810 self.split_root_if_needed(lsn)?;
3811
3812 let root_arc = self.get_root().unwrap();
3813 let result = Self::redo_insert_recursive(
3814 &root_arc,
3815 key,
3816 data_opt,
3817 lsn,
3818 self.max_entries_per_node,
3819 self.key_comparator.as_ref(),
3820 self.key_prefixing,
3821 )?;
3822
3823 if result && let Some(counter) = &self.memory_counter {
3824 let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3825 counter.fetch_add(delta, Ordering::Relaxed);
3826 }
3827
3828 Ok(result)
3829 }
3830
3831 /// Splits the root node if it is full (needsSplitting).
3832 ///
3833 ///
3834 /// ```text
3835 /// 1. Save oldRoot (the current root IN or BIN).
3836 /// 2. Create newRoot at oldRoot.level + 1.
3837 /// 3. Insert oldRoot into newRoot at slot 0 with a virtual (empty) key.
3838 /// 4. Call split_node on oldRoot, passing newRoot as parent.
3839 /// 5. Replace tree root with newRoot.
3840 /// ```
3841 fn split_root_if_needed(&self, lsn: Lsn) -> Result<(), TreeError> {
3842 // Hold `self.root.write()` across the needs_split check and the
3843 // root promotion, mirroring the first-key path fix and matching
3844 // the broader insert/split serialisation discipline.
3845 //
3846 // With the previous read-then-write pattern, two concurrent
3847 // splitters could each observe needs_split == true, then take()
3848 // and install in turn, with the second wrapping the first's
3849 // already-promoted root in its own new IN. Each level wraps the
3850 // previous, producing a chain of one-child internal nodes. No
3851 // data is lost (every entry is still reachable) but the tree
3852 // becomes unnecessarily deep, and the imbalance can compound
3853 // under heavy concurrent insertion.
3854 let mut root_guard = self.root.write();
3855 let needs_split = match root_guard.as_ref() {
3856 Some(arc) => {
3857 let g = arc.read();
3858 g.get_n_entries() >= self.max_entries_per_node
3859 }
3860 None => false,
3861 };
3862 if !needs_split {
3863 return Ok(());
3864 }
3865
3866 // Create a fresh new root one level above the current root.
3867 let old_root_arc = root_guard.take().expect("checked Some above");
3868 let old_root_level = {
3869 let g = old_root_arc.read();
3870 g.level()
3871 };
3872
3873 // newRoot = new IN(level = oldRoot.level + 1) with slot 0 = oldRoot.
3874 // The key at slot 0 is the virtual key (empty slice) following the
3875 // convention that entry-zero in an upper IN compares as -infinity.
3876 let new_root_arc =
3877 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3878 node_id: generate_node_id(),
3879 level: old_root_level + 1,
3880 entries: vec![InEntry { key: vec![] }],
3881 // T-4: slot 0's resident child is the old root.
3882 targets: TargetRep::Sparse(vec![(0, old_root_arc.clone())]),
3883 dirty: true,
3884 generation: 0,
3885 parent: None,
3886 lsn_rep: LsnRep::from_lsns(&[lsn]),
3887 })));
3888
3889 // Update the old root's parent pointer to the new root.
3890 {
3891 let mut g = old_root_arc.write();
3892 g.set_parent(Some(Arc::downgrade(&new_root_arc)));
3893 }
3894
3895 // Install the new root before calling split_child so split_child
3896 // (which itself takes parent.write()) can run unencumbered.
3897 *root_guard = Some(new_root_arc.clone());
3898 drop(root_guard);
3899
3900 // Now split the old root (which is now child at slot 0 in new_root).
3901 Self::split_child(
3902 &new_root_arc,
3903 0, // child is at slot 0
3904 self.max_entries_per_node,
3905 lsn,
3906 SplitHint::Normal,
3907 &[], // no insertion key at root-init time
3908 self.key_comparator.as_ref(),
3909 self.key_prefixing,
3910 self.in_list_listener.as_ref(),
3911 )?;
3912
3913 // EVICTOR-RECLAIM-1: register the freshly-promoted root IN with the
3914 // evictor's LRU (JE Tree.splitRoot adds the new root to the INList).
3915 // split_child above already registers the new sibling.
3916 let new_root_id = match &*new_root_arc.read() {
3917 TreeNode::Internal(n) => n.node_id,
3918 TreeNode::Bottom(b) => b.node_id,
3919 };
3920 self.note_added(new_root_id);
3921
3922 self.root_splits.fetch_add(1, Ordering::Relaxed);
3923 Ok(())
3924 }
3925
3926 /// Splits the child at `child_index` in `parent`.
3927 ///
3928 /// . This implementation always keeps the **left** half in the
3929 /// existing child node (`child_arc`) and puts the right half in the new
3930 /// sibling, regardless of where the `identifierKey` falls. JE's
3931 /// `IN.splitInternal` (`idKeyIndex` logic ~line 4172) can place either
3932 /// half in the existing node; Noxu's preemptive-split discipline ensures
3933 /// the parent always has a free slot at split time (the split is done on
3934 /// the way *down*, before the parent fills up), so the safe simplification
3935 /// of always using the left half is correct here — no routing information
3936 /// is lost. This comment replaces the previous incorrect claim that
3937 /// `idKeyIndex` drove the choice.
3938 ///
3939 /// Note: does not emit a split log entry; split nodes are marked dirty
3940 /// and flushed at the next checkpoint (flush_dirty_bins/upper_ins).
3941 ///
3942 /// ```text
3943 /// 1. splitIndex = child.nEntries / 2 (or 1 / n-1 for splitSpecial)
3944 /// 2. Create newSibling at the same level.
3945 /// 3. Move entries [splitIndex..nEntries) to newSibling.
3946 /// 4. Update parent slot childIndex -> child (left half),
3947 /// insert newSibling with newIdKey after childIndex.
3948 /// ```
3949 fn split_child(
3950 parent: &Arc<RwLock<TreeNode>>,
3951 child_index: usize,
3952 max_entries: usize,
3953 lsn: Lsn,
3954 hint: SplitHint,
3955 insert_key: &[u8],
3956 key_comparator: Option<&KeyComparatorFn>,
3957 key_prefixing: bool,
3958 listener: Option<&Arc<dyn InListListener>>,
3959 ) -> Result<(), TreeError> {
3960 // The split is performed under `parent.write()` for the entire
3961 // duration. This is a deliberate choice for correctness:
3962 //
3963 // - Without it, between dropping `child.write()` (after installing
3964 // the left half) and acquiring `parent.write()` (to install the
3965 // sibling), a concurrent descender can pick `child_arc` from the
3966 // parent (still pointing at it), descend, take `child.write()`
3967 // and insert a key. Whether the descender's key belongs in the
3968 // left half (now in `child`) or the right half (which will be
3969 // in the new sibling) is determined by the parent's split key —
3970 // but the parent doesn't know about the split key yet, so the
3971 // descender's routing decision is based on stale data. If the
3972 // descender's key falls in the right half, it lands in `child`
3973 // (left half) where a future search will not find it: the
3974 // future search descends from the root, the parent now has the
3975 // sibling installed, the search routes the key to the sibling,
3976 // the sibling does not contain the key — silently lost.
3977 //
3978 // - Holding `parent.write()` throughout serialises split_child
3979 // against every descender that wants `parent.read()`. A
3980 // descender already holding `parent.read()` (latch coupling
3981 // from above) keeps split_child waiting at this lock until it
3982 // has finished its own work. Combined, the split + sibling
3983 // install is atomic with respect to descents.
3984 //
3985 // - Splits are infrequent compared to inserts (~ once per
3986 // max_entries new keys) so the extra serialisation here does
3987 // not dominate.
3988 //
3989 // Reproducer that exercises this race:
3990 // crates/noxu-db/tests/concurrent_commits_stress.rs.
3991 let mut parent_write_guard = parent.write();
3992
3993 // Extract the child Arc from the parent slot.
3994 let child_arc = match &*parent_write_guard {
3995 TreeNode::Internal(p) => {
3996 p.get_child(child_index).ok_or(TreeError::SplitRequired)?
3997 }
3998 TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
3999 };
4000
4001 // Gather all entries from the child plus split metadata, AND
4002 // perform the in-place left-half install, all under a single
4003 // write lock on the child. See the earlier comment on the race
4004 // this avoids inside split_child.
4005 let mut child_guard = child_arc.write();
4006 let child_level = child_guard.level();
4007 // St-H6: capture the splitting BIN's expiration_in_hours flag BEFORE
4008 // drop(child_guard) so the right-half sibling inherits it.
4009 // JE: BIN.java::setExpiration calls setExpirationInHours(hours) to
4010 // propagate the flag on split/clone; the Rust split was hardcoding
4011 // false instead of inheriting — this caused hours-granularity TTL
4012 // entries in the right sibling to be read with in_hours=false, making
4013 // the hours-since-epoch value compare as seconds-since-epoch (far in
4014 // the past) and every right-sibling TTL record appear expired.
4015 let bin_expiration_in_hours: bool = match &*child_guard {
4016 TreeNode::Bottom(b) => b.expiration_in_hours,
4017 // Internal nodes do not carry per-entry TTL; default to true
4018 // (the engine-wide invariant for any BIN that may hold TTL data).
4019 TreeNode::Internal(_) => true,
4020 };
4021 // T-2/T-5: the compact-key threshold the new sibling BIN inherits.
4022 // (Only consumed when the child is a BIN; an upper-IN split produces
4023 // upper-IN siblings, which have no compact key rep.)
4024 let bin_compact_max_key_length: i32 = match &*child_guard {
4025 TreeNode::Bottom(b) => b.compact_max_key_length,
4026 TreeNode::Internal(_) => INKeyRep_DEFAULT_MAX_KEY_LENGTH,
4027 };
4028 let (all_entries, bin_old_prefix) = match &*child_guard {
4029 TreeNode::Internal(n) => {
4030 // T-4: capture the parallel resident-child array alongside the
4031 // entries so children travel with their slots through the
4032 // split (JE `IN.split` copies `entryTargets`).
4033 let children: Vec<Option<ChildArc>> =
4034 (0..n.entries.len()).map(|i| n.get_child(i)).collect();
4035 // T-3: capture the parallel per-slot LSNs so they travel with
4036 // their slots (JE `IN.split` copies `entryLsnByteArray`).
4037 let lsns: Vec<Lsn> =
4038 (0..n.entries.len()).map(|i| n.get_lsn(i)).collect();
4039 (
4040 SplitEntries::Internal(n.entries.clone(), children, lsns),
4041 Vec::new(),
4042 )
4043 }
4044 TreeNode::Bottom(b) => {
4045 // Decompress to full keys.
4046 let full: Vec<BinEntry> = (0..b.entries.len())
4047 .map(|i| BinEntry {
4048 data: b.entries[i].data.clone(),
4049 known_deleted: b.entries[i].known_deleted,
4050 dirty: b.entries[i].dirty,
4051 expiration_time: b.entries[i].expiration_time,
4052 })
4053 .collect();
4054 let lsns: Vec<Lsn> =
4055 (0..b.entries.len()).map(|i| b.get_lsn(i)).collect();
4056 // T-2: carry FULL keys through the split; the new BINs
4057 // recompute their own prefix from them.
4058 let full_keys: Vec<Vec<u8>> = (0..b.entries.len())
4059 .map(|i| b.get_full_key(i).unwrap_or_default())
4060 .collect();
4061 (
4062 SplitEntries::Bottom(full, lsns, full_keys),
4063 b.key_prefix.clone(),
4064 )
4065 }
4066 };
4067
4068 // Determine split point — JE `IN.splitSpecial` / `IN.splitInternal`.
4069 //
4070 // Normal midpoint: `n_entries / 2`.
4071 // AllLeft: insertion key is at position 0 on every descend level.
4072 // → split_index = 1 (left half keeps n-1 entries; new right sibling
4073 // gets only the former-first slot, then the insertion fills it).
4074 // This matches JE: `if (leftSide && index == 0) splitInternal(…, 1)`.
4075 // AllRight: insertion key is at the last position on every level.
4076 // → split_index = n_entries - 1 (left half keeps all but one entry).
4077 // JE: `else if (!leftSide && index == nEntries-1) splitInternal(…, nEntries-1)`.
4078 //
4079 // Ref: `IN.java` splitSpecial ~line 4129, splitInternal ~line 4159.
4080 let n_entries = all_entries.len();
4081 let split_index = if n_entries >= 2 {
4082 // Find where insert_key falls in the child.
4083 let insert_idx = {
4084 let mut idx = 0usize;
4085 for i in 1..n_entries {
4086 let ord = match key_comparator {
4087 Some(cmp) => cmp(all_entries.get_key(i), insert_key),
4088 None => all_entries.get_key(i).cmp(insert_key),
4089 };
4090 if ord != std::cmp::Ordering::Greater {
4091 idx = i;
4092 } else {
4093 break;
4094 }
4095 }
4096 idx
4097 };
4098 match hint {
4099 SplitHint::AllLeft if insert_idx == 0 => 1,
4100 SplitHint::AllRight if insert_idx == n_entries - 1 => {
4101 n_entries - 1
4102 }
4103 _ => n_entries / 2,
4104 }
4105 } else {
4106 n_entries / 2
4107 };
4108
4109 // newIdKey — the full key of the first entry of the right half.
4110 // For BIN: entries are already full keys after decompression above.
4111 // For IN: entries carry full keys directly.
4112 let new_id_key = all_entries.get_key(split_index).to_vec();
4113 // Suppress unused-variable warning when no BIN is involved.
4114 let _ = &bin_old_prefix;
4115
4116 // Divide into left and right halves.
4117 let left_entries = all_entries.slice(0, split_index);
4118 let right_entries = all_entries.slice(split_index, n_entries);
4119
4120 // Install the left half into `child_arc` (still under the same
4121 // write lock) and mark the node dirty.
4122 match (&mut *child_guard, &left_entries) {
4123 (TreeNode::Internal(n), SplitEntries::Internal(le, lc, ll)) => {
4124 n.entries = le.clone();
4125 // T-4: reinstall the (now-shorter) left child array.
4126 n.targets = TargetRep::None;
4127 for (i, c) in lc.iter().enumerate() {
4128 if let Some(child) = c {
4129 n.set_child(i, Some(child.clone()));
4130 }
4131 }
4132 // T-3: reinstall the (now-shorter) left LSN array.
4133 n.lsn_rep = LsnRep::from_lsns(ll);
4134 }
4135 (TreeNode::Bottom(b), SplitEntries::Bottom(le, ll, lk)) => {
4136 // Reset prefix; keys arrive as FULL keys (no prefix yet).
4137 b.key_prefix = Vec::new();
4138 // Pre-allocate at max_entries capacity so the left half
4139 // does not need to reallocate on the next insert (Fix 3).
4140 let mut left = Vec::with_capacity(max_entries);
4141 left.extend_from_slice(le);
4142 b.entries = left;
4143 // T-3: reinstall the left LSN array.
4144 b.lsn_rep = LsnRep::from_lsns(ll);
4145 // T-2: reinstall the left key rep from the full keys (Default;
4146 // recompute_key_prefix below compresses + compacts).
4147 b.keys = KeyRep::from_keys(lk.clone());
4148 // Recompute prefix on each half after split (only when
4149 // key_prefixing is enabled for this database).
4150 // JE: IN.computeKeyPrefix returns null when
4151 // databaseImpl.getKeyPrefixing() is false.
4152 // Ref: IN.java computeKeyPrefix ~line 2456.
4153 if key_prefixing && b.entries.len() >= 2 {
4154 b.recompute_key_prefix();
4155 } else {
4156 b.keys.compact(b.compact_max_key_length); // T-2
4157 }
4158 }
4159 _ => return Err(TreeError::SplitRequired),
4160 }
4161 child_guard.set_dirty(true);
4162 drop(child_guard);
4163
4164 // Create the new right-half sibling.
4165 // Parent pointer will be wired in when it is inserted into the parent.
4166 let new_sibling = match right_entries {
4167 SplitEntries::Internal(re, rc, rl) => {
4168 let mut rin = InNodeStub {
4169 node_id: generate_node_id(),
4170 level: child_level,
4171 entries: re,
4172 targets: TargetRep::None,
4173 dirty: true,
4174 generation: 0,
4175 parent: None, // set below
4176 // T-3: the right half's per-slot LSNs.
4177 lsn_rep: LsnRep::from_lsns(&rl),
4178 };
4179 // T-4: install the right half's resident children.
4180 for (i, c) in rc.into_iter().enumerate() {
4181 if c.is_some() {
4182 rin.set_child(i, c);
4183 }
4184 }
4185 Arc::new(RwLock::new(TreeNode::Internal(rin)))
4186 }
4187 SplitEntries::Bottom(re, rl, rk) => {
4188 // Entries arrive as FULL keys; build BinStub with no prefix
4189 // then recompute key prefix for the new sibling.
4190 // Pre-allocate at max_entries capacity so the right half
4191 // does not need to reallocate on the next insert (Fix 3).
4192 let mut right = Vec::with_capacity(max_entries);
4193 right.extend(re);
4194 let mut sibling_bin = BinStub {
4195 node_id: generate_node_id(),
4196 level: child_level,
4197 entries: right,
4198 key_prefix: Vec::new(),
4199 dirty: true,
4200 is_delta: false,
4201 last_full_lsn: NULL_LSN,
4202 last_delta_lsn: NULL_LSN,
4203 generation: 0,
4204 parent: None, // set below
4205 // St-H6 fix: inherit the splitting BIN's flag so that
4206 // is_expired() uses the correct granularity for entries
4207 // that were already in the BIN before the split.
4208 // JE reference: BIN.java::split() propagates
4209 // expirationInHours via setExpirationInHours(hours).
4210 expiration_in_hours: bin_expiration_in_hours,
4211 cursor_count: 0,
4212 prohibit_next_delta: false,
4213 // T-3: the right half's per-slot LSNs.
4214 lsn_rep: LsnRep::from_lsns(&rl),
4215 // T-2: full keys (Default); recompute/compact below.
4216 keys: KeyRep::from_keys(rk),
4217 compact_max_key_length: bin_compact_max_key_length,
4218 };
4219 // St-H6 debug guard: the sibling must carry the same flag as
4220 // the splitting BIN so that in_hours-resolution entries are
4221 // never silently expired by a mismatched false flag.
4222 debug_assert_eq!(
4223 sibling_bin.expiration_in_hours, bin_expiration_in_hours,
4224 "St-H6 invariant: sibling BIN expiration_in_hours must \
4225 match the splitting BIN (got {}, expected {})",
4226 sibling_bin.expiration_in_hours, bin_expiration_in_hours
4227 );
4228
4229 if key_prefixing && sibling_bin.entries.len() >= 2 {
4230 sibling_bin.recompute_key_prefix();
4231 } else {
4232 sibling_bin.keys.compact(bin_compact_max_key_length); // T-2
4233 }
4234 Arc::new(RwLock::new(TreeNode::Bottom(sibling_bin)))
4235 }
4236 };
4237
4238 // Note: the child (left half) was marked dirty earlier under the
4239 // same write lock that installed left_entries; no need to re-take
4240 // the write lock here.
4241
4242 // Insert the new sibling into the parent after child_index.
4243 // We already hold `parent.write()` (taken at the top of the
4244 // function); operate on it directly rather than re-acquiring.
4245 match &mut *parent_write_guard {
4246 TreeNode::Internal(p) => {
4247 let insert_pos = child_index + 1;
4248 // T-4: insert the parent slot and set its cached child via the
4249 // node-level INTargetRep (shifting existing children).
4250 p.insert_entry(
4251 insert_pos,
4252 new_id_key,
4253 lsn,
4254 Some(new_sibling.clone()),
4255 );
4256 // Parent is dirty because it gained a new entry.
4257 p.dirty = true;
4258 }
4259 TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
4260 }
4261
4262 // Wire the new sibling's parent pointer to the parent node
4263 // before releasing parent_write_guard, so a future descent that
4264 // takes parent.read() and finds the sibling immediately sees a
4265 // fully-wired parent pointer.
4266 {
4267 let mut g = new_sibling.write();
4268 g.set_parent(Some(Arc::downgrade(parent)));
4269 }
4270 // T-4: when an upper IN split, the children that moved into the new
4271 // sibling must have their parent back-pointers re-wired to the
4272 // sibling (JE re-parents moved targets in IN.split).
4273 {
4274 let sg = new_sibling.read();
4275 if let TreeNode::Internal(sn) = &*sg {
4276 let moved = sn.resident_children();
4277 drop(sg);
4278 for child in moved {
4279 let mut cg = child.write();
4280 cg.set_parent(Some(Arc::downgrade(&new_sibling)));
4281 }
4282 }
4283 }
4284 drop(parent_write_guard);
4285
4286 // EVICTOR-RECLAIM-1: register the freshly-split sibling with the
4287 // evictor's LRU (JE IN.splitInternal calls inList.add(newSibling)).
4288 // Without this, split-created BINs/INs are invisible to the evictor:
4289 // the policy lists never receive them, every evict_batch phase quota
4290 // is 0, and eviction reclaims nothing under pressure even though the
4291 // nodes are fully resident. Only the very first root+BIN (the
4292 // first-key path) and re-fetched nodes were ever registered.
4293 if let Some(l) = listener {
4294 let sibling_id = match &*new_sibling.read() {
4295 TreeNode::Internal(n) => n.node_id,
4296 TreeNode::Bottom(b) => b.node_id,
4297 };
4298 l.note_ins_added(sibling_id);
4299 }
4300
4301 Ok(())
4302 }
4303
4304 /// Recursive insert with preemptive splitting.
4305 ///
4306 /// Top-down traversal in `Tree.forceSplit` +
4307 /// `Tree.searchSplitsAllowed`:
4308 ///
4309 /// 1. At an upper IN: find which child slot covers `key`, split the child
4310 /// proactively if it is full (so we always have room to insert the split
4311 /// key into the parent), then recurse into the appropriate child.
4312 /// 2. At a BIN: insert the key/data directly.
4313 ///
4314 /// This implements the "preemptive splitting" strategy from the: we split
4315 /// children on the way down so we never need to walk back up.
4316 fn insert_recursive(
4317 node_arc: &Arc<RwLock<TreeNode>>,
4318 key: Vec<u8>,
4319 data: Vec<u8>,
4320 lsn: Lsn,
4321 max_entries: usize,
4322 key_comparator: Option<&KeyComparatorFn>,
4323 key_prefixing: bool,
4324 listener: Option<&Arc<dyn InListListener>>,
4325 ) -> Result<bool, TreeError> {
4326 Self::insert_recursive_inner(
4327 node_arc,
4328 key,
4329 data,
4330 lsn,
4331 max_entries,
4332 key_comparator,
4333 key_prefixing,
4334 true, // all_left_so_far
4335 true, // all_right_so_far
4336 listener,
4337 )
4338 }
4339
4340 /// Inner recursive helper that threads `allLeftSideDescent` /
4341 /// `allRightSideDescent` from `Tree.forceSplit` (JE ~line 1912).
4342 ///
4343 /// Both flags start `true` at the root and are cleared as soon as the
4344 /// descent takes a non-leftmost / non-rightmost child slot. At split
4345 /// time they are forwarded to `split_child` which uses them to pick the
4346 /// `splitSpecial` split index (JE `IN.splitSpecial` ~line 4129).
4347 #[allow(clippy::too_many_arguments)]
4348 fn insert_recursive_inner(
4349 node_arc: &Arc<RwLock<TreeNode>>,
4350 key: Vec<u8>,
4351 data: Vec<u8>,
4352 lsn: Lsn,
4353 max_entries: usize,
4354 key_comparator: Option<&KeyComparatorFn>,
4355 key_prefixing: bool,
4356 all_left_so_far: bool,
4357 all_right_so_far: bool,
4358 listener: Option<&Arc<dyn InListListener>>,
4359 ) -> Result<bool, TreeError> {
4360 // Determine if this is a BIN (leaf level).
4361 //
4362 // We hold a read lock on `node_arc` (the parent of any descent we
4363 // do below) for the duration of this call, releasing it just
4364 // before returning. That achieves *latch coupling*: a concurrent
4365 // `split_child(parent, …)` that wants to reorganise our subtree
4366 // ultimately needs `parent.write()` to install the new sibling,
4367 // and that write blocks until our read lock is dropped. Without
4368 // this, the descender-vs-splitter race goes:
4369 //
4370 // T_X: at root, picks child_arc (BIN), drops root read lock.
4371 // T_Y: at root, runs split_child(root, …): takes child_arc.write(),
4372 // installs left half [E1..E5], creates sibling [E6..E10],
4373 // takes root.write() and inserts the sibling.
4374 // T_X: now takes child_arc.write() and inserts a key whose
4375 // sort order falls in the right half. The key lands in
4376 // child_arc (left half) but a future search descending
4377 // from the root routes that key to the new sibling and
4378 // does not find it — silently lost.
4379 //
4380 // Reproducer: noxu-db/tests/concurrent_commits_stress.rs
4381 // (32 threads × 100 keys, ~1–6 lost writes per run before this fix;
4382 // occasionally hundreds when an entire BIN is orphaned).
4383 let parent_guard = node_arc.read();
4384 let is_bin = parent_guard.is_bin();
4385
4386 if is_bin {
4387 // BIN: drop the read lock and take the write lock; this is
4388 // safe because the *outer* call frame still holds a read
4389 // lock on this BIN's parent (or this is the root, in which
4390 // case the first-key path has already initialised it). A
4391 // concurrent split_child(parent, …) cannot run while the
4392 // outer parent.read() is held, so the BIN cannot be
4393 // restructured between dropping our read lock and acquiring
4394 // our write lock.
4395 drop(parent_guard);
4396 let mut guard = node_arc.write();
4397 match &mut *guard {
4398 TreeNode::Bottom(bin) => {
4399 let is_new = if let Some(cmp) = key_comparator {
4400 // Comparator-based insert: no prefix compression.
4401 let (_idx, new) =
4402 bin.insert_cmp(key, lsn, Some(data), cmp.as_ref());
4403 new
4404 } else if key_prefixing {
4405 // insert_with_prefix handles prefix recomputation when
4406 // the new key shrinks the existing prefix, and also
4407 // initialises the prefix when 2 entries are present for
4408 // the first time.
4409 let (_idx, new) =
4410 bin.insert_with_prefix(key, lsn, Some(data));
4411 new
4412 } else {
4413 // key_prefixing disabled: store full key, no prefix.
4414 // JE: IN.computeKeyPrefix returns null when
4415 // databaseImpl.getKeyPrefixing() is false.
4416 // Ref: IN.java computeKeyPrefix ~line 2456.
4417 let (_idx, new) = bin.insert_raw(key, lsn, Some(data));
4418 new
4419 };
4420 // Mark dirty after any modification.
4421 bin.dirty = true;
4422 Ok(is_new)
4423 }
4424 TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4425 }
4426 } else {
4427 // Upper IN: find the child slot that covers key.
4428 // Index = parent.findEntry(key, false, false)
4429 // Entry zero in an upper IN has a virtual key (-infinity), so
4430 // any real key is routed to at least slot 0.
4431 let (child_index, n_entries_at_level, child_arc) =
4432 match &*parent_guard {
4433 TreeNode::Internal(n) => {
4434 // Binary search for the largest key <= search key.
4435 // Slot 0 always matches (virtual key = -infinity).
4436 let mut idx = 0usize;
4437 for (i, entry) in n.entries.iter().enumerate() {
4438 if i == 0 {
4439 idx = 0;
4440 } else {
4441 let ord = match key_comparator {
4442 Some(cmp) => cmp(
4443 entry.key.as_slice(),
4444 key.as_slice(),
4445 ),
4446 None => {
4447 entry.key.as_slice().cmp(key.as_slice())
4448 }
4449 };
4450 if ord != std::cmp::Ordering::Greater {
4451 idx = i;
4452 } else {
4453 break;
4454 }
4455 }
4456 }
4457 let child =
4458 n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4459 (idx, n.entries.len(), child)
4460 }
4461 TreeNode::Bottom(_) => {
4462 return Err(TreeError::SplitRequired);
4463 }
4464 };
4465
4466 // Update the descent-side flags (JE `Tree.forceSplit` ~1959).
4467 // `allLeftSideDescent` ← still true only if we chose slot 0.
4468 // `allRightSideDescent` ← still true only if we chose the last slot.
4469 let all_left = all_left_so_far && child_index == 0;
4470 let all_right = all_right_so_far
4471 && child_index == n_entries_at_level.saturating_sub(1);
4472
4473 // Proactively split the child if it is full.
4474 // If (child.needsSplitting()) child.split(parent, ...)
4475 let child_full = {
4476 let g = child_arc.read();
4477 g.get_n_entries() >= max_entries
4478 };
4479
4480 if child_full {
4481 // Build the splitSpecial hint from the accumulated flags.
4482 // JE `Tree.forceSplit` ~line 2010:
4483 // if (allLeftSideDescent || allRightSideDescent)
4484 // child.splitSpecial(parent, index, grandParent,
4485 // maxTreeEntriesPerNode, key, allLeftSideDescent)
4486 let hint = match (all_left, all_right) {
4487 (true, _) => SplitHint::AllLeft,
4488 (_, true) => SplitHint::AllRight,
4489 _ => SplitHint::Normal,
4490 };
4491 // split_child(parent, …) needs parent.write(); we must
4492 // drop our parent read lock before calling it.
4493 drop(parent_guard);
4494 Self::split_child(
4495 node_arc,
4496 child_index,
4497 max_entries,
4498 lsn,
4499 hint,
4500 &key,
4501 key_comparator,
4502 key_prefixing,
4503 listener,
4504 )?;
4505
4506 // After the split, re-find which child now covers key.
4507 // Re-enter at the top of the inner function; carry the
4508 // flags (the new topology doesn't invalidate them — we
4509 // still know the overall descent direction).
4510 return Self::insert_recursive_inner(
4511 node_arc,
4512 key,
4513 data,
4514 lsn,
4515 max_entries,
4516 key_comparator,
4517 key_prefixing,
4518 all_left_so_far,
4519 all_right_so_far,
4520 listener,
4521 );
4522 }
4523
4524 // Descend into the child while still holding parent_guard.
4525 // The recursive call will hold child.read() before this
4526 // returns, then drop it; combined with our parent_guard,
4527 // the latch coupling chain is preserved on the way down and
4528 // unwound on the way back up.
4529 let r = Self::insert_recursive_inner(
4530 &child_arc,
4531 key,
4532 data,
4533 lsn,
4534 max_entries,
4535 key_comparator,
4536 key_prefixing,
4537 all_left,
4538 all_right,
4539 listener,
4540 );
4541 drop(parent_guard);
4542 r
4543 }
4544 }
4545
4546 /// Slice-based variant of [`Tree::insert_recursive`] for the recovery redo path.
4547 ///
4548 /// Accepts `key: &[u8]` and `data: Option<&[u8]>` instead of owned
4549 /// `Vec<u8>` values. At the BIN leaf, calls
4550 /// [`BinStub::insert_with_prefix_slice`] which copies bytes into the
4551 /// `BinEntry` exactly once.
4552 ///
4553 /// For the comparator path (custom key comparator), falls back to
4554 /// `insert_cmp` with a one-time `to_vec()` conversion — that path is
4555 /// rare in practice (sorted-dup databases only) and is not on the
4556 /// W11 hot path.
4557 ///
4558 /// Wave 11-K optimisation (Fix 1).
4559 fn redo_insert_recursive(
4560 node_arc: &Arc<RwLock<TreeNode>>,
4561 key: &[u8],
4562 data: Option<&[u8]>,
4563 lsn: Lsn,
4564 max_entries: usize,
4565 key_comparator: Option<&KeyComparatorFn>,
4566 key_prefixing: bool,
4567 ) -> Result<bool, TreeError> {
4568 Self::redo_insert_recursive_inner(
4569 node_arc,
4570 key,
4571 data,
4572 lsn,
4573 max_entries,
4574 key_comparator,
4575 key_prefixing,
4576 true,
4577 true,
4578 )
4579 }
4580
4581 #[allow(clippy::too_many_arguments)]
4582 fn redo_insert_recursive_inner(
4583 node_arc: &Arc<RwLock<TreeNode>>,
4584 key: &[u8],
4585 data: Option<&[u8]>,
4586 lsn: Lsn,
4587 max_entries: usize,
4588 key_comparator: Option<&KeyComparatorFn>,
4589 key_prefixing: bool,
4590 all_left_so_far: bool,
4591 all_right_so_far: bool,
4592 ) -> Result<bool, TreeError> {
4593 let parent_guard = node_arc.read();
4594 let is_bin = parent_guard.is_bin();
4595
4596 if is_bin {
4597 drop(parent_guard);
4598 let mut guard = node_arc.write();
4599 match &mut *guard {
4600 TreeNode::Bottom(bin) => {
4601 // REC-F2: JE redo currency check
4602 // (RecoveryManager.redo() line ~2512/2544). A logged LN
4603 // is applied only when logrecLsn > treeLsn. If the slot
4604 // already holds an equal-or-newer LSN, skip the overwrite
4605 // so an out-of-order (older-LSN) redo cannot revert
4606 // committed data or reset the slot LSN backward. This
4607 // makes redo genuinely idempotent regardless of
4608 // redo/undo phase order. Deletes never reach this path
4609 // (redo_ln routes Delete through tree.delete), so the JE
4610 // "lsnCmp == 0 && isDeletion -> set KD" sub-case does not
4611 // apply here.
4612 let cmp_ref = key_comparator.map(|c| {
4613 c.as_ref()
4614 as &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering
4615 });
4616 if let Some(slot_lsn) =
4617 bin.redo_slot_lsn(key, cmp_ref, key_prefixing)
4618 && lsn <= slot_lsn
4619 {
4620 // Tree already holds an equal-or-newer version.
4621 return Ok(false);
4622 }
4623 let is_new = if let Some(cmp) = key_comparator {
4624 // Comparator path: fall back to owned-Vec variant.
4625 let (_idx, new) = bin.insert_cmp(
4626 key.to_vec(),
4627 lsn,
4628 data.map(|d| d.to_vec()),
4629 cmp.as_ref(),
4630 );
4631 new
4632 } else if key_prefixing {
4633 let (_idx, new) =
4634 bin.insert_with_prefix_slice(key, lsn, data);
4635 new
4636 } else {
4637 // key_prefixing disabled: store full key verbatim.
4638 // Ref: IN.java computeKeyPrefix ~line 2456.
4639 let (_idx, new) = bin.insert_raw(
4640 key.to_vec(),
4641 lsn,
4642 data.map(|d| d.to_vec()),
4643 );
4644 new
4645 };
4646 bin.dirty = true;
4647 Ok(is_new)
4648 }
4649 TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4650 }
4651 } else {
4652 let (child_index, n_entries_at_level, child_arc) =
4653 match &*parent_guard {
4654 TreeNode::Internal(n) => {
4655 let mut idx = 0usize;
4656 for (i, entry) in n.entries.iter().enumerate() {
4657 if i == 0 {
4658 idx = 0;
4659 } else {
4660 let ord = match key_comparator {
4661 Some(cmp) => cmp(entry.key.as_slice(), key),
4662 None => entry.key.as_slice().cmp(key),
4663 };
4664 if ord != std::cmp::Ordering::Greater {
4665 idx = i;
4666 } else {
4667 break;
4668 }
4669 }
4670 }
4671 let child =
4672 n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4673 (idx, n.entries.len(), child)
4674 }
4675 TreeNode::Bottom(_) => {
4676 return Err(TreeError::SplitRequired);
4677 }
4678 };
4679
4680 let all_left = all_left_so_far && child_index == 0;
4681 let all_right = all_right_so_far
4682 && child_index == n_entries_at_level.saturating_sub(1);
4683
4684 let child_full = {
4685 let g = child_arc.read();
4686 g.get_n_entries() >= max_entries
4687 };
4688
4689 if child_full {
4690 let hint = match (all_left, all_right) {
4691 (true, _) => SplitHint::AllLeft,
4692 (_, true) => SplitHint::AllRight,
4693 _ => SplitHint::Normal,
4694 };
4695 drop(parent_guard);
4696 Self::split_child(
4697 node_arc,
4698 child_index,
4699 max_entries,
4700 lsn,
4701 hint,
4702 key,
4703 key_comparator,
4704 key_prefixing,
4705 // Recovery redo path: the listener is not active during
4706 // log replay (the evictor is wired AFTER recovery, and
4707 // the INList is rebuilt separately). EVICTOR-RECLAIM-1
4708 // registration happens on the live insert path.
4709 None,
4710 )?;
4711 return Self::redo_insert_recursive_inner(
4712 node_arc,
4713 key,
4714 data,
4715 lsn,
4716 max_entries,
4717 key_comparator,
4718 key_prefixing,
4719 all_left_so_far,
4720 all_right_so_far,
4721 );
4722 }
4723
4724 let r = Self::redo_insert_recursive_inner(
4725 &child_arc,
4726 key,
4727 data,
4728 lsn,
4729 max_entries,
4730 key_comparator,
4731 key_prefixing,
4732 all_left,
4733 all_right,
4734 );
4735 drop(parent_guard);
4736 r
4737 }
4738 }
4739
4740 /// Pre-warm the tree's internal `Vec<BinEntry>` capacity before a redo
4741 /// pass that will insert approximately `n` records.
4742 ///
4743 /// If the tree is empty, this is a no-op (there is no BIN yet to reserve
4744 /// capacity on). If the tree already has a root BIN (from a previous
4745 /// checkpoint), reserves `n.min(max_entries_per_node)` additional slots
4746 /// in that BIN's entries vector, eliminating the resize-double cycle
4747 /// during the redo loop.
4748 ///
4749 /// Wave 11-K optimisation (Fix 3).
4750 pub fn reserve_redo_capacity(&self, n: usize) {
4751 if n == 0 {
4752 return;
4753 }
4754 let root = match self.get_root() {
4755 Some(r) => r,
4756 None => return,
4757 };
4758 // Descend to the leftmost BIN and reserve there.
4759 let mut arc = root;
4760 loop {
4761 let guard = arc.read();
4762 match &*guard {
4763 TreeNode::Bottom(bin_guard) => {
4764 let additional = n
4765 .min(self.max_entries_per_node)
4766 .saturating_sub(bin_guard.entries.len());
4767 drop(guard);
4768 let mut wguard = arc.write();
4769 if let TreeNode::Bottom(bin) = &mut *wguard {
4770 bin.entries.reserve(additional);
4771 }
4772 return;
4773 }
4774 TreeNode::Internal(inner) => {
4775 let child = inner.get_child(0);
4776 drop(guard);
4777 match child {
4778 Some(c) => arc = c,
4779 None => return,
4780 }
4781 }
4782 }
4783 }
4784 }
4785
4786 /// Get the first (leftmost) BIN in the tree.
4787 ///
4788 /// Descends to the leftmost BIN by
4789 /// always following the first child slot at each upper IN level.
4790 pub fn get_first_node(&self) -> Option<SearchResult> {
4791 let mut guard: parking_lot::ArcRwLockReadGuard<
4792 parking_lot::RawRwLock,
4793 TreeNode,
4794 > = self.get_root()?.read_arc();
4795
4796 loop {
4797 if guard.is_bin() {
4798 let n = guard.get_n_entries();
4799 if n == 0 {
4800 return None;
4801 }
4802 // TREE-F1: return the first LIVE slot, skipping known_deleted
4803 // slots (CursorImpl.java:2062-2064). If the leftmost BIN is
4804 // entirely KD during the reconstitution window the cursor's
4805 // get_first falls through to its cross-BIN advance.
4806 if let TreeNode::Bottom(b) = &*guard {
4807 match (0..b.entries.len()).find(|&i| b.slot_is_live(i)) {
4808 Some(i) => {
4809 return Some(SearchResult::with_values(
4810 true, i as i32, false,
4811 ));
4812 }
4813 None => return None,
4814 }
4815 }
4816 return Some(SearchResult::with_values(true, 0, false));
4817 }
4818
4819 // Capture the leftmost child Arc while holding `guard`, then
4820 // hand-over-hand: take the child read lock before releasing
4821 // the parent's. Same race fix as `Tree::search`.
4822 let next_arc = match &*guard {
4823 TreeNode::Internal(n_node) => n_node.get_child(0)?,
4824 _ => return None,
4825 };
4826 let next_guard = next_arc.read_arc();
4827 drop(guard);
4828 guard = next_guard;
4829 }
4830 }
4831
4832 /// Get the last (rightmost) BIN in the tree.
4833 ///
4834 /// Descends to the rightmost BIN by
4835 /// always following the last child slot at each upper IN level.
4836 pub fn get_last_node(&self) -> Option<SearchResult> {
4837 let mut guard: parking_lot::ArcRwLockReadGuard<
4838 parking_lot::RawRwLock,
4839 TreeNode,
4840 > = self.get_root()?.read_arc();
4841
4842 loop {
4843 if guard.is_bin() {
4844 let n = guard.get_n_entries();
4845 if n == 0 {
4846 return None;
4847 }
4848 // TREE-F1: return the last LIVE slot, skipping known_deleted
4849 // slots (CursorImpl.java:2062-2064).
4850 if let TreeNode::Bottom(b) = &*guard {
4851 match (0..b.entries.len())
4852 .rev()
4853 .find(|&i| b.slot_is_live(i))
4854 {
4855 Some(i) => {
4856 return Some(SearchResult::with_values(
4857 true, i as i32, false,
4858 ));
4859 }
4860 None => return None,
4861 }
4862 }
4863 return Some(SearchResult::with_values(
4864 true,
4865 (n - 1) as i32,
4866 false,
4867 ));
4868 }
4869
4870 // Capture the rightmost child Arc while holding `guard`, then
4871 // hand-over-hand: take the child read lock before releasing
4872 // the parent's. Same race fix as `Tree::search`.
4873 let next_arc = match &*guard {
4874 TreeNode::Internal(n_node) => {
4875 n_node.get_child(n_node.entries.len().saturating_sub(1))?
4876 }
4877 _ => return None,
4878 };
4879 let next_guard = next_arc.read_arc();
4880 drop(guard);
4881 guard = next_guard;
4882 }
4883 }
4884
4885 /// Returns the number of root splits that have occurred.
4886 pub fn get_root_splits(&self) -> u64 {
4887 self.root_splits.load(Ordering::Relaxed)
4888 }
4889
4890 /// Returns the number of relatches required.
4891 pub fn get_relatches_required(&self) -> u64 {
4892 self.relatches_required.load(Ordering::Relaxed)
4893 }
4894
4895 /// Delete a key from the tree.
4896 ///
4897 /// Traverses the tree to find the BIN that should contain the key, then
4898 /// removes the entry. Returns true if the key was found and removed.
4899 ///
4900 /// Delete path in `Tree` from the.
4901 ///
4902 /// In-memory removal only — WAL logging for deletes is handled by the
4903 /// cursor layer (`cursor_impl.rs::log_ln_write`) before this is called,
4904 /// matching separation between LN logging and tree mutation.
4905 pub fn delete(&self, key: &[u8]) -> bool {
4906 let root = match self.get_root() {
4907 Some(r) => r,
4908 None => return false,
4909 };
4910
4911 // F8 consistency: insert accounts key + data + BIN_ENTRY_OVERHEAD; delete must
4912 // subtract the SAME (data_len was previously omitted, leaking
4913 // data_len from the cache counter on every delete and biasing the
4914 // evictor's over-budget view). Peek the data length before deleting.
4915 let data_len = if self.memory_counter.is_some() {
4916 self.search_with_data(key)
4917 .filter(|sf| sf.found)
4918 .and_then(|sf| sf.data.as_ref().map(|d| d.len()))
4919 .unwrap_or(0)
4920 } else {
4921 0
4922 };
4923
4924 let deleted =
4925 Self::delete_recursive(&root, key, self.key_comparator.as_ref());
4926
4927 // Update the memory counter when an entry is removed.
4928 // IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage(-delta).
4929 if deleted && let Some(counter) = &self.memory_counter {
4930 let delta = (key.len() + data_len + BIN_ENTRY_OVERHEAD) as i64;
4931 counter.fetch_sub(delta, Ordering::Relaxed);
4932 }
4933
4934 deleted
4935 }
4936
4937 /// Recursive helper for `delete`: descend to the BIN that holds `key`
4938 /// and remove it.
4939 fn delete_recursive(
4940 node_arc: &Arc<RwLock<TreeNode>>,
4941 key: &[u8],
4942 key_comparator: Option<&KeyComparatorFn>,
4943 ) -> bool {
4944 // Latch coupling, mirroring `insert_recursive`. Without this,
4945 // delete has the same "BIN split out from under us" race: thread
4946 // A finds child_arc as the target BIN under parent.read(), drops
4947 // the lock, and another thread runs split_child(parent, …) that
4948 // moves the target key into the new sibling. A then takes
4949 // child_arc.write(), looks for the key in the (now left-half)
4950 // BIN, doesn't find it, and returns `false`. The caller treats
4951 // the `false` as "key was not present", but the key is actually
4952 // still in the tree (in the sibling). Subsequent operations
4953 // observe a stale record that should have been deleted —
4954 // semantically a lost delete.
4955 let parent_guard = node_arc.read();
4956 let is_bin = parent_guard.is_bin();
4957 let child_arc = if !is_bin {
4958 match &*parent_guard {
4959 TreeNode::Internal(n) => {
4960 // Find child slot with largest key <= search key
4961 let mut idx = 0usize;
4962 for (i, entry) in n.entries.iter().enumerate() {
4963 if i == 0 {
4964 idx = 0;
4965 } else {
4966 let ord = match key_comparator {
4967 Some(cmp) => cmp(entry.key.as_slice(), key),
4968 None => entry.key.as_slice().cmp(key),
4969 };
4970 if ord != std::cmp::Ordering::Greater {
4971 idx = i;
4972 } else {
4973 break;
4974 }
4975 }
4976 }
4977 n.get_child(idx)
4978 }
4979 _ => None,
4980 }
4981 } else {
4982 None
4983 };
4984
4985 if is_bin {
4986 // Drop the read lock before taking the write lock; the outer
4987 // call frame still holds the parent read lock so a concurrent
4988 // split_child cannot run on this BIN's parent until we unwind.
4989 drop(parent_guard);
4990 let mut g = node_arc.write();
4991 match &mut *g {
4992 TreeNode::Bottom(bin) => {
4993 if let Some(cmp) = key_comparator {
4994 bin.delete_cmp(key, cmp.as_ref())
4995 } else {
4996 // Entries store compressed (suffix) keys when key_prefix
4997 // is non-empty. Compress the search key before comparing.
4998 //
4999 // The caller is not required to ensure that `key`
5000 // shares this BIN's learned `key_prefix` — a stray
5001 // delete of a key that was never present (or that
5002 // sits under a different prefix) is legal and must
5003 // simply return `false`. Calling `compress_key`
5004 // unconditionally would `debug_assert!`-panic on
5005 // such inputs, so guard it the same way the cursor
5006 // path does.
5007 if !bin.key_prefix.is_empty()
5008 && !key.starts_with(bin.key_prefix.as_slice())
5009 {
5010 return false;
5011 }
5012 let suffix = bin.compress_key(key);
5013 match bin.key_binary_search(suffix.as_slice()) {
5014 Ok(idx) => {
5015 bin.entries.remove(idx);
5016 bin.keys.remove(idx); // T-2
5017 bin.lsn_rep.remove_shift(idx); // T-3
5018 // Mark dirty after any modification.
5019 bin.dirty = true;
5020 true
5021 }
5022 Err(_) => false,
5023 }
5024 }
5025 }
5026 _ => false,
5027 }
5028 } else {
5029 // Descend with parent_guard still held; the recursion will
5030 // hold its own read lock and drop ours after it returns.
5031 let r = match child_arc {
5032 Some(child) => {
5033 Self::delete_recursive(&child, key, key_comparator)
5034 }
5035 None => false,
5036 };
5037 drop(parent_guard);
5038 r
5039 }
5040 }
5041
5042 // ========================================================================
5043 // B-tree Merge / Compress
5044 // ========================================================================
5045
5046 /// Merge under-full sibling BIN pairs and remove empty subtrees.
5047 ///
5048 /// `INCompressor` / `Tree.compressInternal()` logic.
5049 ///
5050 /// merges two adjacent siblings when their combined entry count is
5051 /// ≤ `max_entries_per_node` (the merge threshold equal to the node
5052 /// capacity). The left sibling's entries are prepended into the right
5053 /// sibling; the parent key slot pointing at the left sibling is then
5054 /// removed from the parent IN with `deleteEntry`. If the parent IN
5055 /// becomes empty after the removal the process repeats recursively up
5056 /// the tree.
5057 ///
5058 /// This implementation performs a single post-order walk so that each
5059 /// level is compressed after all its children have been compressed.
5060 pub fn compress(&self) {
5061 let root = match self.get_root() {
5062 Some(r) => r,
5063 None => return,
5064 };
5065 Self::compress_node(&root, self.max_entries_per_node);
5066 }
5067
5068 /// Recursive post-order compress helper.
5069 ///
5070 /// Visits children first (post-order), then scans adjacent child
5071 /// pairs in the current IN and merges them when the merge condition
5072 /// holds: `left.n_entries + right.n_entries <= max_entries`.
5073 ///
5074 /// After merging, the parent entry for the left sibling is deleted.
5075 /// The loop restarts after each merge so that newly under-full pairs
5076 /// created by previous merges are also considered.
5077 fn compress_node(node_arc: &Arc<RwLock<TreeNode>>, max_entries: usize) {
5078 // Collect child arcs to recurse without holding the node lock.
5079 let children: Vec<Arc<RwLock<TreeNode>>> = {
5080 let g = node_arc.read();
5081 match &*g {
5082 TreeNode::Internal(n) => n.resident_children(),
5083 // BINs are leaves; nothing to compress at this level.
5084 TreeNode::Bottom(_) => return,
5085 }
5086 };
5087
5088 // Post-order: recurse into every child before working on this level.
5089 for child in &children {
5090 Self::compress_node(child, max_entries);
5091 }
5092
5093 // Compress the current IN level: merge adjacent under-full children.
5094 // Repeat until a full pass produces no merges.
5095 loop {
5096 let n_entries = {
5097 let g = node_arc.read();
5098 g.get_n_entries()
5099 };
5100
5101 let mut merged_any = false;
5102
5103 // `i` is the index of the *left* candidate; right is at `i+1`.
5104 let mut i = 0usize;
5105 while i + 1 < n_entries {
5106 // Fetch left and right child arcs.
5107 let (left_arc, right_arc) = {
5108 let g = node_arc.read();
5109 match &*g {
5110 TreeNode::Internal(p) => {
5111 let l = p.get_child(i);
5112 let r = p.get_child(i + 1);
5113 match (l, r) {
5114 (Some(l), Some(r)) => (l, r),
5115 _ => {
5116 i += 1;
5117 continue;
5118 }
5119 }
5120 }
5121 TreeNode::Bottom(_) => return,
5122 }
5123 };
5124
5125 let left_n = { left_arc.read().get_n_entries() };
5126 let right_n = { right_arc.read().get_n_entries() };
5127
5128 // merge condition: combined count fits within one node.
5129 if left_n + right_n > max_entries {
5130 i += 1;
5131 continue;
5132 }
5133
5134 // Determine node kind from left child.
5135 let left_is_bin = { left_arc.read().is_bin() };
5136
5137 if left_is_bin {
5138 // BIN merge: decompress left entries to full keys, then
5139 // prepend into right BIN (also decompressed), and finally
5140 // recompute the merged BIN's prefix.
5141 // merge left into right, then
5142 // recalcKeyPrefix on the merged node.
5143 let left_full_entries: Vec<BinEntry> = {
5144 {
5145 let g = left_arc.read();
5146 match &*g {
5147 TreeNode::Bottom(b) => (0..b.entries.len())
5148 .map(|j| BinEntry {
5149 data: b.entries[j].data.clone(),
5150 known_deleted: b.entries[j]
5151 .known_deleted,
5152 dirty: b.entries[j].dirty,
5153 expiration_time: b.entries[j]
5154 .expiration_time,
5155 })
5156 .collect(),
5157 _ => {
5158 i += 1;
5159 continue;
5160 }
5161 }
5162 }
5163 };
5164 // T-3 / T-2: capture left's per-slot LSNs and FULL keys.
5165 let (left_full_lsns, left_full_keys): (
5166 Vec<Lsn>,
5167 Vec<Vec<u8>>,
5168 ) = {
5169 let g = left_arc.read();
5170 match &*g {
5171 TreeNode::Bottom(b) => (
5172 (0..b.entries.len())
5173 .map(|j| b.get_lsn(j))
5174 .collect(),
5175 (0..b.entries.len())
5176 .map(|j| {
5177 b.get_full_key(j).unwrap_or_default()
5178 })
5179 .collect(),
5180 ),
5181 _ => (Vec::new(), Vec::new()),
5182 }
5183 };
5184 {
5185 {
5186 let mut g = right_arc.write();
5187 match &mut *g {
5188 TreeNode::Bottom(rb) => {
5189 // Decompress right entries to full keys.
5190 let right_full: Vec<BinEntry> = (0..rb
5191 .entries
5192 .len())
5193 .map(|j| BinEntry {
5194 data: rb.entries[j].data.clone(),
5195 known_deleted: rb.entries[j]
5196 .known_deleted,
5197 dirty: rb.entries[j].dirty,
5198 expiration_time: rb.entries[j]
5199 .expiration_time,
5200 })
5201 .collect();
5202 // T-3 / T-2: right's per-slot LSNs + keys.
5203 let right_full_lsns: Vec<Lsn> =
5204 (0..rb.entries.len())
5205 .map(|j| rb.get_lsn(j))
5206 .collect();
5207 let right_full_keys: Vec<Vec<u8>> =
5208 (0..rb.entries.len())
5209 .map(|j| {
5210 rb.get_full_key(j)
5211 .unwrap_or_default()
5212 })
5213 .collect();
5214 // Left entries are all smaller; prepend.
5215 let mut combined = left_full_entries;
5216 combined.extend(right_full);
5217 let mut combined_lsns = left_full_lsns;
5218 combined_lsns.extend(right_full_lsns);
5219 let mut combined_keys = left_full_keys;
5220 combined_keys.extend(right_full_keys);
5221 // Reset prefix and assign full keys.
5222 rb.key_prefix = Vec::new();
5223 rb.entries = combined;
5224 // T-3: rebuild the merged LSN array.
5225 rb.lsn_rep =
5226 LsnRep::from_lsns(&combined_lsns);
5227 // T-2: rebuild the merged key rep (Default;
5228 // recompute below compresses + compacts).
5229 rb.keys = KeyRep::from_keys(combined_keys);
5230 // Recompute prefix on merged BIN.
5231 if rb.entries.len() >= 2 {
5232 rb.recompute_key_prefix();
5233 } else {
5234 rb.keys
5235 .compact(rb.compact_max_key_length);
5236 }
5237 rb.dirty = true;
5238 }
5239 _ => {
5240 i += 1;
5241 continue;
5242 }
5243 }
5244 }
5245 }
5246 // Clear the now-merged left BIN.
5247 {
5248 let mut g = left_arc.write();
5249 if let TreeNode::Bottom(lb) = &mut *g {
5250 lb.entries.clear();
5251 lb.lsn_rep = LsnRep::Empty; // T-3
5252 lb.keys = KeyRep::new(); // T-2
5253 lb.key_prefix = Vec::new();
5254 lb.dirty = true;
5255 }
5256 }
5257 } else {
5258 // Upper-IN merge: prepend left's InEntries into right.
5259 // T-4: capture left's resident children alongside its
5260 // entries so they travel into the merged right IN.
5261 let (left_in_entries, left_children): (
5262 Vec<InEntry>,
5263 Vec<Option<ChildArc>>,
5264 ) = {
5265 let g = left_arc.read();
5266 match &*g {
5267 TreeNode::Internal(n) => {
5268 let children = (0..n.entries.len())
5269 .map(|j| n.get_child(j))
5270 .collect();
5271 (n.entries.clone(), children)
5272 }
5273 _ => {
5274 i += 1;
5275 continue;
5276 }
5277 }
5278 };
5279 // T-3: capture left's per-slot LSNs.
5280 let left_in_lsns: Vec<Lsn> = {
5281 let g = left_arc.read();
5282 match &*g {
5283 TreeNode::Internal(n) => (0..n.entries.len())
5284 .map(|j| n.get_lsn(j))
5285 .collect(),
5286 _ => Vec::new(),
5287 }
5288 };
5289 let n_left = left_in_entries.len();
5290 {
5291 {
5292 let mut g = right_arc.write();
5293 match &mut *g {
5294 TreeNode::Internal(rn) => {
5295 // Snapshot right's existing children, then
5296 // rebuild the merged entry + target arrays
5297 // (left half first, then right half).
5298 let right_children: Vec<Option<ChildArc>> =
5299 (0..rn.entries.len())
5300 .map(|j| rn.get_child(j))
5301 .collect();
5302 // T-3: snapshot right's LSNs too.
5303 let right_in_lsns: Vec<Lsn> =
5304 (0..rn.entries.len())
5305 .map(|j| rn.get_lsn(j))
5306 .collect();
5307 let mut combined = left_in_entries.clone();
5308 combined.append(&mut rn.entries);
5309 rn.entries = combined;
5310 // T-3: rebuild the merged LSN array.
5311 let mut combined_lsns =
5312 left_in_lsns.clone();
5313 combined_lsns.extend(right_in_lsns);
5314 rn.lsn_rep =
5315 LsnRep::from_lsns(&combined_lsns);
5316 rn.targets = TargetRep::None;
5317 for (j, c) in
5318 left_children.iter().enumerate()
5319 {
5320 if let Some(child) = c {
5321 rn.set_child(
5322 j,
5323 Some(child.clone()),
5324 );
5325 }
5326 }
5327 for (j, c) in
5328 right_children.into_iter().enumerate()
5329 {
5330 if c.is_some() {
5331 rn.set_child(n_left + j, c);
5332 }
5333 }
5334 rn.dirty = true;
5335 }
5336 _ => {
5337 i += 1;
5338 continue;
5339 }
5340 }
5341 }
5342 }
5343 // Update parent pointers for moved children.
5344 for child in left_children.into_iter().flatten() {
5345 let mut cg = child.write();
5346 cg.set_parent(Some(Arc::downgrade(&right_arc)));
5347 }
5348 // Clear the now-merged left IN.
5349 {
5350 let mut g = left_arc.write();
5351 if let TreeNode::Internal(ln) = &mut *g {
5352 ln.entries.clear();
5353 ln.lsn_rep = LsnRep::Empty; // T-3
5354 ln.targets = TargetRep::None;
5355 ln.dirty = true;
5356 }
5357 }
5358 }
5359
5360 // Remove the right sibling's parent slot and update
5361 // the left slot to point at the merged right child.
5362 //
5363 // We keep the LEFT slot's key (which is the correct minimum for
5364 // the merged BIN's range) and remove the RIGHT slot (i+1).
5365 // This avoids having to update the parent key when i == 0.
5366 {
5367 {
5368 let mut g = node_arc.write();
5369 match &mut *g {
5370 TreeNode::Internal(p) => {
5371 // Update left slot (i) to point at right_arc
5372 // (which now contains the merged entries).
5373 if i < p.entries.len() {
5374 p.set_child(i, Some(right_arc.clone()));
5375 }
5376 // Remove right slot (i+1) — it is now redundant.
5377 // T-4: remove_entry shifts the child array too.
5378 if i + 1 < p.entries.len() {
5379 p.remove_entry(i + 1);
5380 }
5381 p.dirty = true;
5382 }
5383 TreeNode::Bottom(_) => return,
5384 }
5385 }
5386 }
5387
5388 merged_any = true;
5389 // Advance i to check the merged BIN against its new right
5390 // sibling (the old slot i+2 is now at i+1).
5391 i += 1;
5392 let updated_n = { node_arc.read().get_n_entries() };
5393 if i + 1 >= updated_n {
5394 break;
5395 }
5396 }
5397
5398 if !merged_any {
5399 break;
5400 }
5401 }
5402 }
5403
5404 // ========================================================================
5405 // BIN slot compression
5406 // ========================================================================
5407
5408 /// Compress deleted slots from a BIN node, then prune it from its parent
5409 /// IN when it becomes empty.
5410 ///
5411 /// (the in-place slot-removal
5412 /// path, NOT the sibling-merge path handled by `compress()`).
5413 ///
5414 /// # Algorithm
5415 ///
5416 /// 1. If the BIN is a delta, skip — deltas cannot be compressed.
5417 /// 2. Remove all slots where `entry.known_deleted` is true. This mirrors
5418 /// `bin.compress(!bin.shouldLogDelta(), localTracker)`.
5419 /// 3. If the BIN is now empty, remove it from its parent IN. This mirrors
5420 /// `pruneBIN(db, binRef, idKey)` → `tree.delete(idKey)`.
5421 ///
5422 /// # Arguments
5423 ///
5424 /// * `bin_arc` — the BIN to compress (must be a `TreeNode::Bottom`).
5425 ///
5426 /// # Returns
5427 ///
5428 /// `true` if compression made progress (slots were removed or the BIN was
5429 /// pruned), `false` if the BIN was skipped (delta, no cursors issue, etc.).
5430 pub fn compress_bin(&self, bin_arc: &Arc<RwLock<TreeNode>>) -> bool {
5431 self.compress_bin_with_lock_check(bin_arc, None)
5432 }
5433
5434 /// Like [`compress_bin`](Self::compress_bin), but consults a caller-supplied
5435 /// `is_locked` predicate before physically removing each `known_deleted`
5436 /// slot. If `is_locked(slot_lsn)` returns `true`, the slot is SKIPPED
5437 /// (left for a later compression pass after the locking txn resolves).
5438 ///
5439 /// This is the faithful port of JE `BIN.compress` (BIN.java:1141-1172):
5440 ///
5441 /// > We have to be able to lock the LN before we can compress the entry.
5442 /// > If we can't, then skip over it. ... it is more efficient to call
5443 /// > `isLockUncontended` than to actually lock the LN, since we would
5444 /// > release the lock immediately.
5445 ///
5446 /// ```text
5447 /// if (lsn != DbLsn.NULL_LSN &&
5448 /// !lockManager.isLockUncontended(lsn)) {
5449 /// anyLocked = true;
5450 /// continue;
5451 /// }
5452 /// ```
5453 ///
5454 /// JE's `isLockUncontended(lsn)` (LockManager.java:692) returns
5455 /// `nWaiters() == 0 && nOwners() == 0`. Our `is_locked(lsn)` is its
5456 /// inverse: the dbi layer supplies a closure over the `LockManager` that
5457 /// returns `true` iff the slot's LSN has any owner or waiter
5458 /// (`LockManager::get_lock_info(lsn) != (0, 0)`). A `NULL_LSN` slot is
5459 /// always discardable without locking (JE: "Can discard a NULL_LSN entry
5460 /// without locking"), so we never invoke the predicate for it.
5461 ///
5462 /// # Layering (noxu-tree -/-> noxu-txn)
5463 ///
5464 /// The predicate is a `&dyn Fn(u64) -> bool`, NOT a `LockManager`
5465 /// reference, so noxu-tree never depends on noxu-txn. The lock knowledge
5466 /// lives entirely in the dbi-supplied closure.
5467 ///
5468 /// # Lock ordering (no deadlock)
5469 ///
5470 /// `is_locked` is invoked while this method holds the **BIN write latch**.
5471 /// The dbi closure calls `LockManager::get_lock_info`, which takes a
5472 /// lock-table *shard* mutex for a single, non-blocking critical section
5473 /// and releases it before returning — it never waits and never re-enters
5474 /// the tree. The LockManager has no edge back into a BIN latch (lock
5475 /// acquisition descends the tree from the dbi/cursor layer, never the
5476 /// reverse). The only ordering is therefore BIN-latch -> shard-mutex,
5477 /// which is acyclic; no lock cycle exists, so the predicate cannot
5478 /// deadlock against the latch.
5479 ///
5480 /// When `is_locked` is `None` (recovery, BIN-delta replay, unit tests with
5481 /// no lock manager) behavior is identical to the historical
5482 /// `compress_bin`: every `known_deleted` slot is removed.
5483 pub fn compress_bin_with_lock_check(
5484 &self,
5485 bin_arc: &Arc<RwLock<TreeNode>>,
5486 is_locked: Option<&dyn Fn(u64) -> bool>,
5487 ) -> bool {
5488 // ---- Step 1: collect metadata without holding the write lock ----
5489 let (is_delta, n_entries, id_key) = {
5490 {
5491 let g = bin_arc.read();
5492 match &*g {
5493 TreeNode::Bottom(b) => {
5494 // Identifier key = first full key in the BIN
5495 // (the: bin.getIdentifierKey()).
5496 let id_key = b.get_full_key(0);
5497 (b.is_delta, b.entries.len(), id_key)
5498 }
5499 _ => return false, // not a BIN
5500 }
5501 }
5502 };
5503
5504 // If (bin.isBINDelta()) return; — deltas cannot be compressed.
5505 if is_delta {
5506 return false;
5507 }
5508
5509 // ---- Step 2: remove known-deleted slots) ----
5510 // We compress dirty slots too (compress_dirty_slots = true) because
5511 // we are not writing a BIN-delta here.
5512 let removed_any = {
5513 {
5514 let mut g = bin_arc.write();
5515 match &mut *g {
5516 TreeNode::Bottom(b) => {
5517 let before = b.entries.len();
5518 // BIN.compress(): walk backwards to remove
5519 // deleted slots without index confusion.
5520 //
5521 // IC-3 — JE `BIN.compress` (BIN.java:1141-1172) does
5522 // NOT compress a slot it cannot lock: "We have to be
5523 // able to lock the LN before we can compress the
5524 // entry. If we can't, then skip over it." JE calls
5525 // `lockManager.isLockUncontended(lsn)` and, on a
5526 // contended slot, does `anyLocked = true; continue;`.
5527 // We mirror that here via the optional `is_locked`
5528 // predicate (supplied by the dbi layer, closing over
5529 // the LockManager — see
5530 // `compress_bin_with_lock_check`). This removes the
5531 // previously fragile implicit invariant ("no code path
5532 // ever tombstones a slot before its txn commits"):
5533 // even if a future write path leaves an uncommitted,
5534 // write-locked `known_deleted` tombstone in a BinStub,
5535 // the predicate keeps the compressor from physically
5536 // removing a slot a live txn still references.
5537 //
5538 // When `is_locked` is `None` (recovery / BIN-delta
5539 // replay / lock-manager-less tests) behavior is
5540 // unchanged: every `known_deleted` slot is removed,
5541 // matching the historical safe-by-invariant path.
5542 let mut j = b.entries.len();
5543 while j > 0 {
5544 j -= 1;
5545 if b.entries[j].known_deleted {
5546 // IC-3 lock check (JE BIN.compress). A
5547 // NULL_LSN slot is always discardable without
5548 // locking (JE: "Can discard a NULL_LSN entry
5549 // without locking"), so we only consult the
5550 // predicate for a non-null LSN.
5551 if let Some(is_locked) = is_locked {
5552 let slot_lsn = b.get_lsn(j);
5553 if !slot_lsn.is_null()
5554 && is_locked(slot_lsn.as_u64())
5555 {
5556 // Slot still write-locked by an
5557 // in-flight txn — leave it for a later
5558 // pass (JE: anyLocked = true; continue).
5559 continue;
5560 }
5561 }
5562 // JE `IN.deleteEntry` (IN.java:3466): removing a
5563 // DIRTY slot must prohibit the next delta — a
5564 // delta only carries dirty slots, so the removal
5565 // would otherwise be silently lost. Force a
5566 // full BIN on the next log.
5567 if b.entries[j].dirty {
5568 b.prohibit_next_delta = true;
5569 }
5570 b.entries.remove(j);
5571 b.keys.remove(j); // T-2
5572 b.lsn_rep.remove_shift(j); // T-3
5573 b.dirty = true;
5574 }
5575 }
5576 // Recompute prefix after slot removal, since the
5577 // remaining keys may share a longer common prefix.
5578 // After compress(), call recalcKeyPrefix().
5579 if b.entries.len() >= 2 {
5580 b.recompute_key_prefix();
5581 } else if b.entries.len() < 2 {
5582 b.key_prefix = Vec::new();
5583 }
5584 b.entries.len() < before
5585 }
5586 _ => false,
5587 }
5588 }
5589 };
5590
5591 // ---- Step 3: prune empty BIN from parent ----
5592 // If (empty) pruneBIN(db, binRef, idKey) → tree.delete(idKey).
5593 // We only prune when the BIN is actually empty after compression.
5594 let now_empty = { bin_arc.read().get_n_entries() == 0 };
5595
5596 if now_empty {
5597 // pruneBIN re-descends to the SPECIFIC empty BIN and removes its
5598 // parent-IN slot ONLY IF the BIN is still empty (and has no
5599 // cursors and is not a delta) UNDER THE PARENT LATCH.
5600 //
5601 // We must NOT use `self.delete(&id_key)` here (IC-1): that
5602 // re-descends by key and removes whatever live entry now matches
5603 // `id_key`. Between reading `now_empty` (a fresh read lock taken
5604 // after the compression write lock was dropped) and acting on it,
5605 // a concurrent insert can repopulate this BIN; `self.delete` would
5606 // then drop a LIVE entry — tree corruption / lost write.
5607 //
5608 // JE `INCompressor.pruneBIN` (INCompressor.java ~line 502-510)
5609 // calls `tree.delete(idKey)`, and JE `Tree.delete` /
5610 // `searchDeletableSubTree` (Tree.java ~line 755-800) re-validates
5611 // `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort) and
5612 // `bin.nCursors() > 0` → CURSORS_EXIST (abort) while holding the
5613 // parent (branch) latch. `prune_empty_bin` reproduces exactly
5614 // that re-validation. See `prune_empty_bin` below.
5615 //
5616 // Note: we only attempt the prune if n_entries was > 0 before
5617 // compression (an already-empty BIN we never populated is left
5618 // alone, matching the pre-existing guard).
5619 if let Some(key) = id_key
5620 && n_entries > 0
5621 {
5622 self.prune_empty_bin(&key);
5623 }
5624 return true;
5625 }
5626
5627 removed_any
5628 }
5629
5630 /// Re-descend to the leaf BIN that should contain `id_key` and remove its
5631 /// parent-IN child slot ONLY IF the BIN is still safe to prune.
5632 ///
5633 /// This is the faithful port of JE `Tree.delete(idKey)` /
5634 /// `Tree.searchDeletableSubTree` (Tree.java ~line 755-800) as invoked by
5635 /// `INCompressor.pruneBIN` (INCompressor.java ~line 502-510). JE takes the
5636 /// branch-parent latch, re-descends to the specific empty BIN, and aborts
5637 /// the prune (removing NOTHING) if any of the following changed since the
5638 /// compressor observed the BIN as empty:
5639 ///
5640 /// * `bin.getNEntries() != 0` → `NodeNotEmptyException` (a concurrent
5641 /// insert repopulated the BIN — IC-1: we must NOT delete a live entry).
5642 /// * `bin.isBINDelta()` → `unexpectedState` (deltas are never empty).
5643 /// * `bin.nCursors() > 0` → `CursorsExistException` (a cursor is parked
5644 /// on the empty BIN; requeue rather than orphan the cursor).
5645 ///
5646 /// The re-check and the slot removal both happen while holding the
5647 /// **parent IN write latch**. Holding the parent write latch blocks every
5648 /// descender (insert / delete take `parent.read()` hand-over-hand), so a
5649 /// concurrent insert cannot reach the BIN between our re-check and the
5650 /// slot removal — the TOCTOU window IC-1 describes is closed.
5651 ///
5652 /// Returns `true` iff a parent-IN slot was removed, `false` otherwise
5653 /// (BIN repopulated, has a cursor, is a delta, vanished, or is the root —
5654 /// in every `false` case NOTHING is removed).
5655 pub fn prune_empty_bin(&self, id_key: &[u8]) -> bool {
5656 let root = match self.get_root() {
5657 Some(r) => r,
5658 None => return false,
5659 };
5660
5661 // If the root itself is the BIN (single-BIN tree) there is no parent
5662 // IN to remove a slot from. JE's searchDeletableSubTree returns null
5663 // ("the entire tree is empty") and keeps the root BIN; we do the same.
5664 if root.read().is_bin() {
5665 return false;
5666 }
5667
5668 // Descend by id_key tracking the IN that is the *parent of the leaf
5669 // BIN* and the child index within it. Hand-over-hand read coupling
5670 // keeps the descent consistent with concurrent splits, exactly like
5671 // `get_parent_bin_for_child_ln`.
5672 let (parent_arc, child_index) = {
5673 let mut parent_arc: Arc<RwLock<TreeNode>> = root.clone();
5674 let mut guard: parking_lot::ArcRwLockReadGuard<
5675 parking_lot::RawRwLock,
5676 TreeNode,
5677 > = root.read_arc();
5678 loop {
5679 let (next_arc, idx) = match &*guard {
5680 TreeNode::Internal(n) => {
5681 if n.entries.is_empty() {
5682 return false;
5683 }
5684 let idx = self.upper_in_floor_index(&n.entries, id_key);
5685 match n.get_child(idx) {
5686 Some(c) => (c, idx),
5687 None => return false,
5688 }
5689 }
5690 TreeNode::Bottom(_) => {
5691 unreachable!("is_bin checked before / below")
5692 }
5693 };
5694 // Is the next node the leaf BIN? If so, `guard`'s node is the
5695 // parent IN we want and `idx` is the child slot.
5696 if next_arc.read().is_bin() {
5697 drop(guard);
5698 break (parent_arc, idx);
5699 }
5700 let next_guard = next_arc.read_arc();
5701 drop(guard);
5702 parent_arc = next_arc;
5703 guard = next_guard;
5704 }
5705 };
5706
5707 // ---- Re-validate and remove the slot UNDER THE PARENT WRITE LATCH ----
5708 // Holding parent.write() excludes all descenders (they need
5709 // parent.read()), so the BIN cannot be repopulated between the
5710 // re-check and the slot removal.
5711 let mut parent_guard = parent_arc.write();
5712 let pruned_bin_id;
5713 let removed_key_len = match &mut *parent_guard {
5714 TreeNode::Internal(p) => {
5715 let child = match p.get_child(child_index) {
5716 Some(c) => c,
5717 None => return false, // slot already vacated / invalid
5718 };
5719 // Re-validate the child BIN under the parent latch.
5720 {
5721 let cg = child.read();
5722 match &*cg {
5723 TreeNode::Bottom(b) => {
5724 // JE: bin.getNEntries() != 0 → NODE_NOT_EMPTY (abort).
5725 if !b.entries.is_empty() {
5726 return false;
5727 }
5728 // JE: bin.isBINDelta() → unexpectedState (abort).
5729 if b.is_delta {
5730 return false;
5731 }
5732 // JE: bin.nCursors() > 0 → CURSORS_EXIST (abort).
5733 if b.cursor_count > 0 {
5734 return false;
5735 }
5736 pruned_bin_id = b.node_id;
5737 }
5738 // A concurrent split could in principle have replaced
5739 // the child with an IN; never prune in that case.
5740 TreeNode::Internal(_) => return false,
5741 }
5742 }
5743 // Safe to prune: remove the BIN's slot from the parent IN.
5744 // Mirrors the parent-slot removal `Tree.delete` performs for
5745 // an empty BIN (Tree.java deleteEntry under the branch latch).
5746 // T-4: remove_entry shifts the node-level child array too.
5747 let removed = p.remove_entry(child_index);
5748 p.dirty = true;
5749 removed.key.len()
5750 }
5751 TreeNode::Bottom(_) => return false,
5752 };
5753 drop(parent_guard);
5754
5755 // JE: removing the BIN slot detaches the BIN from the tree; the
5756 // evictor must drop it from its LRU lists (Evictor.remove).
5757 self.note_removed(pruned_bin_id);
5758
5759 // Preserve the memory-counter bookkeeping that `self.delete` performed
5760 // (IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage).
5761 // The pruned slot's key plus the fixed per-entry overhead matches the
5762 // `delete` accounting (key.len() + BIN_ENTRY_OVERHEAD).
5763 if let Some(counter) = &self.memory_counter {
5764 let delta = (removed_key_len + BIN_ENTRY_OVERHEAD) as i64;
5765 counter.fetch_sub(delta, Ordering::Relaxed);
5766 }
5767
5768 true
5769 }
5770
5771 /// Detach the resident child node `node_id` from its parent IN, dropping
5772 /// the strong `Arc` so the node is actually freed from memory, and return
5773 /// the heap bytes reclaimed (0 if not found / not detachable).
5774 ///
5775 /// This is the faithful port of JE `IN.detachNode(idx, updateLsn, newLsn)`
5776 /// (IN.java ~4019) as called from `Evictor.evict` (Evictor.java ~3035):
5777 /// `evict` measures `target.getBudgetedMemorySize()` and then
5778 /// `parent.detachNode(index, ...)` does `setTarget(idx, null)` to drop the
5779 /// child reference and `getInMemoryINs().remove(child)` to drop it from
5780 /// the INList.
5781 ///
5782 /// EV-13: before this method existed, the evictor credited
5783 /// `node_size_fn(node_id)` bytes back to the budget and removed the node
5784 /// from the LRU lists, but the parent's `InEntry.child` still held a
5785 /// strong `Arc` — so the node was never dropped from the heap. The budget
5786 /// over-credited (claimed bytes freed that were not), `cache_usage`
5787 /// drifted below reality, and the evictor under-fired. Detaching here
5788 /// drops the `Arc` for real and credits exactly the measured size.
5789 ///
5790 /// The detach happens **under the parent IN write latch** (JE detaches
5791 /// under the parent's latch), so no concurrent descender can re-cache the
5792 /// child between measurement and detach. The slot (key + LSN) is kept —
5793 /// only the in-memory `child` target is cleared — matching JE's
5794 /// `setTarget(idx, null)` which leaves the `ChildReference` LSN intact so
5795 /// the node can be re-fetched from the log later.
5796 ///
5797 /// Returns `0` if the node is not a resident child of any IN (e.g. it is
5798 /// the root, already detached, or was pinned and could not be latched).
5799 pub fn detach_node_by_id(&self, node_id: u64) -> u64 {
5800 let root = match self.get_root() {
5801 Some(r) => r,
5802 None => return 0,
5803 };
5804
5805 // The root has no parent IN to detach from (JE evicts the root via a
5806 // separate evictRoot path; we keep the root resident here).
5807 let root_id = {
5808 let g = root.read();
5809 match &*g {
5810 TreeNode::Internal(n) => n.node_id,
5811 TreeNode::Bottom(b) => b.node_id,
5812 }
5813 };
5814 if root_id == node_id {
5815 return 0;
5816 }
5817
5818 // Locate the parent IN and the child slot index.
5819 let (parent_arc, child_index) =
5820 match Self::find_parent_of_node_id(&root, node_id) {
5821 Some(p) => p,
5822 None => return 0,
5823 };
5824
5825 // ---- Measure + detach UNDER THE PARENT WRITE LATCH ----
5826 // Holding parent.write() excludes all descenders (they take
5827 // parent.read() hand-over-hand), so the child cannot be re-cached or
5828 // re-pinned between the measurement and the detach. Mirrors JE
5829 // detachNode running under the parent latch held by Evictor.evict.
5830 let mut parent_guard = parent_arc.write();
5831 let TreeNode::Internal(p) = &mut *parent_guard else {
5832 return 0; // parent is not an IN (concurrent restructure)
5833 };
5834 if child_index >= p.entries.len() {
5835 return 0;
5836 }
5837 // T-4: detach the cached child via the node-level INTargetRep, leaving
5838 // the slot's key/LSN intact for re-fetch (JE IN.setTarget(idx, null)).
5839 let child = match p.take_child(child_index) {
5840 Some(c) => c, // child Arc removed from the slot
5841 None => return 0, // already detached
5842 };
5843
5844 // Measure the child's real heap footprint while we still hold it.
5845 // JE: long evictedBytes = target.getBudgetedMemorySize().
5846 let freed = child.read().budgeted_memory_size();
5847
5848 // EV-14 re-fetch correctness: the parent slot LSN must point at the
5849 // child's CURRENT on-disk version so `child_at_or_fetch` re-reads the
5850 // right bytes (JE `IN.updateEntry(idx, newLsn)` is called whenever a
5851 // child is logged; the parent slot LSN tracks the child's LSN). The
5852 // evictor only fully evicts/detaches a CLEAN BIN (it logs+clears dirty
5853 // BINs via flush_dirty_node_to_log first, which sets `last_full_lsn`),
5854 // so the child's authoritative LSN is its `last_full_lsn`. Stamp it
5855 // into the parent slot before dropping the child; if it is null (the
5856 // child was never logged) leave the existing slot LSN intact rather
5857 // than writing a null — a never-logged clean child cannot occur on
5858 // the evict path, but be conservative.
5859 let child_full_lsn = match &*child.read() {
5860 TreeNode::Bottom(b) => b.last_full_lsn,
5861 TreeNode::Internal(_) => NULL_LSN,
5862 };
5863 if child_full_lsn != NULL_LSN {
5864 p.set_lsn(child_index, child_full_lsn);
5865 }
5866
5867 // Mark the parent dirty: the slot's in-memory target changed (JE
5868 // detachNode sets dirty when updateLsn; we conservatively mark dirty
5869 // so the parent is re-logged with the now-non-resident slot).
5870 p.dirty = true;
5871
5872 // Drop the strong Arc explicitly so the node is freed now (the slot's
5873 // `child` is already None). If any other resident path still held a
5874 // strong reference this would not free — but the tree is the sole
5875 // strong owner of a cached child, so this drops the last strong ref.
5876 drop(parent_guard);
5877 drop(child);
5878
5879 // JE: getInMemoryINs().remove(child) — drop it from the evictor LRU.
5880 self.note_removed(node_id);
5881
5882 // NOTE: the live tree-memory counter (`memory_counter`) is the SAME
5883 // `Arc<AtomicI64>` the evictor's Arbiter uses as `cache_usage`. The
5884 // evictor decrements it once via `Arbiter::release_memory(bytes)` for
5885 // the full eviction batch, so detach must NOT decrement here too —
5886 // that would double-credit and drive `cache_usage` below reality
5887 // (the very drift EV-13 fixes, in the other direction). We only
5888 // measure-and-free; the caller does the single counter update.
5889 freed
5890 }
5891
5892 /// Evict the root IN of this tree (EV-14).
5893 ///
5894 /// Faithful port of JE `Evictor.evictRoot` (Evictor.java:3050-3110) plus
5895 /// the `RootEvictor.doWork` + `Tree.withRootLatchedExclusive` framing
5896 /// (Evictor.java:2529-2576, Tree.java:508-517). Unlike a normal IN, the
5897 /// root has no parent slot to detach from; instead the *tree's* root
5898 /// reference is the equivalent of the `RootChildReference`, so eviction:
5899 ///
5900 /// 1. Latches the root reference exclusively (`rootLatch.acquireExclusive`
5901 /// via `withRootLatchedExclusive`).
5902 /// 2. Re-checks that the root is still resident and still evictable
5903 /// (no resident children, no pinned BIN — JE `RootEvictor.doWork`
5904 /// re-latches and re-checks `rootIN == target && rootIN.isRoot()`).
5905 /// 3. If the root is dirty, LOGS it first so the on-disk version is
5906 /// current and updates `root_log_lsn` to the new LSN (JE
5907 /// `evictRoot`: `long newLsn = target.log(...); rootRef.setLsn(newLsn)`).
5908 /// 4. Clears the in-memory root (`rootRef.clearTarget()` — JE leaves the
5909 /// `ChildReference` LSN intact; here `root_log_lsn` is that LSN) and
5910 /// `note_removed`s it from the evictor LRU (JE `inList.remove(target)`).
5911 ///
5912 /// On the next access `fetch_root_from_log` re-materializes the root from
5913 /// `root_log_lsn` (JE `Tree.getRootINRootAlreadyLatched` →
5914 /// `root.fetchTarget`).
5915 ///
5916 /// # Conditions (eviction is REFUSED, returning `None`, when)
5917 ///
5918 /// * there is no log manager wired (the root could never be re-fetched),
5919 /// * the tree has no resident root (already evicted),
5920 /// * the root has any resident child (JE only evicts a childless root —
5921 /// the `hasCachedChildren` skip in `processTarget`; a root with cached
5922 /// children would orphan them, the EV-6 invariant),
5923 /// * the root is a BIN pinned by a cursor (`cursor_count > 0`),
5924 /// * the root is dirty but we have no clean persisted version AND logging
5925 /// it fails, or
5926 /// * the root is clean but `root_log_lsn` is null (never logged — cannot
5927 /// be re-fetched; happens only for a brand-new unlogged tree).
5928 ///
5929 /// Returns `Some((freed_bytes, was_dirty))` on success, where `freed_bytes`
5930 /// is the root's measured heap footprint (JE
5931 /// `target.getBudgetedMemorySize()`) and `was_dirty` reports whether the
5932 /// root had to be logged (JE `rootEvictor.flushed`, which drives
5933 /// `nDirtyNodesEvicted` and `modifyDbRoot`).
5934 pub fn evict_root(&self, db_id: u64) -> Option<(u64, bool)> {
5935 // A root with no re-fetch path must never be made non-resident.
5936 self.log_manager.as_ref()?;
5937
5938 // JE `Tree.withRootLatchedExclusive(rootEvictor)`: hold the root latch
5939 // exclusively across the whole evict so no descender or splitter can
5940 // observe/install a half-evicted root. Acquiring `self.root.write()`
5941 // is the Noxu equivalent (it is the lock guarding the root pointer).
5942 let mut root_slot = self.root.write();
5943 let root_arc = root_slot.as_ref()?.clone();
5944
5945 // JE `RootEvictor.doWork`: re-latch the target and re-check the
5946 // conditions. We hold the node guard for the duration.
5947 let node_guard = root_arc.write();
5948
5949 // EV-6 / JE `processTarget` hasCachedChildren skip: a root with any
5950 // resident child must NOT be evicted (it would orphan the child).
5951 // EV-14 only evicts an *idle* root whose children are already
5952 // non-resident (or which is itself a leaf BIN).
5953 let (node_id, was_dirty, freed) = match &*node_guard {
5954 TreeNode::Internal(n) => {
5955 if !n.resident_children().is_empty() {
5956 return None; // has cached children — keep resident
5957 }
5958 (n.node_id, n.dirty, node_guard.budgeted_memory_size())
5959 }
5960 TreeNode::Bottom(b) => {
5961 if b.cursor_count > 0 {
5962 return None; // pinned by a cursor — keep resident
5963 }
5964 (
5965 b.node_id,
5966 b.dirty || b.dirty_count() > 0,
5967 node_guard.budgeted_memory_size(),
5968 )
5969 }
5970 };
5971
5972 // If dirty, log the root first so the on-disk version is current,
5973 // then record the new LSN as the root's re-fetch point (JE
5974 // `evictRoot`: target.log(...) + rootRef.setLsn(newLsn)).
5975 if was_dirty {
5976 let lm = self.log_manager.as_ref()?; // checked above; re-borrow
5977 let node_bytes = node_guard.write_to_bytes();
5978 let is_bin = node_guard.is_bin();
5979 let entry = noxu_log::entry::in_log_entry::InLogEntry::new(
5980 db_id, NULL_LSN, // prev_full_lsn
5981 NULL_LSN, // prev_delta_lsn
5982 node_bytes,
5983 );
5984 let mut buf = bytes::BytesMut::with_capacity(entry.log_size());
5985 entry.write_to_log(&mut buf);
5986 let entry_type = if is_bin {
5987 noxu_log::LogEntryType::BIN
5988 } else {
5989 noxu_log::LogEntryType::IN
5990 };
5991 // flush_required = true so the root's bytes are durable before we
5992 // drop the in-memory copy (JE logs synchronously in evictRoot).
5993 let new_lsn = match lm.log(
5994 entry_type,
5995 &buf,
5996 noxu_log::Provisional::No,
5997 true, // flush_required
5998 false, // fsync at next checkpoint
5999 ) {
6000 Ok(l) => l,
6001 Err(_) => return None, // could not log — keep the root resident
6002 };
6003 *self.root_log_lsn.write() = new_lsn;
6004 } else {
6005 // Clean root: it must already be re-fetchable. If it was never
6006 // logged (root_log_lsn null) we cannot evict it safely.
6007 if *self.root_log_lsn.read() == NULL_LSN {
6008 return None;
6009 }
6010 }
6011
6012 // JE `rootRef.clearTarget()` + `inList.remove(target)`: drop the
6013 // in-memory root and remove it from the evictor LRU. The root_log_lsn
6014 // is the surviving `ChildReference` LSN used to re-fetch it.
6015 drop(node_guard);
6016 *root_slot = None;
6017 drop(root_slot);
6018 self.note_removed(node_id);
6019
6020 Some((freed, was_dirty))
6021 }
6022
6023 /// Re-materialize an evicted root IN from its persisted `root_log_lsn`
6024 /// (EV-14, piece B).
6025 /// Faithful to JE `Tree.getRootINRootAlreadyLatched` (Tree.java:477-516)
6026 /// which calls `root.fetchTarget(database, null)` when the in-memory
6027 /// target is null. Idempotent and cheap when the root is already
6028 /// resident: returns the resident root without touching the log.
6029 ///
6030 /// Returns `None` only when the tree is genuinely empty (no resident root
6031 /// AND `root_log_lsn` is null) or when the re-fetch fails (no log manager,
6032 /// log read error, deserialize failure) — callers then see an empty tree,
6033 /// never wrong data.
6034 pub fn fetch_root_from_log(&self) -> Option<Arc<RwLock<TreeNode>>> {
6035 // Fast path: root already resident.
6036 if let Some(r) = self.root.read().clone() {
6037 return Some(r);
6038 }
6039 // Take the write lock and re-check (another thread may have re-fetched
6040 // it while we waited — JE upgrades the root latch the same way).
6041 let mut root_slot = self.root.write();
6042 if let Some(r) = root_slot.as_ref() {
6043 return Some(r.clone());
6044 }
6045 let log_lsn = *self.root_log_lsn.read();
6046 let node = self.fetch_node_from_log(log_lsn)?;
6047 let node_id = node.node_id();
6048 let arc = Arc::new(RwLock::new(node));
6049 *root_slot = Some(arc.clone());
6050 drop(root_slot);
6051 // JE: a fetched IN is added back to the INList (Evictor LRU).
6052 self.note_added(node_id);
6053 Some(arc)
6054 }
6055
6056 /// Return the resident child Arc for slot `idx` of `parent_arc`, fetching
6057 /// it from its slot LSN and installing it if it is not resident (EV-14 /
6058 /// EV-13 re-fetch on descent).
6059 ///
6060 /// Faithful to JE `ChildReference.fetchTarget` (and `IN.fetchTarget`):
6061 /// when a slot's in-memory target is null but its LSN is valid, the node
6062 /// is read back from the log and cached in the slot. Installing the
6063 /// fetched child requires the parent EX-latch, so this takes the parent
6064 /// write lock; the fast path (child already resident) takes only a read
6065 /// lock.
6066 ///
6067 /// Returns `None` only when the slot index is out of range, the slot has
6068 /// no valid LSN, or the log read/deserialize fails — callers then treat
6069 /// the descent as terminating in an empty subtree, never wrong data.
6070 fn child_at_or_fetch(
6071 &self,
6072 parent_arc: &Arc<RwLock<TreeNode>>,
6073 idx: usize,
6074 ) -> Option<ChildArc> {
6075 // Fast path: child already cached (read lock only).
6076 {
6077 let g = parent_arc.read();
6078 if let TreeNode::Internal(n) = &*g {
6079 if let Some(c) = n.get_child(idx) {
6080 return Some(c);
6081 }
6082 } else {
6083 return None; // BINs have no IN children
6084 }
6085 }
6086 // Slow path: fetch the child from its slot LSN under the parent
6087 // EX-latch (JE installs the fetched target under the IN latch).
6088 let mut g = parent_arc.write();
6089 let TreeNode::Internal(n) = &mut *g else {
6090 return None;
6091 };
6092 // Re-check: another thread may have fetched it while we upgraded.
6093 if let Some(c) = n.get_child(idx) {
6094 return Some(c);
6095 }
6096 if idx >= n.entries.len() {
6097 return None;
6098 }
6099 let child_lsn = n.get_lsn(idx);
6100 let node = self.fetch_node_from_log(child_lsn)?;
6101 let node_id = node.node_id();
6102 let arc: ChildArc = Arc::new(RwLock::new(node));
6103 n.set_child(idx, Some(arc.clone()));
6104 drop(g);
6105 // JE: a fetched IN is added back to the INList (Evictor LRU).
6106 self.note_added(node_id);
6107 Some(arc)
6108 }
6109
6110 /// Check whether a BIN node is a candidate for slot compression and,
6111 /// if so, trigger `compress_bin`.
6112 ///
6113 /// from (the opportunistic / lazy compression path).
6114 ///
6115 /// # Algorithm
6116 ///
6117 /// 1. Skip the BIN if it is a delta or has no defunct (known-deleted) slots.
6118 /// 2. If compression succeeds and the BIN becomes empty, it is pruned.
6119 ///
6120 /// # Returns
6121 ///
6122 /// `true` if compression was triggered (regardless of whether any slots
6123 /// were actually removed), `false` if the BIN does not need compression.
6124 pub fn maybe_compress_bin_and_parent(
6125 &self,
6126 bin_arc: &Arc<RwLock<TreeNode>>,
6127 ) -> bool {
6128 // Check whether the BIN has any deleted slots worth compressing.
6129 // lazyCompress: skip deltas and BINs with no defunct slots.
6130 let should_compress = {
6131 {
6132 let g = bin_arc.read();
6133 match &*g {
6134 TreeNode::Bottom(b) => {
6135 // Skip deltas (the: !in.isBIN() || in.isBINDelta()).
6136 if b.is_delta {
6137 false
6138 } else {
6139 // Check for any known-deleted slot
6140 // (the: for (int i=0; i < bin.getNEntries(); i++) {
6141 // if (bin.isDefunct(i)) { ... break; }
6142 // }).
6143 b.entries.iter().any(|e| e.known_deleted)
6144 }
6145 }
6146 _ => false,
6147 }
6148 }
6149 };
6150
6151 if !should_compress {
6152 return false;
6153 }
6154
6155 self.compress_bin(bin_arc)
6156 }
6157
6158 // ========================================================================
6159 // Latch-coupling validation
6160 // ========================================================================
6161
6162 /// Validate that `parent.entries[child_index].child` still points at
6163 /// `child_arc` after acquiring the child's latch.
6164 ///
6165 /// Re-latch validation step inside the
6166 /// `Tree.searchSplitsAllowed`: after a concurrent split the parent
6167 /// slot that previously held the child may have changed. Callers that
6168 /// plan to mutate the child must verify the parent-child link is still
6169 /// intact before proceeding.
6170 ///
6171 /// Returns `true` if the parent-child link is intact.
6172 pub fn validate_parent_child(
6173 parent: &Arc<RwLock<TreeNode>>,
6174 child_index: usize,
6175 child_arc: &Arc<RwLock<TreeNode>>,
6176 ) -> bool {
6177 let g = parent.read();
6178 match &*g {
6179 TreeNode::Internal(p) => match p.child_ref(child_index) {
6180 Some(stored) => Arc::ptr_eq(stored, child_arc),
6181 None => false,
6182 },
6183 TreeNode::Bottom(_) => false,
6184 }
6185 }
6186
/// Search for the BIN that should contain `key`, descending with
/// hand-over-hand read latches.
///
/// The original documentation describes this as equivalent to
/// [`Tree::search`], kept as a public API for callers (at the time of
/// writing, tests) that want to compare the two descents.
///
/// # Descent
///
/// At every internal node the child slot is chosen with
/// `upper_in_floor_index`, then:
///
/// - **Child resident:** the child's read guard is taken *before* the
///   parent's guard is dropped, so a writer that needs `parent.write()`
///   cannot run between reading the slot and entering the child.
/// - **Child not resident:** the parent read guard is dropped first and
///   the child is obtained through `child_at_or_fetch`, which takes the
///   parent's write lock itself. The child's read guard is acquired
///   after that call returns, so on this path the parent is not latched
///   while the child is being entered.
///
/// This method does not call `validate_parent_child` and has no
/// restart-from-root loop.
///
/// # Returns
///
/// - `Some(SearchResult)` once a BIN is reached. `found` is set when the
///   value returned by `find_entry` is non-negative and has the
///   `EXACT_MATCH` bit set; the reported index is that value masked with
///   `0xFFFF`.
/// - `None` if there is no root, an internal node on the path has no
///   entries, or `child_at_or_fetch` cannot produce the child.
pub fn search_with_coupling(&self, key: &[u8]) -> Option<SearchResult> {
    let root = self.get_root()?;
    let mut guard: parking_lot::ArcRwLockReadGuard<
        parking_lot::RawRwLock,
        TreeNode,
    > = root.read_arc();

    loop {
        if guard.is_bin() {
            let index = guard.find_entry(key, true, true);
            let found = index >= 0 && (index & EXACT_MATCH != 0);
            return Some(SearchResult::with_values(
                found,
                index & 0xFFFF,
                false,
            ));
        }

        // Keep an owned handle to the node we are on; it is needed after
        // `guard` is dropped on the non-resident path below.
        let parent_arc =
            parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
        let next_idx = match &*guard {
            TreeNode::Internal(n) => {
                if n.entries.is_empty() {
                    return None;
                }
                let idx = self.upper_in_floor_index(&n.entries, key);
                match n.get_child(idx) {
                    Some(c) => {
                        // Hand-over-hand: latch the child while the
                        // parent is still read-latched, then release
                        // the parent.
                        let next_guard = c.read_arc();
                        drop(guard);
                        guard = next_guard;
                        continue;
                    }
                    None => idx, // EV-14/EV-13: re-fetch below.
                }
            }
            TreeNode::Bottom(_) => {
                unreachable!("is_bin() returned false above")
            }
        };
        // Non-resident child: the parent read guard must be released
        // before calling `child_at_or_fetch`, because that helper takes
        // the parent's write lock. The child is therefore latched only
        // after the parent has been unlatched.
        // NOTE(review): `next_idx` was computed under the guard dropped
        // here; confirm a concurrent change to the parent in that window
        // cannot make the index stale.
        drop(guard);
        let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
        guard = child.read_arc();
    }
}
6263
6264 // ========================================================================
6265 // BIN-Delta reconstitution
6266 // ========================================================================
6267
6268 /// Increments the cursor-pin count on a BIN node.
6269 ///
6270 /// Called by `CursorImpl` when it positions on (or enters) a BIN.
6271 /// The evictor will not select a BIN with `cursor_count > 0` for eviction
6272 /// (`RealNodeInfo.pin_count`), matching `BIN.incrementCursorCount()`.
6273 pub fn pin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6274 let mut guard = bin_arc.write();
6275 if let TreeNode::Bottom(ref mut stub) = *guard {
6276 stub.cursor_count += 1;
6277 }
6278 }
6279
6280 /// Decrements the cursor-pin count on a BIN node.
6281 ///
6282 /// Called by `CursorImpl` when it moves away from or closes on a BIN.
6283 /// Uses `saturating_sub` to guard against an accidental double-unpin.
6284 /// Matching `BIN.decrementCursorCount()`.
6285 pub fn unpin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6286 let mut guard = bin_arc.write();
6287 if let TreeNode::Bottom(ref mut stub) = *guard {
6288 stub.cursor_count = stub.cursor_count.saturating_sub(1);
6289 }
6290 }
6291
/// Returns `true` if `bin` is a BIN-delta rather than a full BIN.
///
/// This simply reports the `is_delta` flag. JE counterpart:
/// `IN.isBINDelta()`.
pub fn bin_is_delta(bin: &BinStub) -> bool {
    bin.is_delta
}
6298
6299 /// Merge delta entries into a full BIN's entry list.
6300 ///
6301 /// - For each delta entry: if a matching key already exists in `bin`,
6302 /// replace it (delta is authoritative).
6303 /// - Otherwise insert the delta entry in sorted position.
6304 ///
6305 /// Delta entries carry **full** keys (prefix already prepended by the
6306 /// caller). After applying all delta entries the BIN's prefix is
6307 /// recomputed so the final state is consistent.
6308 ///
6309 /// All delta entries are considered to be the most-recently-dirtied
6310 /// state, exactly as in where delta slots supersede full-BIN slots.
6311 pub fn apply_delta_to_bin(
6312 bin: &mut BinStub,
6313 delta_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
6314 ) {
6315 for (full_key, lsn, data) in delta_entries {
6316 // `full_key` is a full (uncompressed) key here.
6317 bin.insert_with_prefix(full_key, lsn, data);
6318 }
6319 bin.dirty = true;
6320 }
6321
6322 /// Reconstitute a BIN-delta into a full BIN.
6323 ///
6324 /// from the:
6325 ///
6326 /// 1. Extract the delta entries from `self` (this BIN-delta), decompressing
6327 /// them to full keys.
6328 /// 2. Apply them onto `base` (the previously logged full BIN) via
6329 /// `apply_delta_to_bin`.
6330 /// 3. Copy `base`'s merged entries and prefix back into `self`.
6331 /// 4. Clear the `is_delta` flag so subsequent code treats `self` as
6332 /// a full BIN.
6333 ///
6334 /// After this call `self` is a full BIN; `base` should be discarded.
6335 pub fn mutate_to_full_bin(delta: &mut BinStub, mut base: BinStub) {
6336 // Decompress delta entries to full keys before applying.
6337 let delta_full_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)> = (0
6338 ..delta.entries.len())
6339 .map(|i| {
6340 (
6341 delta.get_full_key(i).unwrap_or_default(),
6342 delta.get_lsn(i),
6343 delta.entries[i].data.clone(),
6344 )
6345 })
6346 .collect();
6347 // reconstituteBIN + resetContent + setBINDelta(false).
6348 Self::apply_delta_to_bin(&mut base, delta_full_entries);
6349 delta.entries = base.entries;
6350 delta.lsn_rep = base.lsn_rep; // T-3
6351 delta.keys = base.keys; // T-2
6352 delta.key_prefix = base.key_prefix;
6353 delta.is_delta = false;
6354 delta.dirty = true;
6355 }
6356
6357 /// Read an IN/BIN log entry at `log_lsn` and deserialise it into a
6358 /// `TreeNode`, ready to be installed as a (re-fetched) resident node.
6359 ///
6360 /// JE `LogManager.getLogEntry(lsn)` + `IN.readFromLog` as used by
6361 /// `ChildReference.fetchTarget` (the path that re-materializes a
6362 /// non-resident node from its persisted LSN on descent) and by
6363 /// `Tree.getRootINRootAlreadyLatched` for the root. The freshly-fetched
6364 /// node has no resident children (`TargetRep::None`); its own children, if
6365 /// any, are re-fetched on demand the same way when the descent reaches
6366 /// them.
6367 ///
6368 /// Returns `None` if the LSN is null, the log read fails, the entry is not
6369 /// an IN/BIN, or deserialisation fails (the caller treats this as "node
6370 /// unavailable" rather than panicking, matching the graceful-degradation
6371 /// policy of `mutate_to_full_bin_from_log`).
6372 fn fetch_node_from_log(&self, log_lsn: Lsn) -> Option<TreeNode> {
6373 if log_lsn == NULL_LSN {
6374 return None;
6375 }
6376 let lm = self.log_manager.as_ref()?;
6377 let (entry_type, payload) = lm.read_entry(log_lsn).ok()?;
6378 // The on-disk payload is an `InLogEntry` body (db_id | prev_full_lsn
6379 // | prev_delta_lsn | len | node_data). The recovery scanner strips
6380 // this header before calling `recover_in_redo`; re-fetch must do the
6381 // same so `deserialize_*` sees the bare node bytes. JE
6382 // `INLogEntry.readEntry` parses the same wrapper.
6383 let in_entry =
6384 noxu_log::entry::in_log_entry::InLogEntry::read_from_log(&payload)
6385 .ok()?;
6386 let node_data = &in_entry.node_data;
6387 use noxu_log::LogEntryType;
6388 match entry_type {
6389 LogEntryType::BIN => {
6390 Self::deserialize_bin(node_data).map(TreeNode::Bottom)
6391 }
6392 LogEntryType::IN => {
6393 Self::deserialize_upper_in(node_data).map(TreeNode::Internal)
6394 }
6395 // BIN-deltas are never logged as the *root* version and are
6396 // reconstituted by the BIN-delta path, not here.
6397 _ => {
6398 log::warn!(
6399 "fetch_node_from_log: expected IN/BIN entry at LSN {:?}, \
6400 got {:?}",
6401 log_lsn,
6402 entry_type
6403 );
6404 None
6405 }
6406 }
6407 }
6408
6409 /// Reconstitute a BIN-delta into a full BIN by reading the base from log.
6410 ///
6411 /// — the
6412 /// single-argument overload that calls `fetchFullBIN(databaseImpl)` to
6413 /// read the last full BIN from the log manager automatically.
6414 ///
6415 /// Algorithm:
6416 /// 1. If `delta.last_full_lsn == NULL_LSN`, the BIN was never written as a
6417 /// full entry; there is no base to merge so the delta IS the full BIN.
6418 /// Clear `is_delta` and return.
6419 /// 2. Read the full-BIN log entry at `delta.last_full_lsn` using
6420 /// `log_manager.read_entry(lsn)`.
6421 /// 3. Deserialize the payload with `BinStub::deserialize_full()`.
6422 /// 4. Delegate to `Self::mutate_to_full_bin(delta, base)` to merge and
6423 /// replace `delta`'s contents.
6424 ///
6425 /// On any read / parse failure the function falls back to clearing the
6426 /// `is_delta` flag without merging, so the caller always gets a non-delta
6427 /// BIN (possibly missing some old slots). This mirrors the
6428 /// `EnvironmentFailureException` path but gracefully degrades instead of
6429 /// panicking.
6430 ///
6431 /// `BIN.fetchFullBIN(dbImpl)` + `BIN.mutateToFullBIN(boolean)`.
6432 pub fn mutate_to_full_bin_from_log(
6433 delta: &mut BinStub,
6434 log_manager: &noxu_log::LogManager,
6435 ) {
6436 if !delta.is_delta {
6437 // Already a full BIN; nothing to do.
6438 return;
6439 }
6440
6441 if delta.last_full_lsn == NULL_LSN {
6442 // BIN has never been logged as a full entry — the in-memory delta
6443 // is effectively the full state. During recovery this path is
6444 // harmless.
6445 delta.is_delta = false;
6446 return;
6447 }
6448
6449 // Read the full-BIN log entry at last_full_lsn.
6450 // `envImpl.getLogManager().getEntryHandleFileNotFound(lsn)`.
6451 match log_manager.read_entry(delta.last_full_lsn) {
6452 Ok((entry_type, payload)) => {
6453 use noxu_log::LogEntryType;
6454 if entry_type == LogEntryType::BIN {
6455 if let Some(mut base) = BinStub::deserialize_full(&payload)
6456 {
6457 // Set the base's last_full_lsn so it is preserved
6458 // into the merged result.
6459 base.last_full_lsn = delta.last_full_lsn;
6460 Self::mutate_to_full_bin(delta, base);
6461 return;
6462 }
6463 // Deserialization failed — fall through to graceful degradation.
6464 log::warn!(
6465 "mutate_to_full_bin_from_log: failed to deserialize \
6466 full BIN at LSN {:?}; keeping delta as-is",
6467 delta.last_full_lsn
6468 );
6469 } else {
6470 log::warn!(
6471 "mutate_to_full_bin_from_log: expected BIN entry at \
6472 LSN {:?}, got {:?}",
6473 delta.last_full_lsn,
6474 entry_type
6475 );
6476 }
6477 }
6478 Err(e) => {
6479 log::warn!(
6480 "mutate_to_full_bin_from_log: failed to read log at \
6481 LSN {:?}: {}",
6482 delta.last_full_lsn,
6483 e
6484 );
6485 }
6486 }
6487
6488 // Graceful degradation: promote the delta to a "full" BIN without
6489 // the base slots. The BIN will be re-logged as a full BIN at the
6490 // next checkpoint.
6491 delta.is_delta = false;
6492 delta.dirty = true;
6493 }
6494
6495 // ========================================================================
6496 // getNextBin / getPrevBin
6497 // ========================================================================
6498
6499 /// Return the entries of the BIN immediately to the right of the BIN
6500 /// that contains (or would contain) `current_key`.
6501 ///
6502 /// → `Tree.getNextIN(forward=true)`.
6503 ///
6504 /// # Algorithm
6505 /// 1. Build a root-to-BIN path for `current_key`.
6506 /// 2. Walk the path back up looking for a parent that has a slot to the
6507 /// right of the slot we descended through.
6508 /// 3. When found, descend to the leftmost BIN of that sibling subtree.
6509 /// 4. If no such parent exists, return `None` (no next BIN).
6510 pub fn get_next_bin(
6511 &self,
6512 current_key: &[u8],
6513 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6514 let root = self.get_root()?;
6515 self.get_adjacent_bin(&root, current_key, true)
6516 }
6517
6518 /// Return the entries of the BIN immediately to the left of the BIN
6519 /// that contains (or would contain) `current_key`.
6520 ///
6521 /// → `Tree.getNextIN(forward=false)`.
6522 pub fn get_prev_bin(
6523 &self,
6524 current_key: &[u8],
6525 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6526 let root = self.get_root()?;
6527 self.get_adjacent_bin(&root, current_key, false)
6528 }
6529
6530 /// Core implementation shared by `get_next_bin` and `get_prev_bin`.
6531 ///
6532 /// Builds the path from `root` down to the BIN for `current_key`
6533 /// (each element records the parent arc, the slot index taken,
6534 /// and the child Arc reached) using `read_arc()` hand-over-hand
6535 /// latch coupling.
6536 ///
6537 /// The ascent re-acquires the parent's read lock one level at a
6538 /// time. To handle a concurrent split that completes between
6539 /// path capture and ascent, we validate that the slot still
6540 /// holds the child Arc we descended through. If the slot
6541 /// mismatches we retry the whole operation from root with a
6542 /// short pause between attempts. The retry budget is generous
6543 /// (`MAX_ASCENT_ATTEMPTS`) so that the typical case of a few
6544 /// cascading splits between two BIN-level cursor steps is
6545 /// absorbed without surfacing as a false end-of-iteration.
6546 /// After exhausting the budget we conservatively return `None`,
6547 /// signalling "no adjacent BIN found"; the cursor will then
6548 /// either restart its scan or report end-of-iteration. The
6549 /// budget is finite so a pathological workload (a thread
6550 /// permanently splitting under us) cannot livelock the lookup.
6551 /// JE `Tree.getNextIN` / `Tree.getPrevIN`.
6552 ///
6553 /// R3 fix (2026-06-16): converted from `static fn` to `&self` so that the
6554 /// IN-level descent uses `self.upper_in_floor_index` (comparator-aware)
6555 /// instead of a raw byte `<=`. Without this, databases with a custom
6556 /// comparator (secondary indexes, sorted-dup) could descend to the wrong
6557 /// child → wrong adjacent BIN → incorrect cursor iteration across BIN
6558 /// boundaries. Mirrors `Tree.getNextIN`/`Tree.getPrevIN` using the
6559 /// comparator-aware `IN.findEntry`.
6560 fn get_adjacent_bin(
6561 &self,
6562 root: &Arc<RwLock<TreeNode>>,
6563 current_key: &[u8],
6564 forward: bool,
6565 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6566 const MAX_ASCENT_ATTEMPTS: u32 = 8;
6567 for attempt in 0..MAX_ASCENT_ATTEMPTS {
6568 match self.get_adjacent_bin_attempt(root, current_key, forward) {
6569 AdjacentBinOutcome::Found(v) => return Some(v),
6570 AdjacentBinOutcome::NoAdjacent => return None,
6571 AdjacentBinOutcome::SplitRaceRetry => {
6572 // Brief pause to let the splitter finish.
6573 if attempt + 1 < MAX_ASCENT_ATTEMPTS {
6574 std::thread::yield_now();
6575 }
6576 }
6577 }
6578 }
6579 // Exhausted retry budget. Signal "no adjacent" so the
6580 // cursor can fall back to its end-of-iteration path.
6581 None
6582 }
6583
6584 /// One attempt at `get_adjacent_bin`. The tri-state return
6585 /// value distinguishes "no adjacent BIN exists" (which the
6586 /// caller should propagate as end-of-iteration) from "a
6587 /// concurrent split invalidated our path" (which the caller
6588 /// should retry from root).
6589 fn get_adjacent_bin_attempt(
6590 &self,
6591 root: &Arc<RwLock<TreeNode>>,
6592 current_key: &[u8],
6593 forward: bool,
6594 ) -> AdjacentBinOutcome {
6595 // Path entry: (parent_arc, slot_idx_taken, child_arc_reached).
6596 // The child Arc lets the ascent validate that the slot still
6597 // points to the same node we descended through.
6598 let mut path: Vec<(
6599 Arc<RwLock<TreeNode>>,
6600 usize,
6601 Arc<RwLock<TreeNode>>,
6602 )> = Vec::new();
6603
6604 let mut guard: parking_lot::ArcRwLockReadGuard<
6605 parking_lot::RawRwLock,
6606 TreeNode,
6607 > = root.read_arc();
6608 loop {
6609 if guard.is_bin() {
6610 break;
6611 }
6612
6613 let (next_arc, slot_idx) = match &*guard {
6614 TreeNode::Internal(n) => {
6615 if n.entries.is_empty() {
6616 return AdjacentBinOutcome::NoAdjacent;
6617 }
6618 // R3 fix: use comparator-aware upper_in_floor_index so
6619 // that custom-comparator / sorted-dup databases descend
6620 // to the correct child. Mirrors JE Tree.getNextIN which
6621 // uses IN.findEntry (comparator-aware) not raw byte order.
6622 let idx =
6623 self.upper_in_floor_index(&n.entries, current_key);
6624 let child = match n.get_child(idx) {
6625 Some(c) => c,
6626 None => return AdjacentBinOutcome::NoAdjacent,
6627 };
6628 (child, idx)
6629 }
6630 TreeNode::Bottom(_) => unreachable!(),
6631 };
6632
6633 // Record the parent and the child we are about to enter
6634 // — the child Arc lets the ascent validate the slot.
6635 let parent_arc =
6636 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
6637 path.push((parent_arc, slot_idx, Arc::clone(&next_arc)));
6638
6639 // Hand-over-hand: take child read lock BEFORE releasing parent.
6640 let next_guard = next_arc.read_arc();
6641 drop(guard);
6642 guard = next_guard;
6643 }
6644 drop(guard);
6645
6646 // Ascend the path. At each level, validate that
6647 // `parent.entries[taken_idx].child == descended_child` before
6648 // trusting `taken_idx` as a coordinate. If not, return
6649 // `SplitRaceRetry` so the caller restarts from root.
6650 while let Some((parent_arc, taken_idx, descended_child)) = path.pop() {
6651 let parent_guard = parent_arc.read();
6652 let (n_entries, slot_still_valid) = match &*parent_guard {
6653 TreeNode::Internal(p) => {
6654 let n = p.entries.len();
6655 let valid = p
6656 .child_ref(taken_idx)
6657 .is_some_and(|c| Arc::ptr_eq(c, &descended_child));
6658 (n, valid)
6659 }
6660 _ => return AdjacentBinOutcome::NoAdjacent,
6661 };
6662 drop(parent_guard);
6663
6664 if !slot_still_valid {
6665 return AdjacentBinOutcome::SplitRaceRetry;
6666 }
6667
6668 let sibling_idx = if forward {
6669 taken_idx + 1
6670 } else if taken_idx == 0 {
6671 // No left sibling at this level — ascend further.
6672 continue;
6673 } else {
6674 taken_idx - 1
6675 };
6676
6677 if forward && sibling_idx >= n_entries {
6678 // No right sibling at this level — ascend further.
6679 continue;
6680 }
6681
6682 // Found a sibling slot — fetch the sibling child arc.
6683 let sibling_arc = {
6684 let g = parent_arc.read();
6685 match &*g {
6686 TreeNode::Internal(p) => match p.get_child(sibling_idx) {
6687 Some(c) => c,
6688 None => return AdjacentBinOutcome::NoAdjacent,
6689 },
6690 _ => return AdjacentBinOutcome::NoAdjacent,
6691 }
6692 };
6693
6694 // Descend to the leftmost (forward) or rightmost (!forward) BIN.
6695 return match Self::descend_to_edge_bin(&sibling_arc, forward) {
6696 Some(v) => AdjacentBinOutcome::Found(v),
6697 None => AdjacentBinOutcome::NoAdjacent,
6698 };
6699 }
6700
6701 // Exhausted path without finding a sibling → no adjacent BIN.
6702 AdjacentBinOutcome::NoAdjacent
6703 }
6704
6705 /// Descend to the leftmost BIN (`forward = true`) or rightmost BIN
6706 /// (`forward = false`) in the sub-tree rooted at `node_arc`.
6707 ///
6708 /// `Tree.searchSubTree(SearchType.LEFT / RIGHT, targetLevel)`.
6709 fn descend_to_edge_bin(
6710 node_arc: &Arc<RwLock<TreeNode>>,
6711 forward: bool,
6712 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6713 // Hand-over-hand latch coupling — see Tree::search.
6714 let mut guard: parking_lot::ArcRwLockReadGuard<
6715 parking_lot::RawRwLock,
6716 TreeNode,
6717 > = node_arc.read_arc();
6718
6719 loop {
6720 if guard.is_bin() {
6721 return match &*guard {
6722 TreeNode::Bottom(b) => {
6723 // Return entries with full (decompressed) keys so that
6724 // callers always work with complete keys.
6725 //
6726 // TREE-F1: KD slots are NOT filtered here — the BIN's
6727 // slot indices are returned verbatim so the cursor can
6728 // skip KD slots itself (CursorImpl getNext loop;
6729 // CursorImpl.java:2062-2064) and continue to the next
6730 // BIN when an edge BIN is entirely KD during the
6731 // BIN-delta reconstitution window.
6732 let full_entries: Vec<(BinEntry, Lsn, Vec<u8>)> = (0
6733 ..b.entries.len())
6734 .map(|i| {
6735 (
6736 BinEntry {
6737 data: b.entries[i].data.clone(),
6738 known_deleted: b.entries[i]
6739 .known_deleted,
6740 dirty: b.entries[i].dirty,
6741 expiration_time: b.entries[i]
6742 .expiration_time,
6743 },
6744 b.get_lsn(i),
6745 b.get_full_key(i).unwrap_or_default(),
6746 )
6747 })
6748 .collect();
6749 Some(full_entries)
6750 }
6751 _ => None,
6752 };
6753 }
6754
6755 let next = match &*guard {
6756 TreeNode::Internal(n) => {
6757 if forward {
6758 n.get_child(0)?
6759 } else {
6760 n.get_child(n.entries.len().saturating_sub(1))?
6761 }
6762 }
6763 _ => return None,
6764 };
6765 // Take child read lock BEFORE releasing parent's.
6766 let next_guard = next.read_arc();
6767 drop(guard);
6768 guard = next_guard;
6769 }
6770 }
6771}
6772
6773// ============================================================================
6774// Tree statistics
6775// ============================================================================
6776
/// Structural statistics gathered by a walk over the tree's resident
/// nodes.
///
/// Produced by `Tree::collect_stats`. JE counterpart:
/// `TreeWalkerStatsAccumulator`.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct TreeStats {
    /// Number of BINs (bottom internal nodes) visited.
    pub n_bins: u64,
    /// Number of upper INs (internal nodes above the BIN level) visited.
    pub n_ins: u64,
    /// Total number of entries across all visited nodes, counting both
    /// BIN slots and upper-IN slots.
    pub n_entries: u64,
    /// Height of the tree (1 = root is a BIN, 2 = one level above BINs, …).
    /// Zero when no node was visited.
    pub height: u32,
}
6791
6792impl Tree {
6793 /// Walks the entire tree and collects structural statistics.
6794 ///
6795 /// `TreeWalkerStatsAccumulator` pattern — performs a simple
6796 /// recursive DFS and counts INs, BINs, entries, and tree height.
6797 pub fn collect_stats(&self) -> TreeStats {
6798 let mut stats = TreeStats::default();
6799 if let Some(root) = self.get_root() {
6800 Self::collect_stats_recursive(&root, &mut stats, 0);
6801 }
6802 stats
6803 }
6804
6805 fn collect_stats_recursive(
6806 node_arc: &Arc<RwLock<TreeNode>>,
6807 stats: &mut TreeStats,
6808 depth: u32,
6809 ) {
6810 let guard = node_arc.read();
6811
6812 let current_height = depth + 1;
6813 if current_height > stats.height {
6814 stats.height = current_height;
6815 }
6816
6817 match &*guard {
6818 TreeNode::Bottom(b) => {
6819 stats.n_bins += 1;
6820 stats.n_entries += b.entries.len() as u64;
6821 }
6822 TreeNode::Internal(n) => {
6823 stats.n_ins += 1;
6824 stats.n_entries += n.entries.len() as u64;
6825 // Collect child arcs before releasing the guard.
6826 let children: Vec<Arc<RwLock<TreeNode>>> =
6827 n.resident_children();
6828 // Release guard before recursing to avoid lock ordering issues.
6829 drop(guard);
6830 for child in children {
6831 Self::collect_stats_recursive(&child, stats, depth + 1);
6832 }
6833 }
6834 }
6835 }
6836
6837 /// Collects all dirty BINs as (Arc to node, db_id) pairs.
6838 ///
6839 /// The checkpoint path calls this to enumerate BINs that need to be
6840 /// logged. For each dirty BIN the checkpoint decides — based on the
6841 /// BIN-delta threshold — whether to write a full `BIN` entry or a
6842 /// `BINDelta` entry.
6843 ///
6844 /// `Checkpointer.processINList()` which iterates the dirty
6845 /// IN list accumulated during normal operation.
6846 pub fn collect_dirty_bins(
6847 &self,
6848 db_id: u64,
6849 ) -> Vec<(u64, Arc<RwLock<TreeNode>>)> {
6850 let mut result = Vec::new();
6851 if let Some(root) = self.get_root() {
6852 Self::collect_dirty_bins_recursive(&root, db_id, &mut result);
6853 }
6854 result
6855 }
6856
6857 fn collect_dirty_bins_recursive(
6858 node_arc: &Arc<RwLock<TreeNode>>,
6859 db_id: u64,
6860 out: &mut Vec<(u64, Arc<RwLock<TreeNode>>)>,
6861 ) {
6862 let guard = node_arc.read();
6863 match &*guard {
6864 TreeNode::Bottom(b) => {
6865 // Include this BIN if it is dirty or has any dirty slots.
6866 if b.dirty || b.dirty_count() > 0 {
6867 out.push((db_id, Arc::clone(node_arc)));
6868 }
6869 }
6870 TreeNode::Internal(n) => {
6871 let children: Vec<Arc<RwLock<TreeNode>>> =
6872 n.resident_children();
6873 drop(guard);
6874 for child in children {
6875 Self::collect_dirty_bins_recursive(&child, db_id, out);
6876 } // guard already dropped
6877 }
6878 }
6879 }
6880
6881 /// Collect all BINs that have at least one `known_deleted` slot.
6882 ///
6883 /// INCompressor queue-drain scan in the: the daemon iterates
6884 /// the in-memory IN list and identifies BINs that still hold zombie deleted
6885 /// slots. Each returned `Arc` can be passed directly to `compress_bin()`.
6886 pub fn collect_bins_with_known_deleted(
6887 &self,
6888 ) -> Vec<Arc<RwLock<TreeNode>>> {
6889 let mut result = Vec::new();
6890 if let Some(root) = self.get_root() {
6891 Self::collect_bins_with_known_deleted_recursive(&root, &mut result);
6892 }
6893 result
6894 }
6895
6896 fn collect_bins_with_known_deleted_recursive(
6897 node_arc: &Arc<RwLock<TreeNode>>,
6898 out: &mut Vec<Arc<RwLock<TreeNode>>>,
6899 ) {
6900 let guard = node_arc.read();
6901 match &*guard {
6902 TreeNode::Bottom(b) => {
6903 if b.entries.iter().any(|e| e.known_deleted) {
6904 out.push(Arc::clone(node_arc));
6905 }
6906 }
6907 TreeNode::Internal(n) => {
6908 let children: Vec<Arc<RwLock<TreeNode>>> =
6909 n.resident_children();
6910 drop(guard);
6911 for child in children {
6912 Self::collect_bins_with_known_deleted_recursive(
6913 &child, out,
6914 );
6915 }
6916 }
6917 }
6918 }
6919
6920 /// Collect all dirty upper (non-BIN) internal nodes, sorted ascending by
6921 /// level (bottom-up order, BIN level excluded).
6922 ///
6923 /// Serialise an upper-IN node (level > 1) by node_id for off-heap storage.
6924 ///
6925 /// Traverses the tree to find the internal node whose matches,
6926 /// then calls to produce a compact byte
6927 /// representation. Returns if the node is not found or is a BIN
6928 /// (BINs are not upper INs).
6929 ///
6930 /// Mirrors `OffHeapAllocator` serialises the same bytes that would be written
6931 /// to the log, allowing the evictor to store upper-INs off-heap and avoid
6932 /// log-file reads on the next traversal.
6933 pub fn serialize_upper_in(&self, node_id: u64) -> Option<Vec<u8>> {
6934 let root = self.get_root()?;
6935 Self::find_and_serialize_upper_in(&root, node_id)
6936 }
6937
6938 fn find_and_serialize_upper_in(
6939 node_arc: &Arc<RwLock<TreeNode>>,
6940 target_id: u64,
6941 ) -> Option<Vec<u8>> {
6942 let guard = node_arc.read();
6943 match &*guard {
6944 TreeNode::Bottom(_) => None, // BINs are not upper INs
6945 TreeNode::Internal(n) => {
6946 if n.node_id == target_id {
6947 // Serialise InNodeStub for off-heap storage.
6948 // Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE)
6949 // then per-entry: key_len(u32BE) | key | lsn(u64BE)
6950 let mut buf = Vec::new();
6951 buf.extend_from_slice(&n.node_id.to_be_bytes());
6952 buf.extend_from_slice(&n.level.to_be_bytes());
6953 buf.extend_from_slice(
6954 &(n.entries.len() as u32).to_be_bytes(),
6955 );
6956 for (i, e) in n.entries.iter().enumerate() {
6957 buf.extend_from_slice(
6958 &(e.key.len() as u32).to_be_bytes(),
6959 );
6960 buf.extend_from_slice(&e.key);
6961 buf.extend_from_slice(
6962 &n.get_lsn(i).as_u64().to_be_bytes(),
6963 );
6964 }
6965 return Some(buf);
6966 }
6967 // Recurse into children before releasing the guard so we
6968 // hold the minimum read-lock duration.
6969 let children: Vec<Arc<RwLock<TreeNode>>> =
6970 n.resident_children();
6971 drop(guard);
6972 for child in &children {
6973 if let Some(bytes) =
6974 Self::find_and_serialize_upper_in(child, target_id)
6975 {
6976 return Some(bytes);
6977 }
6978 }
6979 None
6980 }
6981 }
6982 }
6983
6984 /// Upper-IN traversal in `Checkpointer.processINList()` from
6985 /// — visits all `TreeNode::Internal` nodes whose `dirty` flag is set
6986 /// and returns them together with their level, sorted lowest-level-first
6987 /// so the checkpointer can log them bottom-up. The root is always the
6988 /// last entry (highest level), which must be logged `Provisional::No`.
6989 pub fn collect_dirty_upper_ins(
6990 &self,
6991 _db_id: u64,
6992 ) -> Vec<(i32, Arc<RwLock<TreeNode>>)> {
6993 let mut result: Vec<(i32, Arc<RwLock<TreeNode>>)> = Vec::new();
6994 if let Some(root) = self.get_root() {
6995 Self::collect_dirty_upper_ins_recursive(&root, &mut result);
6996 }
6997 result.sort_by_key(|(level, _)| *level);
6998 result
6999 }
7000
7001 fn collect_dirty_upper_ins_recursive(
7002 node_arc: &Arc<RwLock<TreeNode>>,
7003 out: &mut Vec<(i32, Arc<RwLock<TreeNode>>)>,
7004 ) {
7005 let guard = node_arc.read();
7006 match &*guard {
7007 TreeNode::Bottom(_) => {
7008 // BINs are handled by flush_dirty_bins_internal; skip here.
7009 }
7010 TreeNode::Internal(n) => {
7011 let is_dirty = n.dirty;
7012 // REC-AA: return the node's ACTUAL tree level (n.level, in
7013 // MAIN_LEVEL|n units), not a root-relative depth. The level
7014 // must be on the same scale as a BIN's `level` (BIN_LEVEL =
7015 // MAIN_LEVEL|1) so that the checkpointer's flush-level
7016 // computation and the evictor's `node_level < flush_level`
7017 // comparison are meaningful. With a root-relative depth the
7018 // root had the SMALLEST value (0) and the IN above the BINs
7019 // the LARGEST, inverting the provisional/non-provisional
7020 // boundary; with n.level the root has the largest level, as JE
7021 // expects.
7022 let level = n.level;
7023 let children: Vec<Arc<RwLock<TreeNode>>> =
7024 n.resident_children();
7025 drop(guard);
7026 // Recurse into children first (bottom-up ordering).
7027 for child in &children {
7028 Self::collect_dirty_upper_ins_recursive(child, out);
7029 }
7030 // Add this node after children (so parent comes after all descendants).
7031 if is_dirty {
7032 out.push((level, Arc::clone(node_arc)));
7033 }
7034 }
7035 }
7036 }
7037
7038 // ========================================================================
7039 // Tree.java ports: 8 additional tree methods (Task #82)
7040 // ========================================================================
7041
7042 /// Returns `true` if the root node is currently loaded in memory.
7043 ///
7044 /// .
7045 pub fn is_root_resident(&self) -> bool {
7046 self.root.read().is_some()
7047 }
7048
7049 /// Returns the root node `Arc` if present, or `None`.
7050 ///
7051 /// .
7052 pub fn get_resident_root_in(&self) -> Option<Arc<RwLock<TreeNode>>> {
7053 self.root.read().clone()
7054 }
7055
7056 /// Returns the BIN that should contain a slot for `key` (the "parent" of
7057 /// LN slots).
7058 ///
7059 /// . Descends the tree
7060 /// exactly like `search()` and returns the leaf-level BIN arc, or `None`
7061 /// if the tree is empty.
7062 ///
7063 /// Uses `read_arc()` hand-over-hand on the descent — the child
7064 /// guard is taken before the parent guard is dropped, matching
7065 /// `search()`. Returns the BIN Arc with no read lock held; the
7066 /// caller must take whatever lock it needs to operate on the
7067 /// returned BIN.
7068 pub fn get_parent_bin_for_child_ln(
7069 &self,
7070 key: &[u8],
7071 ) -> Option<Arc<RwLock<TreeNode>>> {
7072 let root = self.get_root()?;
7073 let mut current_arc: Arc<RwLock<TreeNode>> = root.clone();
7074 let mut guard: parking_lot::ArcRwLockReadGuard<
7075 parking_lot::RawRwLock,
7076 TreeNode,
7077 > = root.read_arc();
7078
7079 loop {
7080 if guard.is_bin() {
7081 drop(guard);
7082 return Some(current_arc);
7083 }
7084
7085 let parent_arc = current_arc.clone();
7086 let next_idx = match &*guard {
7087 TreeNode::Internal(n) => {
7088 if n.entries.is_empty() {
7089 return None;
7090 }
7091 let idx = self.upper_in_floor_index(&n.entries, key);
7092 match n.get_child(idx) {
7093 Some(c) => {
7094 let next_guard = c.read_arc();
7095 drop(guard);
7096 current_arc = c;
7097 guard = next_guard;
7098 continue;
7099 }
7100 None => idx, // EV-14/EV-13: re-fetch below.
7101 }
7102 }
7103 TreeNode::Bottom(_) => {
7104 unreachable!("is_bin() returned false above")
7105 }
7106 };
7107 // Hand-over-hand: take child guard before dropping parent.
7108 drop(guard);
7109 let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
7110 let next_guard = child.read_arc();
7111 current_arc = child;
7112 guard = next_guard;
7113 }
7114 }
7115
7116 /// Returns the BIN where `key` should be inserted.
7117 ///
7118 /// . Semantically identical to
7119 /// `get_parent_bin_for_child_ln` — expressed as a separate method to match
7120 /// API surface.
7121 ///
7122 /// Implemented as a delegation to `get_parent_bin_for_child_ln`,
7123 /// which uses `read_arc()` hand-over-hand on the descent.
7124 pub fn find_bin_for_insert(
7125 &self,
7126 key: &[u8],
7127 ) -> Option<Arc<RwLock<TreeNode>>> {
7128 self.get_parent_bin_for_child_ln(key)
7129 }
7130
7131 /// Search for a BIN, allowing splits during descent (preemptive splitting).
7132 ///
7133 /// . This thin wrapper
7134 /// delegates to `search()` and returns the result wrapped in `Some`.
7135 /// The full split-allowed descent is performed by `insert()` internally;
7136 /// this method exposes the same result type for callers that only need to
7137 /// locate the BIN.
7138 ///
7139 /// Returns `None` if the tree is empty.
7140 pub fn search_splits_allowed(&self, key: &[u8]) -> Option<SearchResult> {
7141 self.search(key)
7142 }
7143
7144 /// Traverses the entire tree and returns every IN and BIN node as a flat
7145 /// list.
7146 ///
7147 /// . Used by recovery to rebuild
7148 /// the in-memory IN list after log replay. The walk is a BFS from the
7149 /// root; every `Arc<RwLock<TreeNode>>` encountered (both Internal and
7150 /// Bottom variants) is included in the result.
7151 pub fn rebuild_in_list(&self) -> Vec<Arc<RwLock<TreeNode>>> {
7152 let mut result = Vec::new();
7153 if let Some(root) = self.get_root() {
7154 Self::rebuild_in_list_recursive(&root, &mut result);
7155 }
7156 result
7157 }
7158
7159 fn rebuild_in_list_recursive(
7160 node_arc: &Arc<RwLock<TreeNode>>,
7161 out: &mut Vec<Arc<RwLock<TreeNode>>>,
7162 ) {
7163 // Push this node unconditionally — both INs and BINs belong in the list.
7164 out.push(Arc::clone(node_arc));
7165
7166 let guard = node_arc.read();
7167
7168 if let TreeNode::Internal(n) = &*guard {
7169 // Collect child arcs while holding the guard, then drop it before
7170 // recursing to avoid holding multiple locks simultaneously.
7171 let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
7172 drop(guard);
7173 for child in children {
7174 Self::rebuild_in_list_recursive(&child, out);
7175 }
7176 }
7177 // BIN nodes are leaves — no children to recurse into.
7178 }
7179
7180 /// Validates internal tree consistency.
7181 ///
7182 /// . Primarily a debug/test tool.
7183 ///
7184 /// Rules checked:
7185 /// - An empty tree (no root) is trivially valid → returns `true`.
7186 /// - A non-empty tree must have a non-null root.
7187 /// - Every Internal node must have at least one entry.
7188 /// - Every child pointer that is `Some` must be readable (lock must be
7189 /// acquirable — i.e., no poisoned locks).
7190 ///
7191 /// Returns `true` if no inconsistencies are detected, `false` otherwise.
7192 pub fn validate_in_list(&self) -> bool {
7193 match self.get_root() {
7194 None => true, // empty tree is always valid
7195 Some(root) => Self::validate_node(&root),
7196 }
7197 }
7198
7199 fn validate_node(node_arc: &Arc<RwLock<TreeNode>>) -> bool {
7200 let guard = node_arc.read();
7201
7202 match &*guard {
7203 TreeNode::Bottom(_bin) => {
7204 // BIN nodes are always structurally valid at this level.
7205 true
7206 }
7207 TreeNode::Internal(n) => {
7208 // An Internal node must have at least one entry.
7209 if n.entries.is_empty() {
7210 return false;
7211 }
7212 // Collect child arcs before dropping the guard.
7213 let children: Vec<Arc<RwLock<TreeNode>>> =
7214 n.resident_children();
7215 drop(guard);
7216 // Recursively validate every resident child.
7217 for child in children {
7218 if !Self::validate_node(&child) {
7219 return false;
7220 }
7221 }
7222 true
7223 }
7224 }
7225 }
7226
7227 /// Traverses the tree to find the parent IN that contains `child_node_id`
7228 /// as one of its child slots.
7229 ///
7230 /// . Used by the cleaner
7231 /// migration path to re-insert migrated INs after eviction/fetch.
7232 ///
7233 /// Returns `(parent_arc, slot_index)` where `slot_index` is the position
7234 /// in the parent's `entries` vector whose child matches `child_node_id`,
7235 /// or `None` if no such parent is found.
7236 pub fn get_parent_in_for_child_in(
7237 &self,
7238 child_node_id: u64,
7239 ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7240 let root = self.get_root()?;
7241 Self::find_parent_of_node_id(&root, child_node_id)
7242 }
7243
7244 /// Recursive DFS helper for `get_parent_in_for_child_in`.
7245 ///
7246 /// Scans every entry in each Internal node. When a child's node_id
7247 /// matches `target_id` the parent arc and slot index are returned.
7248 fn find_parent_of_node_id(
7249 node_arc: &Arc<RwLock<TreeNode>>,
7250 target_id: u64,
7251 ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7252 let guard = node_arc.read();
7253
7254 let TreeNode::Internal(n) = &*guard else {
7255 // BIN nodes have no IN children — cannot be a parent of another IN.
7256 return None;
7257 };
7258
7259 // Check whether any child of this IN has the target node_id.
7260 let mut children: Vec<(usize, Arc<RwLock<TreeNode>>)> = Vec::new();
7261 for slot in 0..n.entries.len() {
7262 if let Some(child_arc) = n.child_ref(slot) {
7263 // Read the child's node_id under a separate lock (acquire child
7264 // while parent guard is still held — this is intentional for
7265 // the ID comparison only; we release both immediately after).
7266 let child_id = {
7267 let cg = child_arc.read();
7268 match &*cg {
7269 TreeNode::Internal(cn) => cn.node_id,
7270 TreeNode::Bottom(cb) => cb.node_id,
7271 }
7272 };
7273
7274 if child_id == target_id {
7275 // Found — return a clone of this node as parent.
7276 let parent_clone = Arc::clone(node_arc);
7277 return Some((parent_clone, slot));
7278 }
7279
7280 // Not found at this slot; schedule this child for recursion.
7281 children.push((slot, Arc::clone(child_arc)));
7282 }
7283 }
7284 // Release parent guard before recursing.
7285 drop(guard);
7286
7287 // Recurse into each Internal child.
7288 for (_slot, child_arc) in children {
7289 if let Some(result) =
7290 Self::find_parent_of_node_id(&child_arc, target_id)
7291 {
7292 return Some(result);
7293 }
7294 }
7295
7296 None
7297 }
7298
7299 /// Propagates the dirty flag upward from `node_arc` to the root.
7300 ///
7301 /// Implicit dirty propagation: after modifying any node,
7302 /// all ancestors on the path to the root must also be marked dirty so
7303 /// the checkpointer logs them.
7304 ///
7305 /// In this happens through `IN.setDirty(true)` calls at each level
7306 /// during split/insert callbacks. Here we walk the weak parent chain.
7307 /// Reconstitute a BIN-delta by merging it onto a base full BIN.
7308 ///
7309 /// Implements JE `BINDelta.reconstituteBIN(databaseImpl)` for the recovery
7310 /// path where the log manager is not available as a `LogManager` but as
7311 /// raw serialized bytes.
7312 ///
7313 /// Algorithm:
7314 /// 1. Deserialise `base_bytes` as a full `BinStub`.
7315 /// 2. Apply `delta_bytes` slots onto the base using `BinStub::apply_delta`
7316 /// (raw slot overlay).
7317 /// 3. Recompute key prefix so prefix-compressed entries are consistent.
7318 ///
7319 /// Returns `None` if either byte slice is malformed.
7320 ///
7321 /// JE `BINDelta.reconstituteBIN` / `BINDelta.applyDelta`
7322 /// (DRIFT-10 / Stage 3).
7323 pub fn reconstitute_bin_delta(
7324 base_bytes: &[u8],
7325 delta_bytes: &[u8],
7326 ) -> Option<BinStub> {
7327 let mut base = BinStub::deserialize_full(base_bytes)?;
7328 // Apply the delta slots onto the base.
7329 // Note: BinStub::apply_delta uses slot-index addressing into base.entries,
7330 // extending with new entries when the slot_idx >= base.entries.len().
7331 // After apply_delta we recompute the key prefix to fix prefix compression.
7332 BinStub::apply_delta(&mut base, delta_bytes)?;
7333 // Recompute prefix so prefix-compressed BINs are consistent after merge.
7334 base.recompute_key_prefix();
7335 base.is_delta = false;
7336 base.dirty = false;
7337 Some(base)
7338 }
7339
7340 pub fn propagate_dirty_to_root(node_arc: &Arc<RwLock<TreeNode>>) {
7341 let parent_weak = { node_arc.read().get_parent() };
7342
7343 if let Some(parent_arc) = parent_weak.and_then(|w| w.upgrade()) {
7344 {
7345 let mut g = parent_arc.write();
7346 g.set_dirty(true);
7347 }
7348 // Recurse further up.
7349 Self::propagate_dirty_to_root(&parent_arc);
7350 }
7351 }
7352
7353 // ========================================================================
7354 // IN-redo: JE RecoveryManager.recoverIN / recoverRootIN / recoverChildIN
7355 // ========================================================================
7356
7357 /// Deserialise an upper-IN node from bytes produced by
7358 /// `TreeNode::write_to_bytes()` / `flush_one_tree_upper_ins`.
7359 ///
7360 /// Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE) | dirty(u8)
7361 /// | per-entry: key_len(u16BE) | key | lsn(u64BE)
7362 ///
7363 /// JE `INFileReader.getIN(db)` / `IN.readFromLog`.
7364 pub fn deserialize_upper_in(bytes: &[u8]) -> Option<InNodeStub> {
7365 if bytes.len() < 13 {
7366 return None;
7367 }
7368 let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
7369 let level = i32::from_be_bytes(bytes[8..12].try_into().ok()?);
7370 let n_entries =
7371 u32::from_be_bytes(bytes[12..16].try_into().ok()?) as usize;
7372 // dirty byte (1 byte after n_entries)
7373 if bytes.len() < 17 {
7374 return None;
7375 }
7376 let mut pos = 17usize; // skip node_id(8) + level(4) + n_entries(4) + dirty(1)
7377 let mut entries = Vec::with_capacity(n_entries);
7378 let mut lsns: Vec<Lsn> = Vec::with_capacity(n_entries);
7379 for _ in 0..n_entries {
7380 if pos + 2 > bytes.len() {
7381 return None;
7382 }
7383 let key_len =
7384 u16::from_be_bytes(bytes[pos..pos + 2].try_into().ok()?)
7385 as usize;
7386 pos += 2;
7387 if pos + key_len > bytes.len() {
7388 return None;
7389 }
7390 let key = bytes[pos..pos + key_len].to_vec();
7391 pos += key_len;
7392 if pos + 8 > bytes.len() {
7393 return None;
7394 }
7395 let lsn = noxu_util::Lsn::from_u64(u64::from_be_bytes(
7396 bytes[pos..pos + 8].try_into().ok()?,
7397 ));
7398 pos += 8;
7399 entries.push(InEntry { key });
7400 lsns.push(lsn); // T-3
7401 }
7402 Some(InNodeStub {
7403 node_id,
7404 level,
7405 entries,
7406 // T-4: a freshly deserialized IN has no resident children.
7407 targets: TargetRep::None,
7408 dirty: false,
7409 generation: 0,
7410 parent: None,
7411 lsn_rep: LsnRep::from_lsns(&lsns), // T-3
7412 })
7413 }
7414
7415 /// Deserialise a BIN from bytes produced by `BinStub::serialize_full()`.
7416 ///
7417 /// Thin wrapper so the recovery path does not need to import `BinStub`
7418 /// directly from callers that only have the raw bytes.
7419 ///
7420 /// JE `INFileReader.getIN(db)` for a BIN entry.
7421 pub fn deserialize_bin(bytes: &[u8]) -> Option<BinStub> {
7422 let mut bin = BinStub::deserialize_full(bytes)?;
7423 bin.dirty = false; // freshly loaded from log — clean for now
7424 Some(bin)
7425 }
7426
7427 /// Apply a logged IN/BIN to the in-memory tree during the recovery redo pass.
7428 ///
7429 /// Implements JE `RecoveryManager.recoverIN`:
7430 /// - `is_root` nodes are handled by `recover_root_in`.
7431 /// - non-root nodes are handled by `recover_child_in`.
7432 ///
7433 /// `log_lsn` is the LSN at which this IN/BIN was logged. The currency
7434 /// check in `recover_child_in` uses this to decide whether to replace the
7435 /// in-memory slot (tree slot LSN < log_lsn → replace; equal → noop;
7436 /// greater → skip).
7437 ///
7438 /// JE `RecoveryManager.recoverIN` / `replayOneIN`
7439 /// (RecoveryManager.java ~lines 1200–1280).
7440 pub fn recover_in_redo(
7441 &self,
7442 log_lsn: noxu_util::Lsn,
7443 is_root: bool,
7444 is_bin: bool,
7445 node_data: &[u8],
7446 ) -> InRedoResult {
7447 if is_bin {
7448 let Some(bin) = Self::deserialize_bin(node_data) else {
7449 return InRedoResult::DeserializeFailed;
7450 };
7451 if is_root {
7452 self.recover_root_bin(log_lsn, bin)
7453 } else {
7454 self.recover_child_bin(log_lsn, bin)
7455 }
7456 } else {
7457 let Some(upper) = Self::deserialize_upper_in(node_data) else {
7458 return InRedoResult::DeserializeFailed;
7459 };
7460 if is_root {
7461 self.recover_root_upper_in(log_lsn, upper)
7462 } else {
7463 self.recover_child_upper_in(log_lsn, upper)
7464 }
7465 }
7466 }
7467
7468 /// Recover a root BIN.
7469 ///
7470 /// If no root exists or the existing root is older (lower LSN), install
7471 /// this BIN as the new root.
7472 ///
7473 /// JE `RecoveryManager.recoverRootIN` / `RootUpdater.doWork`
7474 /// (RecoveryManager.java ~lines 1293–1410).
7475 fn recover_root_bin(
7476 &self,
7477 log_lsn: noxu_util::Lsn,
7478 bin: BinStub,
7479 ) -> InRedoResult {
7480 let mut root_guard = self.root.write();
7481 let existing_lsn = *self.root_log_lsn.read();
7482 match &*root_guard {
7483 None => {
7484 // No root — install this BIN as the root.
7485 // JE: `root == null` case in `RootUpdater.doWork`.
7486 let node = TreeNode::Bottom(bin);
7487 *root_guard = Some(Arc::new(RwLock::new(node)));
7488 *self.root_log_lsn.write() = log_lsn;
7489 InRedoResult::Inserted
7490 }
7491 Some(_) => {
7492 // JE: `originalLsn = root.getLsn()`; replace if logLsn > originalLsn.
7493 if log_lsn > existing_lsn {
7494 let node = TreeNode::Bottom(bin);
7495 *root_guard = Some(Arc::new(RwLock::new(node)));
7496 *self.root_log_lsn.write() = log_lsn;
7497 InRedoResult::Replaced
7498 } else {
7499 InRedoResult::Skipped
7500 }
7501 }
7502 }
7503 }
7504
7505 /// Recover a root upper IN.
7506 ///
7507 /// JE `RecoveryManager.recoverRootIN` for a non-BIN root.
7508 fn recover_root_upper_in(
7509 &self,
7510 log_lsn: noxu_util::Lsn,
7511 upper: InNodeStub,
7512 ) -> InRedoResult {
7513 let mut root_guard = self.root.write();
7514 let existing_lsn = *self.root_log_lsn.read();
7515 match &*root_guard {
7516 None => {
7517 let node = TreeNode::Internal(upper);
7518 *root_guard = Some(Arc::new(RwLock::new(node)));
7519 *self.root_log_lsn.write() = log_lsn;
7520 InRedoResult::Inserted
7521 }
7522 Some(_) => {
7523 if log_lsn > existing_lsn {
7524 let node = TreeNode::Internal(upper);
7525 *root_guard = Some(Arc::new(RwLock::new(node)));
7526 *self.root_log_lsn.write() = log_lsn;
7527 InRedoResult::Replaced
7528 } else {
7529 InRedoResult::Skipped
7530 }
7531 }
7532 }
7533 }
7534
7535 /// Recover a non-root BIN.
7536 ///
7537 /// Implements the three-case currency check from JE
7538 /// `RecoveryManager.recoverChildIN`
7539 /// (RecoveryManager.java lines 1412–1500):
7540 ///
7541 /// 1. Node not in tree: skip (parent logged a later structure that already
7542 /// omits this node, or node was deleted).
7543 /// 2. Physical match (slot LSN == log_lsn): noop — already current.
7544 /// 3. Logical match: another version of the node is in the slot.
7545 /// Replace if tree slot LSN < log_lsn (tree is older), skip otherwise.
7546 fn recover_child_bin(
7547 &self,
7548 log_lsn: noxu_util::Lsn,
7549 bin: BinStub,
7550 ) -> InRedoResult {
7551 let node_id = bin.node_id;
7552 let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7553 else {
7554 // Case 1: not in tree.
7555 return InRedoResult::NotInTree;
7556 };
7557 let mut parent = parent_arc.write();
7558 let TreeNode::Internal(ref mut p) = *parent else {
7559 return InRedoResult::NotInTree;
7560 };
7561 let tree_lsn = p.get_lsn(slot); // T-3
7562 if tree_lsn == log_lsn {
7563 // Case 2: physical match — noop.
7564 InRedoResult::Skipped
7565 } else if tree_lsn < log_lsn {
7566 // Case 3: logical match, tree is older — replace.
7567 // JE `parent.recoverIN(idx, inFromLog, logLsn, lastLoggedSize)`.
7568 let new_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin)));
7569 // Set parent back-pointer on the new node.
7570 {
7571 let mut ng = new_arc.write();
7572 if let TreeNode::Bottom(ref mut b) = *ng {
7573 b.parent = Some(Arc::downgrade(&parent_arc));
7574 }
7575 }
7576 p.set_child(slot, Some(new_arc));
7577 p.set_lsn(slot, log_lsn); // T-3
7578 InRedoResult::Replaced
7579 } else {
7580 // tree_lsn > log_lsn: tree already holds a newer version.
7581 InRedoResult::Skipped
7582 }
7583 }
7584
7585 /// Recover a non-root upper IN.
7586 ///
7587 /// JE `RecoveryManager.recoverChildIN` for a non-BIN node.
7588 fn recover_child_upper_in(
7589 &self,
7590 log_lsn: noxu_util::Lsn,
7591 upper: InNodeStub,
7592 ) -> InRedoResult {
7593 let node_id = upper.node_id;
7594 let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7595 else {
7596 return InRedoResult::NotInTree;
7597 };
7598 let mut parent = parent_arc.write();
7599 let TreeNode::Internal(ref mut p) = *parent else {
7600 return InRedoResult::NotInTree;
7601 };
7602 let tree_lsn = p.get_lsn(slot); // T-3
7603 if tree_lsn == log_lsn {
7604 InRedoResult::Skipped
7605 } else if tree_lsn < log_lsn {
7606 let new_arc = Arc::new(RwLock::new(TreeNode::Internal(upper)));
7607 {
7608 let mut ng = new_arc.write();
7609 if let TreeNode::Internal(ref mut n) = *ng {
7610 n.parent = Some(Arc::downgrade(&parent_arc));
7611 }
7612 }
7613 p.set_child(slot, Some(new_arc));
7614 p.set_lsn(slot, log_lsn); // T-3
7615 InRedoResult::Replaced
7616 } else {
7617 InRedoResult::Skipped
7618 }
7619 }
7620}
7621
/// Outcome of one `recover_in_redo` call.
///
/// JE reports the same outcomes through `RecoveryManager` debug tracing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InRedoResult {
    /// The tree had no root and the logged node was installed as the root.
    Inserted,
    /// The logged node superseded an older version already in the tree.
    Replaced,
    /// Nothing was changed: the tree's version is as new as, or newer
    /// than, the logged one.
    Skipped,
    /// No resident parent references the node (for example, the parent was
    /// logged later with a structure that no longer includes it).
    NotInTree,
    /// The `node_data` bytes could not be deserialised.
    DeserializeFailed,
}
7638
/// Global node ID counter for generating unique node IDs.
///
/// This is the SINGLE source of node-ids for the whole tree subsystem. The
/// BIN constructor (`bin.rs`) and `node.rs` route through `generate_node_id`
/// so that, after crash recovery, a freshly allocated node-id is always
/// strictly greater than every node-id present in the recovered log.
///
/// The stored value is the NEXT id to hand out; it starts at 1.
///
/// JE ref: `NodeSequence.getNextLocalNodeId` (a single per-env counter) and
/// `IN.nodeId` allocation; `NodeSequence.initRealNodeId` seeds the counter
/// from the recovered `CheckpointEnd.lastLocalNodeId`. The env seeds this
/// counter post-recovery via `seed_node_id_counter`.
static NODE_ID_COUNTER: std::sync::atomic::AtomicU64 =
    std::sync::atomic::AtomicU64::new(1);

/// Allocates and returns a unique node ID.
///
/// Returns the counter's current value and advances it by one in a single
/// atomic step, so concurrent callers never receive the same id.
pub fn generate_node_id() -> u64 {
    NODE_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
}

/// Returns the node-id that would be generated next (without allocating it).
///
/// Used by recovery seeding and by tests to assert no node-id reuse after a
/// restart.
pub fn peek_next_node_id_counter() -> u64 {
    NODE_ID_COUNTER.load(std::sync::atomic::Ordering::SeqCst)
}

/// Seeds the node-id counter so the next generated id is `> last_node_id`.
///
/// Called by `EnvironmentImpl` after recovery with the recovered
/// `use_max_node_id`, mirroring `NodeSequence.initRealNodeId` /
/// `setLastNodeId`: post-restart allocation must never reuse a node-id that
/// is already in the log.
///
/// Monotonic: the counter is raised to `last_node_id + 1` (saturating at
/// `u64::MAX`) only if it is currently below that value; it is never
/// lowered.
pub fn seed_node_id_counter(last_node_id: u64) {
    let want_next = last_node_id.saturating_add(1);
    // `fetch_max` atomically stores max(current, want_next), which is
    // exactly "bump only if below the recovered floor" without a
    // hand-written compare-exchange retry loop.
    NODE_ID_COUNTER.fetch_max(want_next, std::sync::atomic::Ordering::SeqCst);
}
7688
7689#[cfg(test)]
7690mod tests {
7691 use super::*;
7692
7693 // ====================================================================
7694 // T-3: LsnRep packed-LSN encoding (IN.entryLsnByteArray / getLsn /
7695 // setLsnInternal, IN.java:1752-1935).
7696 // ====================================================================
7697
/// A freshly created rep is the `Empty` variant: it reports zero bytes of
/// memory and every slot reads back as `NULL_LSN`.
#[test]
fn lsnrep_empty_is_zero_bytes() {
    let empty = LsnRep::new(64);
    assert!(matches!(empty, LsnRep::Empty));
    assert_eq!(empty.memory_size(), 0);
    // First and last slot of the 64-slot node.
    assert_eq!(empty.get(0), NULL_LSN);
    assert_eq!(empty.get(63), NULL_LSN);
}
7707
/// LSNs that share one file number are stored in the `Compact` rep
/// (4 bytes per slot) and read back exactly as written.
#[test]
fn lsnrep_compact_roundtrip_same_file() {
    let mut packed = LsnRep::new(8);
    for offset in 0..8u32 {
        packed.set(offset as usize, Lsn::new(7, 1000 + offset), 8);
    }
    assert!(matches!(packed, LsnRep::Compact { .. }));
    for offset in 0..8u32 {
        let expected = Lsn::new(7, 1000 + offset);
        assert_eq!(packed.get(offset as usize), expected);
    }
    // 8 slots * 4 bytes = 32 bytes, far below 8 * 8 = 64 for raw u64.
    assert_eq!(packed.memory_size(), 8 * 4);
}
7723
/// Writing `NULL_LSN` into some slots must not push the rep out of the
/// `Compact` form; NULL slots and real LSNs both round-trip.
#[test]
fn lsnrep_null_does_not_force_long() {
    let real_a = Lsn::new(3, 50);
    let real_b = Lsn::new(3, 60);

    let mut packed = LsnRep::new(4);
    packed.set(0, real_a, 4);
    packed.set(1, NULL_LSN, 4);
    packed.set(2, real_b, 4);
    packed.set(3, NULL_LSN, 4);

    assert!(
        matches!(packed, LsnRep::Compact { .. }),
        "NULL slots must NOT force the Long rep"
    );
    assert_eq!(packed.get(0), Lsn::new(3, 50));
    assert_eq!(packed.get(1), NULL_LSN);
    assert_eq!(packed.get(2), Lsn::new(3, 60));
    assert_eq!(packed.get(3), NULL_LSN);
}
7742
/// Storing an LSN with a lower file number than any seen so far keeps the
/// rep `Compact`, and previously stored slots still read back unchanged.
#[test]
fn lsnrep_rebase_on_lower_file_number() {
    let mut packed = LsnRep::new(3);
    packed.set(0, Lsn::new(10, 5), 3);
    packed.set(1, Lsn::new(12, 6), 3);
    // File 8 is lower than both files stored above.
    packed.set(2, Lsn::new(8, 7), 3);

    assert!(matches!(packed, LsnRep::Compact { .. }));
    assert_eq!(packed.get(0), Lsn::new(10, 5));
    assert_eq!(packed.get(1), Lsn::new(12, 6));
    assert_eq!(packed.get(2), Lsn::new(8, 7));
}
7757
/// Two LSNs whose file numbers are 999 apart switch the rep to `Long`;
/// both slots still round-trip.
#[test]
fn lsnrep_mutates_to_long_on_wide_file_range() {
    let near = Lsn::new(1, 5);
    let far = Lsn::new(1000, 6); // diff 999 > 127 -> Long

    let mut rep = LsnRep::new(2);
    rep.set(0, near, 2);
    rep.set(1, far, 2);

    assert!(matches!(rep, LsnRep::Long(_)));
    assert_eq!(rep.get(0), Lsn::new(1, 5));
    assert_eq!(rep.get(1), Lsn::new(1000, 6));
}
7769
/// A file offset of 0x00ff_ffff (above MAX_FILE_OFFSET, 0xfffffe) switches
/// the rep to `Long`, and the large-offset slot round-trips.
#[test]
fn lsnrep_mutates_to_long_on_large_offset() {
    let mut rep = LsnRep::new(2);
    rep.set(0, Lsn::new(1, 10), 2);
    // > MAX_FILE_OFFSET -> Long
    rep.set(1, Lsn::new(1, 0x00ff_ffff), 2);

    assert!(matches!(rep, LsnRep::Long(_)));
    assert_eq!(rep.get(1), Lsn::new(1, 0x00ff_ffff));
}
7779
/// `insert_shift` opens a slot and moves later slots up by one;
/// `remove_shift` closes it again (INArrayRep.copy).
#[test]
fn lsnrep_insert_and_remove_shift() {
    let initial = [Lsn::new(2, 1), Lsn::new(2, 2), Lsn::new(2, 3)];
    let mut rep = LsnRep::from_lsns(&initial);

    // Open a slot at index 1 and fill it.
    rep.insert_shift(1, 4);
    rep.set(1, Lsn::new(2, 99), 4);
    assert_eq!(rep.get(0), Lsn::new(2, 1));
    assert_eq!(rep.get(1), Lsn::new(2, 99));
    assert_eq!(rep.get(2), Lsn::new(2, 2));
    assert_eq!(rep.get(3), Lsn::new(2, 3));

    // Removing slot 1 restores the original layout.
    rep.remove_shift(1);
    assert_eq!(rep.get(0), Lsn::new(2, 1));
    assert_eq!(rep.get(1), Lsn::new(2, 2));
    assert_eq!(rep.get(2), Lsn::new(2, 3));
}
7801
/// A newly constructed tree is empty, reports the database id it was
/// created with, and has recorded no root splits.
#[test]
fn test_empty_tree() {
    let fresh = Tree::new(1, 128);
    assert!(fresh.is_empty());
    assert_eq!(fresh.get_database_id(), 1);
    assert_eq!(fresh.get_root_splits(), 0);
}
7809
/// REC-F2 reproduce-first: `redo_insert` must be idempotent with respect
/// to slot currency. JE `RecoveryManager.redo()` (line ~2512/2544) only
/// replaces a slot when logrecLsn > treeLsn, so replaying an OLDER
/// committed LN for a key must neither revert the data nor move the slot
/// LSN backward, while a strictly NEWER LN must still replace it.
#[test]
fn test_redo_insert_older_lsn_does_not_overwrite_newer_slot() {
    let tree = Tree::new(1, 128);
    let key = b"k".to_vec();

    let lsn_newer = Lsn::new(5, 500);
    let lsn_older = Lsn::new(2, 200);
    let lsn_newest = Lsn::new(9, 900);

    // Install the newer version at LSN X (e.g. the BIN-logged value),
    // then replay an OLDER committed LN at Y < X for the same key.
    tree.redo_insert(&key, b"new", lsn_newer).unwrap();
    tree.redo_insert(&key, b"old", lsn_older).unwrap();

    // The newer value and LSN must survive.
    let after_older = tree.search_with_data(&key).expect("key present");
    assert!(after_older.found);
    assert_eq!(
        after_older.data.as_deref(),
        Some(&b"new"[..]),
        "older-LSN redo reverted committed data"
    );
    assert_eq!(
        after_older.lsn,
        lsn_newer.as_u64(),
        "older-LSN redo reset slot LSN backward"
    );

    // A redo at a strictly NEWER LSN must still replace (replace-only
    // when log_lsn > slot_lsn, matching JE lsnCmp > 0).
    tree.redo_insert(&key, b"newest", lsn_newest).unwrap();
    let after_newest = tree.search_with_data(&key).expect("key present");
    assert_eq!(after_newest.data.as_deref(), Some(&b"newest"[..]));
    assert_eq!(after_newest.lsn, lsn_newest.as_u64());
}
7850
/// Inserting one key into an empty tree reports a new insert, makes the
/// tree non-empty, and the key can then be located by `search`.
#[test]
fn test_insert_single() {
    let tree = Tree::new(1, 128);
    let key = b"testkey".to_vec();

    let inserted =
        tree.insert(key.clone(), b"testdata".to_vec(), Lsn::new(1, 100));
    assert!(inserted.is_ok());
    assert!(inserted.unwrap()); // Should be a new insert

    assert!(!tree.is_empty());

    // The key must now be reachable through search.
    let located = tree.search(&key);
    assert!(located.is_some());
    let sr = located.unwrap();
    assert!(sr.exact_parent_found || !sr.child_not_resident);
}
7870
/// Inserting several distinct keys reports each as a new insert, and
/// every key is afterwards found by `search`.
#[test]
fn test_insert_multiple() {
    let tree = Tree::new(1, 128);

    let fruit_keys: Vec<Vec<u8>> = vec![
        b"apple".to_vec(),
        b"banana".to_vec(),
        b"cherry".to_vec(),
        b"date".to_vec(),
    ];

    for (n, fruit) in fruit_keys.iter().enumerate() {
        let payload = format!("data{}", n).into_bytes();
        let lsn = Lsn::new(1, 100 + (n as u32) * 10);
        let outcome = tree.insert(fruit.clone(), payload, lsn);
        assert!(outcome.is_ok());
        assert!(outcome.unwrap()); // All should be new inserts
    }

    // Every inserted key must be searchable.
    for fruit in &fruit_keys {
        assert!(tree.search(fruit).is_some());
    }
}
7896
/// Inserting the same key twice: the first call reports a new insert
/// (`true`), the second reports an update (`false`).
#[test]
fn test_insert_duplicate_key() {
    let tree = Tree::new(1, 128);
    let key = b"duplicate".to_vec();

    // First insert creates the slot.
    let first = tree.insert(key.clone(), b"first".to_vec(), Lsn::new(1, 100));
    assert!(first.is_ok());
    assert!(first.unwrap()); // New insert

    // Same key again: treated as an update.
    let second = tree.insert(key, b"second".to_vec(), Lsn::new(1, 200));
    assert!(second.is_ok());
    assert!(!second.unwrap()); // Update, not new insert
}
7916
/// Searching an empty tree yields `None`.
#[test]
fn test_search_empty_tree() {
    let tree = Tree::new(1, 128);
    let missing = b"noexist".to_vec();

    assert!(tree.search(&missing).is_none());
}
7925
/// `get_first_node` / `get_last_node` return `None` on an empty tree and,
/// after three inserts, report slot indexes 0 and 2 respectively.
#[test]
fn test_first_and_last_node() {
    let tree = Tree::new(1, 128);

    // Nothing to return while the tree is empty.
    assert!(tree.get_first_node().is_none());
    assert!(tree.get_last_node().is_none());

    // Populate with three keys.
    let keys = [b"a".to_vec(), b"b".to_vec(), b"c".to_vec()];
    for (n, key) in keys.iter().enumerate() {
        let payload = format!("data{}", n).into_bytes();
        let lsn = Lsn::new(1, 100 + (n as u32) * 10);
        tree.insert(key.clone(), payload, lsn).unwrap();
    }

    // Both ends must now be reported.
    let first = tree.get_first_node();
    assert!(first.is_some());
    assert_eq!(first.unwrap().index, 0);

    let last = tree.get_last_node();
    assert!(last.is_some());
    assert_eq!(last.unwrap().index, 2);
}
7951
/// Consecutive calls to `generate_node_id` return strictly increasing ids.
#[test]
fn test_node_id_generation() {
    let first = generate_node_id();
    let second = generate_node_id();
    let third = generate_node_id();

    assert!(second > first);
    assert!(third > second);
}
7961
/// `is_bin()` and `level()` report the variant and level of both a
/// `Bottom` and an `Internal` tree node.
#[test]
fn test_tree_node_is_bin() {
    let bottom_stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let bottom = TreeNode::Bottom(bottom_stub);
    assert!(bottom.is_bin());
    assert_eq!(bottom.level(), BIN_LEVEL);

    let upper_stub = InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL + 2,
        entries: vec![],
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let upper = TreeNode::Internal(upper_stub);
    assert!(!upper.is_bin());
    assert_eq!(upper.level(), MAIN_LEVEL + 2);
}
7998
/// `find_entry` on a BIN holding `key0`..`key4`: an exact hit reports the
/// slot with the `EXACT_MATCH` bit set; a non-exact probe for a missing key
/// reports a slot without that bit.
#[test]
fn test_find_entry() {
    // Five slots with empty payloads, keyed "key0" .. "key4".
    let keys: Vec<Vec<u8>> = (0..5)
        .map(|i| format!("key{}", i).into_bytes())
        .collect();
    let entries: Vec<BinEntry> = (0..5)
        .map(|_| BinEntry {
            data: Some(vec![]),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        })
        .collect();

    let bin = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(keys),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });

    // Present key: slot 2, flagged as an exact match.
    let hit = bin.find_entry(b"key2", false, true);
    assert_ne!(hit & EXACT_MATCH, 0);
    assert_eq!(hit & 0xFFFF, 2);

    // Absent key, exact=false: "key15" sorts between key1 and key2; the
    // reported slot is 2 and the exact-match bit is clear.
    let miss = bin.find_entry(b"key15", false, false);
    assert_eq!(miss & EXACT_MATCH, 0);
    assert_eq!(miss & 0xFFFF, 2);
}
8042
/// Inserting one key more than `max_entries_per_node` must succeed (the
/// tree is expected to split rather than return an error), and the extra
/// key must be found by an exact search afterwards.
#[test]
fn test_insert_until_full() {
    // Capacity of 3 so the fourth insert goes past the node limit.
    let tree = Tree::new(1, 3);

    // Fill up to the limit: key0..key2.
    for i in 0u32..3 {
        let outcome = tree.insert(
            format!("key{}", i).into_bytes(),
            format!("data{}", i).into_bytes(),
            Lsn::new(1, 100 + i),
        );
        assert!(outcome.is_ok(), "insert {} should succeed", i);
    }

    // One past the limit.
    let overflow_key = b"key3".to_vec();
    let outcome =
        tree.insert(overflow_key.clone(), b"data3".to_vec(), Lsn::new(1, 103));
    assert!(
        outcome.is_ok(),
        "insert after full should trigger split and succeed"
    );
    assert!(outcome.unwrap(), "should be a new insert");

    // The overflow key must be reachable with an exact hit.
    let found = tree.search(&overflow_key);
    assert!(found.is_some(), "key3 must be searchable after split");
    assert!(found.unwrap().exact_parent_found, "key3 must be found exactly");
}
8074
/// F8 regression: the memory counter charged by an insert
/// (`key + data + BIN_ENTRY_OVERHEAD`) must be fully refunded by deleting
/// the same record, bringing the counter back to zero.
///
/// Per the original note, delete used to omit the data length, so the
/// counter leaked `data_len` per delete.
#[test]
fn test_memory_counter_balanced_on_insert_delete_f8() {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicI64, Ordering};

    let counter = Arc::new(AtomicI64::new(0));
    let mut tree = Tree::new(1, 16);
    tree.set_memory_counter(Arc::clone(&counter));

    let key = b"a-key".to_vec();
    let data = vec![0u8; 200]; // non-trivial data length
    let expected_charge = (key.len() + data.len() + BIN_ENTRY_OVERHEAD) as i64;

    tree.insert(key.clone(), data, Lsn::new(0, 10)).unwrap();
    let after_insert = counter.load(Ordering::Relaxed);
    assert!(after_insert > 0, "insert must increase the counter");
    assert_eq!(
        after_insert, expected_charge,
        "insert accounts key + data + per-slot BinEntry overhead"
    );

    assert!(tree.delete(&key));
    let after_delete = counter.load(Ordering::Relaxed);
    assert_eq!(
        after_delete, 0,
        "F8: delete must subtract key + data + BIN_ENTRY_OVERHEAD, returning the counter to its pre-insert value (no data_len leak)"
    );
}
8106
/// EV-13 (pass-post): detaching a node must really release the parent's
/// strong `Arc` to the child, not just report freed bytes.
///
/// Per the original note, the evictor used to credit the node size and
/// unlink the node from the LRU list while the parent slot kept a strong
/// `Arc`, so the node was never freed ("phantom free") and the budget was
/// over-credited.
///
/// Checked here, after `detach_node_by_id`:
/// 1. the returned byte count equals the child's `budgeted_memory_size()`
///    measured beforehand;
/// 2. the parent slot reports no child but still has a non-null LSN;
/// 3. the test's own handle is the only remaining strong reference
///    (`strong_count == 1`).
///
/// JE ref: `IN.detachNode` (`setTarget(idx, null)`) / `Evictor.evict`.
#[test]
fn test_ev13_detach_actually_frees_child() {
    // Small fanout plus 12 inserts so the root is an IN with BIN children.
    let tree = Tree::new(7, 4);
    for i in 0u8..12 {
        let lsn = Lsn::new(1, u32::from(i) + 1);
        tree.insert(vec![b'a' + i], vec![i; 8], lsn).unwrap();
    }

    // Locate the first resident child of the root: its slot, id and size.
    let root = tree.get_root().expect("tree must have a root");
    let parent_arc = Arc::clone(&root);
    let (child_idx, bin_id, expected_bytes) = {
        let root_guard = root.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("root must be an IN after split");
        };
        let (slot, resident) = root_in
            .first_resident_child()
            .expect("root must have a resident child");
        let resident_guard = resident.read();
        let id = match &*resident_guard {
            TreeNode::Bottom(bin) => bin.node_id,
            TreeNode::Internal(inner) => inner.node_id,
        };
        let bytes = resident_guard.budgeted_memory_size();
        (slot, id, bytes)
    };

    // Take our own strong handle on the child so the parent's release can
    // be observed through the reference count.
    let child_arc = {
        let parent_guard = parent_arc.read();
        let TreeNode::Internal(parent_in) = &*parent_guard else {
            unreachable!()
        };
        Arc::clone(parent_in.child_ref(child_idx).unwrap())
    };
    // Holders at this point: the parent slot and this test.
    assert_eq!(
        Arc::strong_count(&child_arc),
        2,
        "precondition: parent slot + test handle hold the child"
    );

    let freed = tree.detach_node_by_id(bin_id);

    // (1) Credited bytes match the measured size.
    assert_eq!(
        freed, expected_bytes,
        "detach must credit the node's real measured heap size"
    );

    // (2) The slot is emptied but keeps its LSN.
    {
        let parent_guard = parent_arc.read();
        let TreeNode::Internal(parent_in) = &*parent_guard else {
            unreachable!()
        };
        assert!(
            parent_in.child_is_none(child_idx),
            "EV-13: parent slot must be detached (child == None)"
        );
        assert!(
            !parent_in.get_lsn(child_idx).is_null(),
            "detach keeps the slot LSN so the node can be re-fetched"
        );
    }

    // (3) Only the test handle remains; a count of 2 here would mean the
    //     parent still holds the node.
    assert_eq!(
        Arc::strong_count(&child_arc),
        1,
        "EV-13: detach must drop the parent's strong Arc (no phantom free)"
    );
}
8200
/// EV-13: `detach_node_by_id` must leave the tree's memory counter
/// unchanged. Per the original note, the evictor owns that bookkeeping
/// (`Arbiter::release_memory`), so a decrement here would be a double
/// credit.
#[test]
fn test_ev13_detach_does_not_touch_counter() {
    use std::sync::atomic::{AtomicI64, Ordering};

    let counter = Arc::new(AtomicI64::new(0));
    let mut tree = Tree::new(8, 4);
    tree.set_memory_counter(Arc::clone(&counter));

    for i in 0u8..12 {
        let lsn = Lsn::new(1, u32::from(i) + 1);
        tree.insert(vec![b'a' + i], vec![i; 8], lsn).unwrap();
    }
    let before = counter.load(Ordering::Relaxed);

    // Id of the first resident child of the root.
    let root = tree.get_root().unwrap();
    let bin_id = {
        let root_guard = root.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            unreachable!()
        };
        let resident = root_in
            .resident_children()
            .into_iter()
            .next()
            .expect("resident child");
        let resident_guard = resident.read();
        match &*resident_guard {
            TreeNode::Bottom(bin) => bin.node_id,
            TreeNode::Internal(inner) => inner.node_id,
        }
    };

    let freed = tree.detach_node_by_id(bin_id);
    assert!(freed > 0, "detach must free a resident child");

    let after = counter.load(Ordering::Relaxed);
    assert_eq!(
        after, before,
        "EV-13: detach must not change the counter (evictor credits once)"
    );
}
8244
/// EV-13: `detach_node_by_id` returns 0 both for the root's own id and for
/// an id (`u64::MAX`) that was never handed out by this test.
#[test]
fn test_ev13_detach_root_or_missing_is_noop() {
    let tree = Tree::new(9, 4);
    for offset in 0u8..12 {
        let key = vec![b'a' + offset];
        let data = vec![offset; 8];
        tree.insert(key, data, Lsn::new(1, u32::from(offset) + 1))
            .unwrap();
    }

    // Read the root's node id, whichever variant the root is.
    let root_id = {
        let root = tree.get_root().unwrap();
        let guard = root.read();
        match &*guard {
            TreeNode::Bottom(bin) => bin.node_id,
            TreeNode::Internal(inner) => inner.node_id,
        }
    };

    let freed_for_root = tree.detach_node_by_id(root_id);
    assert_eq!(
        freed_for_root, 0,
        "root has no parent IN -> detach is a no-op"
    );

    let freed_for_unknown = tree.detach_node_by_id(u64::MAX);
    assert_eq!(
        freed_for_unknown, 0,
        "unknown node id -> detach is a no-op"
    );
}
8276
/// DBI-23 (pass-post): the incrementally maintained `memory_counter` must
/// stay close to the walk-and-sum figure from `total_budgeted_memory`.
///
/// Background (from the original note): JE keeps `inMemorySize`
/// (`IN.getBudgetedMemorySize`) in lock-step with `computeMemorySize` so
/// the over-budget arbiter sees the real figure; the earlier Noxu live path
/// undercounted per-slot cost and ignored fixed node overhead, so the
/// evictor under-fired.
///
/// Two checks are made after 400 inserts of a 12-byte key and 64-byte
/// value:
/// - floor: the counter is at least the sum of
///   `key + data + BIN_ENTRY_OVERHEAD` over all inserted records;
/// - tolerance: the counter lies within `[80% of real, real]`, where
///   `real` is `total_budgeted_memory()`. The original note attributes the
///   remaining gap to per-node fixed struct overhead that is not tracked
///   incrementally.
#[test]
fn test_dbi23_live_counter_approximates_real_heap() {
    use std::sync::atomic::{AtomicI64, Ordering};

    let counter = Arc::new(AtomicI64::new(0));
    let mut tree = Tree::new(42, 32);
    tree.set_memory_counter(Arc::clone(&counter));

    // Insert the records and accumulate the per-slot floor as we go. The
    // floor deliberately uses BIN_ENTRY_OVERHEAD rather than a hard-coded
    // slot size.
    let n = 400u32;
    let mut new_lower_bound: u64 = 0;
    for i in 0..n {
        let key = format!("key-{i:08}").into_bytes(); // 12 bytes
        let data = vec![0u8; 64]; // 64 bytes
        new_lower_bound += (key.len() + data.len() + BIN_ENTRY_OVERHEAD) as u64;
        tree.insert(key, data, Lsn::new(1, i + 1)).unwrap();
    }

    let live = counter.load(Ordering::Relaxed) as u64;
    let real = tree.total_budgeted_memory();

    assert!(
        live >= new_lower_bound,
        "DBI-23: live counter ({live}) must be >= the per-slot-correct \
         lower bound ({new_lower_bound})"
    );

    // Tolerance band relative to the authoritative figure.
    let lower = real * 80 / 100;
    assert!(
        (lower..=real).contains(&live),
        "DBI-23: live counter ({live}) must approximate real heap ({real}) \
         within tolerance [{lower}, {real}]"
    );
}
8339
/// Deleting a present key reports `true`; deleting it again reports
/// `false` because the key is already gone.
#[test]
fn test_delete_existing_key() {
    let tree = Tree::new(1, 128);
    let key = b"remove_me".to_vec();
    tree.insert(key.clone(), b"val".to_vec(), Lsn::new(1, 10))
        .unwrap();

    let first_attempt = tree.delete(&key);
    assert!(first_attempt);

    // Nothing is left to remove the second time round.
    let second_attempt = tree.delete(&key);
    assert!(!second_attempt);
}
8351
/// Deleting a key that was never inserted reports `false`, even when the
/// tree holds other keys.
#[test]
fn test_delete_nonexistent_key() {
    let tree = Tree::new(1, 128);
    tree.insert(b"a".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let removed = tree.delete(b"zzz");
    assert!(!removed);
}
8359
/// Deleting from a tree that has had no inserts reports `false`.
#[test]
fn test_delete_empty_tree() {
    let empty = Tree::new(1, 128);
    let removed = empty.delete(b"nothing");
    assert!(!removed);
}
8365
/// After every inserted key is deleted, the tree is still not "empty" in
/// the `is_empty()` sense, but `get_first_node()` finds no entry.
#[test]
fn test_delete_all_entries_makes_bin_empty() {
    let tree = Tree::new(1, 128);
    let records: [(&[u8], &[u8], u32); 2] = [(b"x", b"1", 1), (b"y", b"2", 2)];

    for (key, value, offset) in records.iter() {
        tree.insert(key.to_vec(), value.to_vec(), Lsn::new(1, *offset))
            .unwrap();
    }
    for (key, _, _) in records.iter() {
        assert!(tree.delete(key));
    }

    // A root remains (per the original note: an empty BIN), so the tree
    // does not count as empty.
    assert!(!tree.is_empty());
    // With no entries left there is no first node to return.
    assert!(tree.get_first_node().is_none());
}
8380
/// `get_root()` is `None` on a new tree and `Some` once `set_root` has
/// installed a node.
#[test]
fn test_set_root_and_get_root() {
    let tree = Tree::new(1, 128);
    assert!(tree.get_root().is_none());

    // An empty BIN stub with a freshly generated id serves as the root.
    let root_stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    tree.set_root(TreeNode::Bottom(root_stub));

    assert!(tree.get_root().is_some());
}
8407
8408 // ========================================================================
8409 // Split / multi-level insert tests (new)
8410 // ========================================================================
8411
/// Inserting 20 sorted keys into a tree with `max_entries_per_node = 4`
/// must split the root at least once and leave the root above level 2.
///
/// Sizing rationale from the original note: with fanout 4 each BIN holds 4
/// entries and a level-2 root IN holds up to 4 BIN children, so 16 entries
/// fill it and a 17th forces the root IN to split. The test inserts 20
/// keys, comfortably past that point.
#[test]
fn test_insert_forces_root_split() {
    let tree = Tree::new(1, 4);

    // 20 inserts: more than the 16 entries a full level-2 root can hold.
    for i in 0u32..20 {
        let outcome = tree.insert(
            format!("key{:04}", i).into_bytes(),
            format!("data{}", i).into_bytes(),
            Lsn::new(1, 100 + i),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", i);
    }

    // The split counter must have moved.
    let root_splits = tree.get_root_splits();
    assert!(
        root_splits > 0,
        "expected at least one root split after 20 inserts with fanout 4"
    );

    // The root must now sit above level 2 (three or more levels).
    let root_arc = tree.get_root().unwrap();
    let root_level = root_arc.read().level();
    let level_2 = MAIN_LEVEL | 2;
    assert!(
        root_level > level_2,
        "root level {} must be > level-2 after root split",
        root_level
    );
}
8449
/// Bulk insert of 1000 keys in ascending order (fanout 8); afterwards every
/// key must be found by an exact search.
#[test]
fn test_insert_many_keys() {
    let tree = Tree::new(1, 8);
    let n = 1000u32;
    let key_for = |i: u32| format!("key{:08}", i).into_bytes();

    for i in 0..n {
        let outcome = tree.insert(
            key_for(i),
            format!("data{}", i).into_bytes(),
            Lsn::new(1, i),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", i);
    }

    // Every inserted key must come back as an exact hit.
    for i in 0..n {
        let probe = key_for(i);
        let found = matches!(tree.search(&probe), Some(hit) if hit.exact_parent_found);
        assert!(found, "key{:08} must be found after bulk insert", i);
    }
}
8475
/// Insert 500 keys in descending order (a simple stand-in for a non-sorted
/// sequence) and verify every key is found by an exact search.
#[test]
fn test_insert_random_keys() {
    let tree = Tree::new(1, 8);
    let n = 500u32;
    let key_for = |i: u32| format!("rkey{:08}", i).into_bytes();

    // Highest key first, lowest key last.
    let mut i = n;
    while i > 0 {
        i -= 1;
        let outcome = tree.insert(
            key_for(i),
            format!("data{}", i).into_bytes(),
            Lsn::new(1, i),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", i);
    }

    for i in 0..n {
        let probe = key_for(i);
        let found = matches!(tree.search(&probe), Some(hit) if hit.exact_parent_found);
        assert!(found, "rkey{:08} must be found", i);
    }
}
8502
/// With a fanout of 3 and 60 inserts, every inserted key must still be
/// found by an exact search once all inserts have completed.
#[test]
fn test_split_preserves_all_keys() {
    // Fanout of 3: the smallest node size used in these tests, chosen (per
    // the original note) to make splits as frequent as possible.
    let tree = Tree::new(1, 3);
    let n = 60u32;

    // Generate the key set up front, then insert in the same order.
    let keys: Vec<Vec<u8>> = (0..n)
        .map(|i| format!("sk{:04}", i).into_bytes())
        .collect();
    for (i, key) in (0u32..).zip(keys.iter()) {
        let outcome = tree.insert(
            key.clone(),
            format!("d{}", i).into_bytes(),
            Lsn::new(1, i),
        );
        assert!(outcome.is_ok(), "insert {} must not fail", i);
    }

    // No key may have gone missing along the way.
    for key in &keys {
        let survived = matches!(tree.search(key), Some(hit) if hit.exact_parent_found);
        assert!(
            survived,
            "key {:?} must survive all splits",
            std::str::from_utf8(key).unwrap_or("?")
        );
    }
}
8532
/// After 40 inserts with fanout 4 the root must have split at least once
/// and the root's level must exceed level 2.
#[test]
fn test_tree_height_grows() {
    let tree = Tree::new(1, 4);

    // Per the original sizing note, a level-2 root IN holds 4 children at
    // fanout 4, so 40 keys are enough to force a root split.
    let n = 40u32;
    (0..n).for_each(|i| {
        let key = format!("hk{:08}", i).into_bytes();
        let data = format!("d{}", i).into_bytes();
        tree.insert(key, data, Lsn::new(1, i)).unwrap();
    });

    let root_splits = tree.get_root_splits();
    assert!(
        root_splits > 0,
        "expected root to have split at least once for {} keys with fanout 4",
        n
    );

    // The tree must have grown past two levels.
    let level_2 = MAIN_LEVEL | 2;
    let root_arc = tree.get_root().unwrap();
    let root_level = root_arc.read().level();
    assert!(
        root_level > level_2,
        "root level {} must be > {} after enough inserts",
        root_level,
        level_2
    );
}
8567
/// Exact-mode `find_entry` on an Internal node with keys `k0`..`k3`: a
/// present key reports its slot with `EXACT_MATCH` set; an absent key
/// reports -1.
#[test]
fn test_find_entry_on_internal_node() {
    let entries: Vec<InEntry> = (0..4)
        .map(|i| InEntry {
            key: format!("k{}", i).into_bytes(),
        })
        .collect();
    let internal = TreeNode::Internal(InNodeStub {
        node_id: 1,
        level: MAIN_LEVEL + 2,
        entries,
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    });

    // Present key: slot 2 with the exact-match bit.
    let hit = internal.find_entry(b"k2", false, true);
    assert_ne!(hit & EXACT_MATCH, 0);
    assert_eq!(hit & 0xFFFF, 2);

    // Absent key in exact mode: the whole result is -1.
    let miss = internal.find_entry(b"kx", false, true);
    assert_eq!(miss, -1);
}
8594
// St-H5: in non-exact mode, `find_entry` on an Internal node reports the
// FLOOR slot (the largest entry <= key) rather than the insertion point.
// The node holds k0, k1, k2, k3, with slot 0 being the leftmost child.
#[test]
fn test_find_entry_internal_nonexact_returns_floor() {
    let entries: Vec<InEntry> = (0..4)
        .map(|i| InEntry {
            key: format!("k{}", i).into_bytes(),
        })
        .collect();
    let internal = TreeNode::Internal(InNodeStub {
        node_id: 1,
        level: MAIN_LEVEL + 2,
        entries,
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    });

    // Below every separator: slot 0, the leftmost child.
    let below_all = internal.find_entry(b"a", false, false);
    assert_eq!(below_all & 0xFFFF, 0);

    // Between k1 and k2: floors to k1, slot 1.
    let between = internal.find_entry(b"k1x", false, false);
    assert_eq!(between & 0xFFFF, 1);

    // Above every separator: the last slot (k3, slot 3).
    let above_all = internal.find_entry(b"zzz", false, false);
    assert_eq!(above_all & 0xFFFF, 3);

    // An exact hit is still flagged as exact and reports its own slot.
    let exact = internal.find_entry(b"k2", false, false);
    assert_ne!(exact & EXACT_MATCH, 0);
    assert_eq!(exact & 0xFFFF, 2);
}
8626
8627 // ========================================================================
8628 // New tests: dirty tracking, generation, parent pointers, log size, stats
8629 // ========================================================================
8630
/// An insert must leave the BIN (child 0 of the root IN) flagged dirty.
///
/// JE reference (per the original note): `Tree.insertLN()` calls
/// `bin.setDirty(true)` after each insert.
#[test]
fn test_insert_marks_bin_dirty() {
    let tree = Tree::new(1, 128);
    tree.insert(b"key1".to_vec(), b"val1".to_vec(), Lsn::new(1, 1))
        .unwrap();

    // The root is an upper IN; the BIN hangs off its slot 0.
    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*guard else {
            panic!("expected Internal root");
        };
        root_in.get_child(0).unwrap()
    };

    let bin_dirty = bin_arc.read().is_dirty();
    assert!(bin_dirty, "BIN must be dirty after insert");
}
8653
/// Writing a second value under an existing key (an update) leaves the BIN
/// flagged dirty.
#[test]
fn test_update_keeps_bin_dirty() {
    let tree = Tree::new(1, 128);

    // First write creates the key, second write updates it.
    for (value, offset) in [(b"v1", 1u32), (b"v2", 2u32)] {
        tree.insert(b"k".to_vec(), value.to_vec(), Lsn::new(1, offset))
            .unwrap();
    }

    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*guard else {
            panic!("expected Internal root");
        };
        root_in.get_child(0).unwrap()
    };

    let bin_dirty = bin_arc.read().is_dirty();
    assert!(bin_dirty, "BIN must be dirty after update");
}
8673
/// A delete must set the BIN's dirty flag. The flag is cleared by hand
/// first so the test proves the delete itself sets it again.
#[test]
fn test_delete_marks_bin_dirty() {
    let tree = Tree::new(1, 128);
    tree.insert(b"del".to_vec(), b"val".to_vec(), Lsn::new(1, 1))
        .unwrap();

    // Fetches child 0 of the current root, which must be an Internal node.
    let first_bin = || {
        let root_arc = tree.get_root().unwrap();
        let guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*guard else {
            panic!("expected Internal root");
        };
        root_in.get_child(0).unwrap()
    };

    // Reset the flag the insert left behind.
    {
        let bin_arc = first_bin();
        bin_arc.write().set_dirty(false);
        assert!(!bin_arc.read().is_dirty());
    }

    tree.delete(b"del");

    // Look the BIN up again and check the flag came back.
    let bin_arc = first_bin();
    let bin_dirty = bin_arc.read().is_dirty();
    assert!(bin_dirty, "BIN must be dirty after delete");
}
8706
/// After the first insert, the BIN's weak parent pointer must upgrade to
/// the very same `Arc` the tree reports as its root.
#[test]
fn test_bin_parent_pointer_set_on_initial_insert() {
    let tree = Tree::new(1, 128);
    tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*guard else {
            panic!("expected Internal root");
        };
        root_in.get_child(0).unwrap()
    };

    let parent_weak = bin_arc.read().get_parent();
    assert!(parent_weak.is_some(), "BIN must have a parent pointer");

    // The weak pointer must resolve to the root itself, not a look-alike.
    let parent_arc = parent_weak.unwrap().upgrade().unwrap();
    let same_node = Arc::ptr_eq(&parent_arc, &root_arc);
    assert!(same_node, "BIN parent must be the root IN");
}
8732
/// `set_dirty` followed by `is_dirty` returns what was stored, on both the
/// `Bottom` and the `Internal` variant.
#[test]
fn test_dirty_flag_roundtrip() {
    let bin_stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let mut bin_node = TreeNode::Bottom(bin_stub);

    // Starts clean, then set -> clear.
    assert!(!bin_node.is_dirty());
    for flag in [true, false] {
        bin_node.set_dirty(flag);
        assert_eq!(bin_node.is_dirty(), flag);
    }

    let in_stub = InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        entries: vec![],
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let mut in_node = TreeNode::Internal(in_stub);

    // Starts clean, then set.
    assert!(!in_node.is_dirty());
    in_node.set_dirty(true);
    assert!(in_node.is_dirty());
}
8774
/// `set_generation` followed by `get_generation` returns what was stored,
/// on both the `Bottom` and the `Internal` variant.
#[test]
fn test_generation_roundtrip() {
    let bin_stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let mut bin_node = TreeNode::Bottom(bin_stub);

    // The constructed value is visible before any setter call.
    assert_eq!(bin_node.get_generation(), 0);
    bin_node.set_generation(42);
    assert_eq!(bin_node.get_generation(), 42);

    let in_stub = InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        entries: vec![],
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let mut in_node = TreeNode::Internal(in_stub);

    in_node.set_generation(99);
    assert_eq!(in_node.get_generation(), 99);
}
8813
/// For both node variants, `log_size()` must equal the length of the
/// buffer produced by `write_to_bytes()`.
#[test]
fn test_log_size_matches_bytes_len() {
    // BIN with two slots: one carrying data, one without.
    let slot_with_data = BinEntry {
        data: Some(b"d1".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let slot_without_data = BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let bin_node = TreeNode::Bottom(BinStub {
        node_id: 7,
        level: BIN_LEVEL,
        entries: vec![slot_with_data, slot_without_data],
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 5,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });
    let bin_bytes = bin_node.write_to_bytes();
    assert_eq!(bin_node.log_size(), bin_bytes.len());

    // IN with two slots: an empty key and a short key.
    let in_node = TreeNode::Internal(InNodeStub {
        node_id: 8,
        level: MAIN_LEVEL | 2,
        entries: vec![
            InEntry { key: vec![] },
            InEntry {
                key: b"mid".to_vec(),
            },
        ],
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    });
    let in_bytes = in_node.write_to_bytes();
    assert_eq!(in_node.log_size(), in_bytes.len());
}
8867
/// The serialized form starts with the node id as 8 big-endian bytes and
/// carries the dirty flag at byte offset 16.
#[test]
fn test_write_to_bytes_encodes_node_id_and_dirty() {
    let stub = BinStub {
        node_id: 0xDEAD_BEEF_0000_0001,
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let bytes = TreeNode::Bottom(stub).write_to_bytes();

    // Bytes 0..8: node id, big-endian.
    let expected_id = 0xDEAD_BEEF_0000_0001u64.to_be_bytes();
    assert_eq!(&bytes[0..8], expected_id);

    // Offset 16 follows node_id (8 bytes), level (4) and n_entries (4) and
    // holds the dirty flag.
    assert_eq!(bytes[16], 1u8, "dirty flag must be 1");
}
8896
/// A BIN with one keyed slot must report a larger `log_size()` than an
/// otherwise identical BIN with no slots.
#[test]
fn test_log_size_grows_with_entries() {
    // Baseline: no slots, no keys.
    let empty = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });

    // Same shape plus one data-less slot keyed "longkey_here".
    let single_slot = BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let with_entry = TreeNode::Bottom(BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries: vec![single_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"longkey_here".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });

    let populated_size = with_entry.log_size();
    let baseline_size = empty.log_size();
    assert!(
        populated_size > baseline_size,
        "log_size must grow when entries are added"
    );
}
8946
/// `Tree::propagate_dirty_to_root`, called on a BIN, must mark the BIN's
/// parent (here the root IN) dirty.
#[test]
fn test_propagate_dirty_to_root() {
    // Hand-built two-level tree: root IN -> BIN. The BIN's parent link is
    // wired after the root exists.
    let bin_stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin_stub)));

    let root_stub = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&bin_arc))]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(root_stub)));

    // Child -> parent back-link (weak).
    let back_link = Arc::downgrade(&root_arc);
    bin_arc.write().set_parent(Some(back_link));

    // Precondition: the root starts clean.
    assert!(!root_arc.read().is_dirty());

    Tree::propagate_dirty_to_root(&bin_arc);

    let root_dirty = root_arc.read().is_dirty();
    assert!(
        root_dirty,
        "root must be dirty after propagate_dirty_to_root"
    );
}
8996
/// A freshly constructed tree reports statistics equal to
/// `TreeStats::default()`.
#[test]
fn test_collect_stats_empty_tree() {
    let observed = Tree::new(1, 128).collect_stats();
    assert_eq!(observed, TreeStats::default());
}
9004
/// After exactly one insert the statistics must show one BIN, one upper IN,
/// a height of 2 and at least one entry.
#[test]
fn test_collect_stats_single_insert() {
    let tree = Tree::new(1, 128);
    let (key, value) = (b"k".to_vec(), b"v".to_vec());
    tree.insert(key, value, Lsn::new(1, 1)).unwrap();

    let snapshot = tree.collect_stats();
    assert_eq!(snapshot.n_bins, 1, "must have 1 BIN");
    assert_eq!(snapshot.n_ins, 1, "must have 1 upper IN");
    assert_eq!(snapshot.height, 2, "single-entry tree has height 2");
    assert!(snapshot.n_entries >= 1, "must have at least 1 entry total");
}
9016
/// `collect_stats()` after many inserts: the tree is non-trivial and the
/// reported entry total accounts for every inserted record.
///
/// The assertion on `n_entries` is a lower bound, not an equality: the
/// total may include slots beyond the `n` inserted records (the original
/// note on this test says upper-IN slots are counted too — confirm against
/// `collect_stats`).
#[test]
fn test_collect_stats_many_inserts() {
    let tree = Tree::new(1, 8);
    let n = 50u32;
    for i in 0..n {
        let key = format!("sk{:04}", i).into_bytes();
        tree.insert(key, b"v".to_vec(), Lsn::new(1, i)).unwrap();
    }
    let stats = tree.collect_stats();

    assert!(stats.n_bins > 0, "must have at least one BIN");
    assert!(stats.height >= 2, "multi-entry tree has height >= 2");

    // The previous version also computed `n_entries - n_ins` as a "rough"
    // BIN-entry count and OR-ed it into this check. That subtraction is an
    // unchecked unsigned difference, and whenever it is >= n the plain total
    // is >= n as well, so the disjunction added nothing. Assert the total
    // directly.
    assert!(
        stats.n_entries >= u64::from(n),
        "entry count must account for all inserts"
    );
}
9041
9042 // ========================================================================
9043 // Tests: B-tree merge / compress
9044 // ========================================================================
9045
/// After most keys are deleted, `compress()` must leave the tree with fewer
/// BINs than it had immediately before the call.
///
/// Strategy: insert 64 sorted keys, delete all but four widely spaced ones,
/// run `compress()`, then check that `n_bins` dropped and that each surviving
/// key is still found. Exact BIN counts are deliberately not hard-coded
/// because they depend on where the splits landed.
#[test]
fn test_compress_merges_underfull_bins() {
    let tree = Tree::new(1, 8);
    let n = 64u32;
    let key_for = |i: u32| format!("cm{:04}", i).into_bytes();

    // Populate with sorted keys so that several BINs exist.
    for i in 0..n {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    let bins_full = tree.collect_stats().n_bins;
    assert!(bins_full >= 2, "must have multiple BINs after 64 inserts");

    // Keep every 16th key (cm0000, cm0016, cm0032, cm0048); delete the rest.
    let survivors = [0u32, 16, 32, 48];
    for i in (0..n).filter(|i| !survivors.contains(i)) {
        tree.delete(&key_for(i));
    }

    let bins_sparse = tree.collect_stats().n_bins;
    assert!(
        bins_sparse >= 2,
        "should still have multiple BINs before compress"
    );

    tree.compress();

    let bins_after = tree.collect_stats().n_bins;
    assert!(
        bins_after < bins_sparse,
        "compress must reduce BIN count (was {}, now {})",
        bins_sparse,
        bins_after
    );

    // None of the kept keys may have been lost by the merge.
    for i in survivors.iter().copied() {
        let found = tree
            .search(&key_for(i))
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "key cm{:04} must survive compress", i);
    }
}
9111
/// `compress()` on a tree with no deletions must not lose any key and must
/// not increase the BIN count.
///
/// 32 keys are inserted with a node capacity of 8, so splits are expected to
/// have happened; the test only checks the two invariants above rather than
/// a specific layout.
#[test]
fn test_compress_no_op_when_full() {
    let tree = Tree::new(1, 8);
    let n = 32u32;
    let key_for = |i: u32| format!("fn{:04}", i).into_bytes();

    for i in 0..n {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    let bins_before = tree.collect_stats().n_bins;
    tree.compress();
    let bins_after = tree.collect_stats().n_bins;

    // Every inserted key must still resolve to an exact match.
    for i in 0..n {
        let found = tree
            .search(&key_for(i))
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "key fn{:04} must be findable after compress", i);
    }

    assert!(
        bins_after <= bins_before,
        "compress must not increase BIN count"
    );
}
9148
/// Smoke test: calling `compress()` on a tree with no inserts must return
/// without panicking.
#[test]
fn test_compress_empty_tree() {
    let empty_tree = Tree::new(1, 4);
    // Reaching the end of the test is the assertion.
    empty_tree.compress();
}
9155
/// After every key except the last one is deleted, `compress()` must leave
/// fewer BINs than the fully populated tree had, and the one remaining key
/// must still be found.
#[test]
fn test_compress_removes_empty_bin_from_parent() {
    let tree = Tree::new(1, 4);
    let n = 16u32;
    let key_for = |i: u32| format!("ep{:04}", i).into_bytes();

    // 16 keys with a capacity of 4 is expected to yield several BINs.
    for i in 0..n {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    let bins_before = tree.collect_stats().n_bins;
    assert!(bins_before >= 2, "need multiple BINs for this test");

    // Remove keys 0..n-1, leaving only the highest key.
    for i in 0..n - 1 {
        tree.delete(&key_for(i));
    }

    tree.compress();

    let bins_after = tree.collect_stats().n_bins;
    assert!(
        bins_after < bins_before,
        "compress must reduce BIN count after mass deletion"
    );

    let survivor_found = tree
        .search(&key_for(n - 1))
        .is_some_and(|hit| hit.exact_parent_found);
    assert!(survivor_found, "last key must survive after compress");
}
9192
9193 // ========================================================================
9194 // IC-1: prune_empty_bin must NOT remove a live entry when the BIN was
9195 // repopulated between the compressor observing it empty and the prune.
9196 // (Tree corruption / lost-write regression test.)
9197 // ========================================================================
9198
9199 /// Find a BIN arc that is currently empty (0 entries) and is NOT the
9200 /// root, returning it together with the `id_key` the compressor would
9201 /// have captured (here we just use any key that routes to that BIN).
9202 fn first_empty_non_root_bin(tree: &Tree) -> Option<Arc<RwLock<TreeNode>>> {
9203 let root = tree.get_root()?;
9204 for node in tree.rebuild_in_list() {
9205 if Arc::ptr_eq(&node, &root) {
9206 continue; // skip root (single-BIN tree is never pruned)
9207 }
9208 let is_empty_bin = {
9209 let g = node.read();
9210 matches!(&*g, TreeNode::Bottom(b) if b.entries.is_empty())
9211 };
9212 if is_empty_bin {
9213 return Some(node);
9214 }
9215 }
9216 None
9217 }
9218
/// IC-1 (fail-pre / pass-post): the old `compress_bin` prune step called
/// `self.delete(&id_key)`, which re-descends by key. If a concurrent
/// insert repopulated the empty BIN with a LIVE entry under that same
/// `id_key`, `self.delete` would silently remove the live entry — a lost
/// write. `prune_empty_bin` re-validates `n_entries == 0` under the
/// parent latch and must REMOVE NOTHING when the BIN is non-empty.
///
/// JE `Tree.delete` / `searchDeletableSubTree` (Tree.java ~line 755-800):
/// `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort prune).
///
/// The test asserts two things: `prune_empty_bin` returns `false`, and the
/// repopulated key is still present in the BIN afterwards.
#[test]
fn test_ic1_prune_empty_bin_aborts_when_repopulated() {
    let tree = Tree::new(1, 4);
    let n = 16u32;
    for i in 0..n {
        let key = format!("ic{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    assert!(
        tree.collect_stats().n_bins >= 2,
        "need multiple BINs for this test"
    );

    // Delete the four lowest keys (ic0000..ic0003). The intent is that they
    // share one BIN, which is then left with zero entries; whether that
    // actually happened is checked by the lookup below.
    for i in 0..4 {
        let key = format!("ic{:04}", i).into_bytes();
        tree.delete(&key);
    }

    // Locate a now-empty, non-root BIN.
    let empty_bin = match first_empty_non_root_bin(&tree) {
        Some(b) => b,
        // If the layout didn't leave an isolated empty BIN, the scenario
        // isn't reproducible on this build; treat as vacuously passing.
        None => return,
    };

    // SIMULATE THE RACE: a concurrent insert repopulates the empty BIN
    // with a LIVE entry *before* the prune runs. We insert directly into
    // the BIN arc to model the insert that lands after the compressor
    // observed the BIN as empty.
    let live_key = format!("ic{:04}", 1).into_bytes(); // was deleted above
    {
        let mut g = empty_bin.write();
        if let TreeNode::Bottom(b) = &mut *g {
            // Go through the BIN's own insert helper instead of pushing
            // onto `entries`, so the per-slot structures stay in step.
            b.insert_with_prefix(
                live_key.clone(),
                Lsn::new(1, 1),
                Some(vec![0xAB]),
            );
        }
    }

    // The id_key the compressor would use: the full key of slot 0.
    let id_key = {
        let g = empty_bin.read();
        match &*g {
            TreeNode::Bottom(b) => b.get_full_key(0).unwrap(),
            _ => unreachable!(),
        }
    };

    // Prune must ABORT (return false) because the BIN is no longer empty,
    // and must NOT remove the live entry.
    let pruned = tree.prune_empty_bin(&id_key);
    assert!(!pruned, "IC-1: prune must abort when the BIN was repopulated");

    // The live entry must still be present in the BIN. Compare *full* keys
    // (the same accessor used for `id_key`) so the check does not depend on
    // whether the BIN currently carries a key prefix.
    let still_there = match &*empty_bin.read() {
        TreeNode::Bottom(b) => (0..b.entries.len())
            .any(|i| b.get_full_key(i).as_deref() == Some(live_key.as_slice())),
        _ => false,
    };
    assert!(
        still_there,
        "IC-1: prune must not remove the repopulated live entry"
    );
}
9304
/// IC-1 companion: `prune_empty_bin` must return `false` while the target
/// BIN is pinned via `Tree::pin_bin`, even though the BIN is still empty.
/// JE: `bin.nCursors() > 0` → CURSORS_EXIST.
#[test]
fn test_ic1_prune_empty_bin_aborts_with_cursor() {
    let tree = Tree::new(1, 4);
    (0..16u32).for_each(|i| {
        let key = format!("cu{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });
    // Empty out the lowest four keys.
    for i in 0..4 {
        let key = format!("cu{:04}", i).into_bytes();
        tree.delete(&key);
    }

    // Without an isolated empty BIN the scenario cannot be set up; bail out.
    let Some(empty_bin) = first_empty_non_root_bin(&tree) else {
        return;
    };

    // Simulate a cursor parked on the empty BIN.
    Tree::pin_bin(&empty_bin);

    // Route the prune with the first deleted key.
    let id_key = format!("cu{:04}", 0).into_bytes();
    let pruned = tree.prune_empty_bin(&id_key);
    assert!(
        !pruned,
        "IC-1: prune must abort when a cursor is parked on the BIN"
    );

    Tree::unpin_bin(&empty_bin);
}
9333
/// IC-1 happy path: when an empty, unpinned BIN exists, `prune_empty_bin`
/// must return `true`, the BIN count must drop, and every key that was not
/// deleted must still be found.
#[test]
fn test_ic1_prune_empty_bin_succeeds_when_truly_empty() {
    let tree = Tree::new(1, 4);
    (0..16u32).for_each(|i| {
        let key = format!("ok{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });
    for i in 0..4 {
        let key = format!("ok{:04}", i).into_bytes();
        tree.delete(&key);
    }
    let bins_before = tree.collect_stats().n_bins;

    // The handle is only needed to prove the scenario exists; it is kept
    // alive (but otherwise unused) for the rest of the test.
    let Some(_empty_bin) = first_empty_non_root_bin(&tree) else {
        return;
    };

    // Route the prune with the lowest deleted key.
    let id_key = format!("ok{:04}", 0).into_bytes();
    let pruned = tree.prune_empty_bin(&id_key);
    assert!(pruned, "IC-1: prune must succeed on a truly empty BIN");

    let bins_after = tree.collect_stats().n_bins;
    assert!(
        bins_after < bins_before,
        "IC-1: pruned BIN slot must be removed from the parent (was {}, now {})",
        bins_before,
        bins_after
    );

    // Keys 4..16 were never deleted and must be unaffected by the prune.
    for i in 4..16u32 {
        let key = format!("ok{:04}", i).into_bytes();
        let found = tree.search(&key).is_some_and(|s| s.exact_parent_found);
        assert!(found, "surviving key ok{:04} must remain after prune", i);
    }
}
9377
9378 // ========================================================================
9379 // Tests: latch-coupling validation (validate_parent_child /
9380 // search_with_coupling)
9381 // ========================================================================
9382
/// `validate_parent_child(parent, 0, child)` must return `true` when the
/// parent's slot 0 targets exactly that child `Arc`.
#[test]
fn test_validate_parent_child_correct_link() {
    let child = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Parent IN with a single slot whose target is `child`.
    let parent_node = TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&child))]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    });
    let parent = Arc::new(RwLock::new(parent_node));

    let link_ok = Tree::validate_parent_child(&parent, 0, &child);
    assert!(
        link_ok,
        "link must be valid when parent slot 0 points at bin_arc"
    );
}
9422
/// `validate_parent_child` must return `false` when the requested slot index
/// does not exist: here the parent IN has no entries at all and slot 0 is
/// queried.
#[test]
fn test_validate_parent_child_out_of_range() {
    // Parent with zero entries and no targets.
    let childless_parent = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![],
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));

    // Some unrelated BIN to ask about.
    let stray_bin = TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });
    let stray = Arc::new(RwLock::new(stray_bin));

    let link_ok = Tree::validate_parent_child(&childless_parent, 0, &stray);
    assert!(!link_ok, "link must be invalid when parent has no entries");
}
9460
/// `validate_parent_child` must return `false` when the parent's slot holds
/// a different `Arc` than the one being validated, even if both BINs are
/// otherwise identical in content.
#[test]
fn test_validate_parent_child_wrong_child() {
    // Each call produces a distinct empty BIN with its own generated id.
    let fresh_bin = || {
        Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
            node_id: generate_node_id(),
            level: BIN_LEVEL,
            entries: vec![],
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: NULL_LSN,
            last_delta_lsn: NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
            lsn_rep: LsnRep::Empty,
            keys: KeyRep::new(),
            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        })))
    };
    let linked = fresh_bin();
    let impostor = fresh_bin();

    // Slot 0 of the parent targets `linked`, not `impostor`.
    let parent = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, linked)]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));

    let link_ok = Tree::validate_parent_child(&parent, 0, &impostor);
    assert!(
        !link_ok,
        "link must be invalid when parent slot points at a different Arc"
    );
}
9517
/// Every key that was inserted must be reported as an exact match by
/// `search_with_coupling`.
#[test]
fn test_search_with_coupling_finds_existing_key() {
    let tree = Tree::new(1, 8);
    let key_for = |i: u32| format!("c{:04}", i).into_bytes();

    for i in 0u32..20 {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    for i in 0u32..20 {
        let found = tree
            .search_with_coupling(&key_for(i))
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "search_with_coupling must find c{:04}", i);
    }
}
9537
/// A key that was never inserted must not be reported as an exact match:
/// `search_with_coupling` may return `None`, or `Some` with
/// `exact_parent_found == false`.
#[test]
fn test_search_with_coupling_missing_key() {
    let tree = Tree::new(1, 8);
    tree.insert(b"hello".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();

    let exact_hit = tree
        .search_with_coupling(b"zzz")
        .is_some_and(|r| r.exact_parent_found);
    assert!(
        !exact_hit,
        "search_with_coupling must not find a key that was never inserted"
    );
}
9551
/// On a tree with no inserts, `search_with_coupling` must return `None`.
#[test]
fn test_search_with_coupling_empty_tree() {
    let tree = Tree::new(1, 8);
    let outcome = tree.search_with_coupling(b"k");
    assert!(outcome.is_none());
}
9558
9559 // ========================================================================
9560 // Tests: BIN-delta reconstitution (apply_delta_to_bin / mutate_to_full_bin)
9561 // ========================================================================
9562
/// `apply_delta_to_bin` must overwrite the slot of a key already present in
/// the base BIN, add a slot for a key that is new, leave untouched keys in
/// place, keep the keys sorted, and mark the base dirty.
///
/// BIN.applyDelta(): delta entries are authoritative and supersede
/// full-BIN entries at the same key.
#[test]
fn test_apply_delta_to_bin_updates_and_inserts() {
    // Clean, live slot carrying the given payload.
    let slot = |payload: &[u8]| BinEntry {
        data: Some(payload.to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };

    // Base BIN holding keys "a" and "c".
    let mut base = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![slot(b"old_a"), slot(b"old_c")],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"a".to_vec(), b"c".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Delta: rewrite "a", introduce "b".
    let rewrite_a = (b"a".to_vec(), Lsn::new(1, 10), Some(b"new_a".to_vec()));
    let insert_b = (b"b".to_vec(), Lsn::new(1, 20), Some(b"new_b".to_vec()));
    Tree::apply_delta_to_bin(&mut base, vec![rewrite_a, insert_b]);

    assert!(base.dirty, "base must be dirty after applying delta");

    // Keys are read back through `get_full_key`, slot by slot.
    let full_keys: Vec<Vec<u8>> = (0..base.entries.len())
        .map(|i| base.get_full_key(i).unwrap_or_default())
        .collect();

    // "a" carries the delta's payload.
    let a_idx = full_keys.iter().position(|k| k == b"a").unwrap();
    assert_eq!(base.entries[a_idx].data.as_deref(), Some(&b"new_a"[..]));

    // "b" was added; "c" was left alone.
    assert!(full_keys.iter().any(|k| k == b"b"));
    assert!(full_keys.iter().any(|k| k == b"c"));

    // Slot order must still be ascending by key.
    let sorted = {
        let mut copy = full_keys.clone();
        copy.sort();
        copy
    };
    assert_eq!(
        full_keys, sorted,
        "entries must remain sorted after delta apply"
    );
}
9638
/// Applying an empty delta must leave the number of slots unchanged while
/// still setting the BIN's dirty flag.
#[test]
fn test_apply_delta_to_bin_empty_delta() {
    let only_slot = BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let mut base = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![only_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"x".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let slots_before = base.entries.len();
    Tree::apply_delta_to_bin(&mut base, Vec::new());
    let slots_after = base.entries.len();

    assert_eq!(
        slots_after, slots_before,
        "empty delta must not change entry count"
    );
    assert!(base.dirty, "dirty must be set even for empty delta apply");
}
9674
/// `mutate_to_full_bin` turns a delta BIN into a full BIN by merging in a
/// base BIN.
///
/// BIN.mutateToFullBIN(BIN fullBIN): afterwards `is_delta` must be cleared,
/// the node must be dirty, the delta's payload must win for a key present in
/// both, and keys unique to either side must all be present in sorted order.
#[test]
fn test_mutate_to_full_bin_merges_delta_and_base() {
    // Clean, live slot carrying the given payload.
    let slot = |payload: &[u8]| BinEntry {
        data: Some(payload.to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    // Base and delta share the node id and differ only in the four inputs.
    let make_bin = |entries, keys, dirty, is_delta| BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty,
        is_delta,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Base: keys "aa" and "cc".
    let base = make_bin(
        vec![slot(b"base_aa"), slot(b"base_cc")],
        KeyRep::from_keys(vec![b"aa".to_vec(), b"cc".to_vec()]),
        false,
        false,
    );
    // Delta: overrides "aa" and introduces "bb".
    let mut delta = make_bin(
        vec![slot(b"delta_aa"), slot(b"delta_bb")],
        KeyRep::from_keys(vec![b"aa".to_vec(), b"bb".to_vec()]),
        true,
        true,
    );

    Tree::mutate_to_full_bin(&mut delta, base);

    assert!(
        !delta.is_delta,
        "is_delta must be false after mutate_to_full_bin"
    );
    assert!(delta.dirty, "must be dirty after mutation");

    // Keys are read back through `get_full_key`, slot by slot.
    let merged_keys: Vec<Vec<u8>> = (0..delta.entries.len())
        .map(|i| delta.get_full_key(i).unwrap_or_default())
        .collect();

    // Conflict on "aa": the delta's payload must be the one kept.
    let aa_idx = merged_keys.iter().position(|k| k == b"aa").unwrap();
    assert_eq!(delta.entries[aa_idx].data.as_deref(), Some(&b"delta_aa"[..]));

    // "bb" comes from the delta, "cc" from the base.
    assert!(merged_keys.iter().any(|k| k == b"bb"));
    assert!(merged_keys.iter().any(|k| k == b"cc"));

    // Exactly three slots, ascending by key.
    assert_eq!(delta.entries.len(), 3);
    let sorted = {
        let mut copy = merged_keys.clone();
        copy.sort();
        copy
    };
    assert_eq!(merged_keys, sorted, "entries must be sorted after mutation");
}
9780
/// `Tree::bin_is_delta` must mirror the BIN's `is_delta` field: `false` for
/// a BIN built with the flag cleared, `true` once the flag is set.
#[test]
fn test_bin_is_delta_flag() {
    let mut bin = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let reported_when_clear = Tree::bin_is_delta(&bin);
    assert!(!reported_when_clear);

    bin.is_delta = true;
    let reported_when_set = Tree::bin_is_delta(&bin);
    assert!(reported_when_set);
}
9806
9807 // ========================================================================
9808 // Tests: mutate_to_full_bin_from_log
9809 // ========================================================================
9810
/// Calling `mutate_to_full_bin_from_log` on a BIN whose `is_delta` is
/// already `false` must leave `is_delta` cleared and the slot count
/// unchanged.
#[test]
fn test_mutate_to_full_bin_from_log_already_full() {
    // Log manager over a throwaway directory; nothing is written to it here.
    let tmp = tempfile::tempdir().unwrap();
    let file_manager =
        noxu_log::FileManager::new(tmp.path(), false, 10_000_000, 100).unwrap();
    let log_manager = noxu_log::LogManager::new(
        std::sync::Arc::new(file_manager),
        3,
        1024 * 1024,
        4096,
    );

    let only_slot = BinEntry {
        data: Some(b"v1".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let mut full_bin = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![only_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false, // the BIN is not a delta to begin with
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"key1".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    Tree::mutate_to_full_bin_from_log(&mut full_bin, &log_manager);

    // Expected to be a no-op for the two observed properties.
    assert!(!full_bin.is_delta);
    assert_eq!(full_bin.entries.len(), 1);
}
9851
/// A delta BIN whose `last_full_lsn` is `NULL_LSN` has no full version to
/// merge with. `mutate_to_full_bin_from_log` must then clear `is_delta` and
/// keep the delta's single slot, payload included, exactly as it was.
#[test]
fn test_mutate_to_full_bin_from_log_null_lsn() {
    // Log manager over a throwaway directory; nothing is written to it here.
    let tmp = tempfile::tempdir().unwrap();
    let file_manager =
        noxu_log::FileManager::new(tmp.path(), false, 10_000_000, 100).unwrap();
    let log_manager = noxu_log::LogManager::new(
        std::sync::Arc::new(file_manager),
        3,
        1024 * 1024,
        4096,
    );

    let delta_slot = BinEntry {
        data: Some(b"delta_a".to_vec()),
        known_deleted: false,
        dirty: true,
        expiration_time: 0,
    };
    let mut delta = BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries: vec![delta_slot],
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: true,
        last_full_lsn: NULL_LSN, // no full BIN has been logged for this node
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"a".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    Tree::mutate_to_full_bin_from_log(&mut delta, &log_manager);

    // Promoted to a full BIN with its contents intact.
    assert!(
        !delta.is_delta,
        "is_delta must be false after null-lsn promotion"
    );
    assert_eq!(delta.entries.len(), 1);
    assert_eq!(delta.entries[0].data.as_deref(), Some(&b"delta_a"[..]));
}
9900
9901 /// mutate_to_full_bin_from_log reads full BIN from log and merges delta.
9902 ///
9903 /// Round-trip: serialize a full BIN, write it to a LogManager, record the
9904 /// LSN, then call mutate_to_full_bin_from_log on a delta referencing that
9905 /// LSN. The result must contain base-only and delta-only entries with the
9906 /// delta winning on conflicts.
9907 #[test]
9908 fn test_mutate_to_full_bin_from_log_reads_and_merges() {
9909 let dir = tempfile::tempdir().unwrap();
9910 let fm = std::sync::Arc::new(
9911 noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9912 .unwrap(),
9913 );
9914 let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9915
9916 // Build and serialize the full BIN that will be written to the log.
9917 let full_bin = BinStub {
9918 node_id: 42,
9919 level: BIN_LEVEL,
9920 entries: vec![
9921 BinEntry {
9922 data: Some(b"base_val".to_vec()),
9923 known_deleted: false,
9924 dirty: false,
9925 expiration_time: 0,
9926 },
9927 BinEntry {
9928 data: Some(b"base_shared".to_vec()),
9929 known_deleted: false,
9930 dirty: false,
9931 expiration_time: 0,
9932 },
9933 ],
9934 key_prefix: Vec::new(),
9935 dirty: false,
9936 is_delta: false,
9937 last_full_lsn: NULL_LSN,
9938 last_delta_lsn: NULL_LSN,
9939 generation: 0,
9940 parent: None,
9941 expiration_in_hours: true,
9942 cursor_count: 0,
9943 prohibit_next_delta: false,
9944 lsn_rep: LsnRep::Empty,
9945 keys: KeyRep::from_keys(vec![
9946 b"base_only".to_vec(),
9947 b"shared_key".to_vec(),
9948 ]),
9949 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9950 };
9951
9952 let payload = full_bin.serialize_full();
9953 let full_lsn = lm
9954 .log(
9955 noxu_log::LogEntryType::BIN,
9956 &payload,
9957 noxu_log::Provisional::No,
9958 true,
9959 false,
9960 )
9961 .expect("write full BIN to log");
9962 lm.flush_no_sync().expect("flush log");
9963
9964 // Build a delta BIN referencing the full BIN via last_full_lsn.
9965 let mut delta = BinStub {
9966 node_id: 42,
9967 level: BIN_LEVEL,
9968 entries: vec![
9969 // Overwrites "shared_key" from the base.
9970 BinEntry {
9971 data: Some(b"delta_shared".to_vec()),
9972 known_deleted: false,
9973 dirty: true,
9974 expiration_time: 0,
9975 },
9976 // New key only in the delta.
9977 BinEntry {
9978 data: Some(b"delta_val".to_vec()),
9979 known_deleted: false,
9980 dirty: true,
9981 expiration_time: 0,
9982 },
9983 ],
9984 key_prefix: Vec::new(),
9985 dirty: true,
9986 is_delta: true,
9987 last_full_lsn: full_lsn,
9988 last_delta_lsn: NULL_LSN,
9989 generation: 0,
9990 parent: None,
9991 expiration_in_hours: true,
9992 cursor_count: 0,
9993 prohibit_next_delta: false,
9994 lsn_rep: LsnRep::Empty,
9995 keys: KeyRep::from_keys(vec![
9996 b"shared_key".to_vec(),
9997 b"delta_only".to_vec(),
9998 ]),
9999 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10000 };
10001
10002 Tree::mutate_to_full_bin_from_log(&mut delta, &lm);
10003
10004 assert!(
10005 !delta.is_delta,
10006 "is_delta must be false after log-based mutation"
10007 );
10008 assert!(delta.dirty, "must be dirty after mutation");
10009
10010 // All three distinct keys must be present.
10011 let find = |k: &[u8]| -> Option<Vec<u8>> {
10012 (0..delta.entries.len())
10013 .find(|&i| delta.get_full_key(i).as_deref() == Some(k))
10014 .and_then(|i| delta.entries[i].data.clone())
10015 };
10016
10017 assert_eq!(
10018 find(b"base_only"),
10019 Some(b"base_val".to_vec()),
10020 "base-only key must be present"
10021 );
10022 assert_eq!(
10023 find(b"shared_key"),
10024 Some(b"delta_shared".to_vec()),
10025 "delta must win on shared_key"
10026 );
10027 assert_eq!(
10028 find(b"delta_only"),
10029 Some(b"delta_val".to_vec()),
10030 "delta-only key must be present"
10031 );
10032 assert_eq!(delta.entries.len(), 3, "must have exactly 3 entries");
10033
10034 // Entries must be in sorted order (by full key).
10035 let full_keys: Vec<Vec<u8>> = (0..delta.entries.len())
10036 .map(|i| delta.get_full_key(i).unwrap())
10037 .collect();
10038 let mut sorted_keys = full_keys.clone();
10039 sorted_keys.sort();
10040 assert_eq!(full_keys, sorted_keys, "entries must be in sorted order");
10041 }
10042
10043 // ========================================================================
10044 // Tests: deserialize_full key prefix recomputation
10045 // ========================================================================
10046
10047 /// deserialize_full recomputes key prefix from loaded full keys.
10048 ///
10049 /// IN.recalcKeyPrefix() called after materializing from log:
10050 /// a BIN loaded from the log should have prefix compression applied so
10051 /// that search performance matches an in-memory BIN.
10052 #[test]
10053 fn test_deserialize_full_recomputes_key_prefix() {
10054 // Build a BIN with a known common prefix and serialize it.
10055 let mut source = BinStub {
10056 node_id: 99,
10057 level: BIN_LEVEL,
10058 entries: vec![
10059 BinEntry {
10060 data: None,
10061 known_deleted: false,
10062 dirty: false,
10063 expiration_time: 0,
10064 },
10065 BinEntry {
10066 data: None,
10067 known_deleted: false,
10068 dirty: false,
10069 expiration_time: 0,
10070 },
10071 BinEntry {
10072 data: None,
10073 known_deleted: false,
10074 dirty: false,
10075 expiration_time: 0,
10076 },
10077 ],
10078 key_prefix: Vec::new(),
10079 dirty: false,
10080 is_delta: false,
10081 last_full_lsn: NULL_LSN,
10082 last_delta_lsn: NULL_LSN,
10083 generation: 0,
10084 parent: None,
10085 expiration_in_hours: true,
10086 cursor_count: 0,
10087 prohibit_next_delta: false,
10088 lsn_rep: LsnRep::Empty,
10089 keys: KeyRep::from_keys(vec![
10090 b"pfx:alpha".to_vec(),
10091 b"pfx:beta".to_vec(),
10092 b"pfx:gamma".to_vec(),
10093 ]),
10094 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10095 };
10096 source.recompute_key_prefix();
10097 // Verify the source has the expected prefix before serializing.
10098 assert_eq!(source.key_prefix, b"pfx:");
10099
10100 let payload = source.serialize_full();
10101
10102 // Deserialize and verify prefix is re-established.
10103 let loaded = BinStub::deserialize_full(&payload)
10104 .expect("deserialization must succeed");
10105
10106 assert_eq!(
10107 loaded.key_prefix, b"pfx:",
10108 "key prefix must be recomputed after deserialize_full"
10109 );
10110
10111 // All full keys must be reconstructable.
10112 for i in 0..loaded.entries.len() {
10113 let fk = loaded.get_full_key(i).unwrap();
10114 assert!(
10115 fk.starts_with(b"pfx:"),
10116 "full key {i} must start with prefix"
10117 );
10118 }
10119 }
10120
10121 /// deserialize_full with a single entry leaves key_prefix empty.
10122 ///
10123 /// A BIN with fewer than 2 entries cannot have a meaningful common prefix.
10124 #[test]
10125 fn test_deserialize_full_single_entry_no_prefix() {
10126 let source = BinStub {
10127 node_id: 7,
10128 level: BIN_LEVEL,
10129 entries: vec![BinEntry {
10130 data: None,
10131 known_deleted: false,
10132 dirty: false,
10133 expiration_time: 0,
10134 }],
10135 key_prefix: Vec::new(),
10136 dirty: false,
10137 is_delta: false,
10138 last_full_lsn: NULL_LSN,
10139 last_delta_lsn: NULL_LSN,
10140 generation: 0,
10141 parent: None,
10142 expiration_in_hours: true,
10143 cursor_count: 0,
10144 prohibit_next_delta: false,
10145 lsn_rep: LsnRep::Empty,
10146 keys: KeyRep::from_keys(vec![b"solo".to_vec()]),
10147 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10148 };
10149
10150 let payload = source.serialize_full();
10151 let loaded = BinStub::deserialize_full(&payload)
10152 .expect("deserialization must succeed");
10153
10154 assert!(
10155 loaded.key_prefix.is_empty(),
10156 "single-entry BIN must have empty prefix"
10157 );
10158 assert_eq!(loaded.get_full_key(0).unwrap(), b"solo");
10159 }
10160
10161 // ========================================================================
10162 // Tests: get_next_bin / get_prev_bin
10163 // ========================================================================
10164
10165 /// get_next_bin returns the entries of the next BIN to the right.
10166 ///
10167 /// Tree.getNextBin() / getNextIN(forward=true).
10168 #[test]
10169 fn test_get_next_bin_basic() {
10170 let tree = Tree::new(1, 4);
10171
10172 // Insert 8 sorted keys — creates multiple BINs.
10173 for i in 0u32..8 {
10174 let key = format!("n{:04}", i).into_bytes();
10175 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10176 }
10177
10178 let stats = tree.collect_stats();
10179 if stats.n_bins < 2 {
10180 // If the tree only has one BIN, skip the sibling test.
10181 return;
10182 }
10183
10184 // A key from the first BIN (e.g. "n0000") should have a next BIN.
10185 let next = tree.get_next_bin(b"n0000");
10186 assert!(
10187 next.is_some(),
10188 "must return a next BIN for a key in the leftmost BIN"
10189 );
10190
10191 let entries = next.unwrap();
10192 assert!(!entries.is_empty(), "next BIN must not be empty");
10193 // All returned keys must be strictly greater than "n0000" because they
10194 // are in a different (rightward) BIN.
10195 for (_, _, k) in &entries {
10196 assert!(
10197 k.as_slice() > b"n0000" as &[u8],
10198 "next BIN entries must all be > the search key"
10199 );
10200 }
10201 }
10202
10203 /// get_next_bin returns None for a key in the rightmost BIN.
10204 #[test]
10205 fn test_get_next_bin_at_rightmost_returns_none() {
10206 let tree = Tree::new(1, 4);
10207 for i in 0u32..8 {
10208 let key = format!("r{:04}", i).into_bytes();
10209 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10210 }
10211 // A key from the rightmost BIN (e.g. "r0007") has no next BIN.
10212 let next = tree.get_next_bin(b"r0007");
10213 assert!(
10214 next.is_none(),
10215 "must return None for a key in the rightmost BIN"
10216 );
10217 }
10218
10219 /// get_prev_bin returns the entries of the next BIN to the left.
10220 ///
10221 /// Tree.getPrevBin() / getNextIN(forward=false).
10222 #[test]
10223 fn test_get_prev_bin_basic() {
10224 let tree = Tree::new(1, 4);
10225 for i in 0u32..8 {
10226 let key = format!("p{:04}", i).into_bytes();
10227 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10228 }
10229
10230 // A key from the second BIN ("p0004") should have a previous BIN.
10231 let prev = tree.get_prev_bin(b"p0004");
10232 assert!(
10233 prev.is_some(),
10234 "must return a prev BIN for a key in the second BIN"
10235 );
10236
10237 let entries = prev.unwrap();
10238 assert!(!entries.is_empty(), "prev BIN must not be empty");
10239 // All returned keys must be < b"p0004".
10240 for (_, _, k) in &entries {
10241 assert!(
10242 k.as_slice() < b"p0004" as &[u8],
10243 "prev BIN entries must all be < the current BIN"
10244 );
10245 }
10246 }
10247
10248 /// get_prev_bin returns None for a key in the leftmost BIN.
10249 #[test]
10250 fn test_get_prev_bin_at_leftmost_returns_none() {
10251 let tree = Tree::new(1, 4);
10252 for i in 0u32..8 {
10253 let key = format!("q{:04}", i).into_bytes();
10254 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10255 }
10256 // A key from the leftmost BIN ("q0000") has no prev BIN.
10257 let prev = tree.get_prev_bin(b"q0000");
10258 assert!(
10259 prev.is_none(),
10260 "must return None for a key in the leftmost BIN"
10261 );
10262 }
10263
10264 /// get_next_bin and get_prev_bin are inverse operations across the
10265 /// BIN boundary.
10266 #[test]
10267 fn test_next_prev_bin_are_symmetric() {
10268 let tree = Tree::new(1, 4);
10269 for i in 0u32..8 {
10270 let key = format!("s{:04}", i).into_bytes();
10271 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10272 }
10273
10274 // From first BIN (s0000): next → second BIN entries.
10275 let next_from_first = tree.get_next_bin(b"s0000").unwrap();
10276 // The smallest key of the next BIN.
10277 let next_first_key =
10278 next_from_first.iter().map(|(_, _, k)| k.clone()).min().unwrap();
10279
10280 // From that key in the second BIN: prev → should overlap with first BIN.
10281 let prev_from_second = tree.get_prev_bin(&next_first_key).unwrap();
10282 let prev_first_key =
10283 prev_from_second.iter().map(|(_, _, k)| k.clone()).max().unwrap();
10284
10285 // The max key of the "prev" result must be in the first BIN (< next boundary).
10286 assert!(
10287 prev_first_key < next_first_key,
10288 "prev BIN entries must be smaller than the boundary key"
10289 );
10290 }
10291
10292 /// get_next_bin on an empty tree returns None.
10293 #[test]
10294 fn test_get_next_bin_empty_tree() {
10295 let tree = Tree::new(1, 8);
10296 assert!(tree.get_next_bin(b"any").is_none());
10297 }
10298
10299 /// get_prev_bin on an empty tree returns None.
10300 #[test]
10301 fn test_get_prev_bin_empty_tree() {
10302 let tree = Tree::new(1, 8);
10303 assert!(tree.get_prev_bin(b"any").is_none());
10304 }
10305
10306 // =========================================================================
10307 // R3 fix: get_next_bin / get_prev_bin honour the custom comparator
10308 // =========================================================================
10309
10310 /// R3 regression test: with a custom comparator that reverses byte order
10311 /// (descending), `get_next_bin` and `get_prev_bin` must use comparator
10312 /// order when routing through internal nodes.
10313 ///
10314 /// Pre-fix: the static `get_adjacent_bin_attempt` used raw `<=` byte order
10315 /// for IN routing, causing it to descend to the wrong child when comparator
10316 /// order ≠ byte order.
10317 ///
10318 /// The tree is forced to split (max_entries = 4) so there IS an internal
10319 /// node (IN) to route through. Under a reverse comparator the insertion
10320 /// order and stored key order are reversed relative to byte order, so any
10321 /// descent that uses raw byte comparison will pick the wrong slot.
10322 ///
10323 /// Pass-post invariant: iterating forward via repeated `get_next_bin` from
10324 /// the leftmost BIN yields keys in COMPARATOR order (descending byte order
10325 /// here), not in raw ascending byte order.
10326 #[test]
10327 fn test_get_next_prev_bin_custom_comparator_order() {
10328 // Reverse-order comparator: larger bytes sort first.
10329 let reverse_cmp: KeyComparatorFn =
10330 Arc::new(|a: &[u8], b: &[u8]| b.cmp(a));
10331 // Small max_entries so the tree splits and has internal nodes.
10332 let mut tree = Tree::new(1, 4);
10333 tree.set_comparator(reverse_cmp);
10334
10335 // Insert keys that are ascending in byte order ("a" < "b" < … < "i")
10336 // but descending in comparator order (i > h > … > a).
10337 let keys: &[&[u8]] =
10338 &[b"a", b"b", b"c", b"d", b"e", b"f", b"g", b"h", b"i"];
10339 for (i, k) in keys.iter().enumerate() {
10340 tree.insert(
10341 k.to_vec(),
10342 vec![i as u8],
10343 Lsn::from_u64((i + 1) as u64),
10344 )
10345 .unwrap();
10346 }
10347
10348 // Collect all BINs by walking from the comparator-smallest key ("i"
10349 // in reverse order) using get_next_bin. The anchor must be a key that
10350 // is smaller than everything in comparator order, i.e. the largest
10351 // byte-value key. We use the tree's search to find the actual leftmost
10352 // key under the comparator by starting from "i" (comparator-min).
10353 //
10354 // Strategy: start at byte key b"\xff" (larger than any inserted key in
10355 // byte order, so it lands in the last BIN in byte order, which under
10356 // a reverse comparator is the leftmost BIN in comparator order). Then
10357 // walk via get_next_bin.
10358 let start_anchor = b"\xff".as_ref();
10359 let mut bin_first_keys: Vec<Vec<u8>> = Vec::new();
10360
10361 // The first BIN in comparator order contains "i" (largest byte key).
10362 // get_next_bin from a virtual start in that BIN gives the next one.
10363 // Collect by walking from the comparator-last key leftward instead:
10364 // use get_next_bin with anchor = b"\xff" to hop to the next BIN
10365 // (comparator order: next = smaller byte value).
10366 let mut anchor = start_anchor.to_vec();
10367 loop {
10368 match tree.get_next_bin(&anchor) {
10369 None => break,
10370 Some(entries) => {
10371 if let Some((_, _, fk0)) = entries.first() {
10372 let fk = fk0.clone();
10373 bin_first_keys.push(fk.clone());
10374 anchor = fk;
10375 } else {
10376 break;
10377 }
10378 }
10379 }
10380 }
10381
10382 // We must have visited at least 2 BINs (tree was forced to split).
10383 assert!(
10384 bin_first_keys.len() >= 2,
10385 "R3: expected multiple BINs after split, got {}",
10386 bin_first_keys.len()
10387 );
10388
10389 // With a reverse comparator, bin_first_keys must be in descending byte
10390 // order (each successive BIN starts at a smaller byte key).
10391 for window in bin_first_keys.windows(2) {
10392 assert!(
10393 window[0] > window[1],
10394 "R3: BIN boundary keys must be descending (comparator order); \
10395 got {:?} then {:?}",
10396 window[0],
10397 window[1]
10398 );
10399 }
10400 }
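    // ========================================================================
    // Tests: BIN key prefix compression
    // ========================================================================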
10402
10403 /// Inserting keys with a common prefix causes the BIN to establish that
10404 /// prefix. Stored suffixes are shorter than the full keys.
10405 #[test]
10406 fn test_binstub_prefix_established_on_insert() {
10407 let mut bin = BinStub {
10408 node_id: 1,
10409 level: BIN_LEVEL,
10410 entries: Vec::new(),
10411 key_prefix: Vec::new(),
10412 dirty: false,
10413 is_delta: false,
10414 last_full_lsn: NULL_LSN,
10415 last_delta_lsn: NULL_LSN,
10416 generation: 0,
10417 parent: None,
10418 expiration_in_hours: true,
10419 cursor_count: 0,
10420 prohibit_next_delta: false,
10421 lsn_rep: LsnRep::Empty,
10422 keys: KeyRep::new(),
10423 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10424 };
10425
10426 bin.insert_with_prefix(b"record:aaa".to_vec(), Lsn::new(1, 1), None);
10427 assert!(bin.key_prefix.is_empty(), "single entry: no prefix yet");
10428
10429 bin.insert_with_prefix(b"record:bbb".to_vec(), Lsn::new(1, 2), None);
10430 assert_eq!(
10431 &bin.key_prefix, b"record:",
10432 "common prefix 'record:' must be extracted"
10433 );
10434 }
10435
10436 /// `get_full_key` on a BinStub returns the full key regardless of whether
10437 /// the stored key is a raw full key or a suffix.
10438 #[test]
10439 fn test_binstub_get_full_key_roundtrip() {
10440 let mut bin = BinStub {
10441 node_id: 1,
10442 level: BIN_LEVEL,
10443 entries: Vec::new(),
10444 key_prefix: Vec::new(),
10445 dirty: false,
10446 is_delta: false,
10447 last_full_lsn: NULL_LSN,
10448 last_delta_lsn: NULL_LSN,
10449 generation: 0,
10450 parent: None,
10451 expiration_in_hours: true,
10452 cursor_count: 0,
10453 prohibit_next_delta: false,
10454 lsn_rep: LsnRep::Empty,
10455 keys: KeyRep::new(),
10456 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10457 };
10458
10459 let keys = [
10460 b"pfx:first".as_ref(),
10461 b"pfx:second".as_ref(),
10462 b"pfx:third".as_ref(),
10463 ];
10464 for k in keys {
10465 bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10466 }
10467
10468 assert!(!bin.key_prefix.is_empty(), "prefix must be set");
10469
10470 for (i, expected) in keys.iter().enumerate() {
10471 let full = bin.get_full_key(i).expect("must return full key");
10472 assert_eq!(
10473 full.as_slice(),
10474 *expected,
10475 "get_full_key({}) must return full key",
10476 i
10477 );
10478 }
10479 }
10480
10481 /// `find_entry_compressed` on a BinStub with active prefix returns the
10482 /// correct slot index.
10483 #[test]
10484 fn test_binstub_find_entry_compressed() {
10485 let mut bin = BinStub {
10486 node_id: 1,
10487 level: BIN_LEVEL,
10488 entries: Vec::new(),
10489 key_prefix: Vec::new(),
10490 dirty: false,
10491 is_delta: false,
10492 last_full_lsn: NULL_LSN,
10493 last_delta_lsn: NULL_LSN,
10494 generation: 0,
10495 parent: None,
10496 expiration_in_hours: true,
10497 cursor_count: 0,
10498 prohibit_next_delta: false,
10499 lsn_rep: LsnRep::Empty,
10500 keys: KeyRep::new(),
10501 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10502 };
10503
10504 for k in
10505 [b"db:alpha".as_ref(), b"db:beta".as_ref(), b"db:gamma".as_ref()]
10506 {
10507 bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10508 }
10509
10510 let (idx, found) = bin.find_entry_compressed(b"db:beta");
10511 assert!(found, "db:beta must be found");
10512 assert_eq!(idx, 1, "db:beta must be at index 1");
10513
10514 let (_, not_found) = bin.find_entry_compressed(b"db:zzz");
10515 assert!(!not_found, "db:zzz must not be found");
10516 }
10517
10518 /// Tree insert/search works correctly when BINs accumulate a key prefix.
10519 #[test]
10520 fn test_tree_insert_search_with_prefix_compression() {
10521 let tree = Tree::new(1, 8);
10522 let n = 200u32;
10523
10524 // All keys share a long common prefix — good for prefix compression.
10525 for i in 0..n {
10526 let key = format!("namespace:entity:{:06}", i).into_bytes();
10527 let data = vec![i as u8];
10528 tree.insert(key, data, Lsn::new(1, i)).unwrap();
10529 }
10530
10531 // All keys must be findable.
10532 for i in 0..n {
10533 let key = format!("namespace:entity:{:06}", i).into_bytes();
10534 let sr = tree.search(&key);
10535 assert!(
10536 sr.is_some() && sr.unwrap().exact_parent_found,
10537 "key namespace:entity:{:06} must be found",
10538 i
10539 );
10540 }
10541 }
10542
10543 /// Prefix survives a BIN split: keys in both halves must still be findable.
10544 #[test]
10545 fn test_prefix_preserved_across_bin_split() {
10546 // Small fanout to force splits quickly.
10547 let tree = Tree::new(1, 4);
10548
10549 for i in 0u32..20 {
10550 let key = format!("pfx:key:{:04}", i).into_bytes();
10551 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10552 }
10553
10554 // All keys must be findable after splits.
10555 for i in 0u32..20 {
10556 let key = format!("pfx:key:{:04}", i).into_bytes();
10557 let sr = tree.search(&key);
10558 assert!(
10559 sr.is_some() && sr.unwrap().exact_parent_found,
10560 "pfx:key:{:04} must be found after splits",
10561 i
10562 );
10563 }
10564 }
10565
10566 /// `decompress_key` round-trips: compress then decompress gives the original.
10567 #[test]
10568 fn test_binstub_compress_decompress_roundtrip() {
10569 let mut bin = BinStub {
10570 node_id: 1,
10571 level: BIN_LEVEL,
10572 entries: Vec::new(),
10573 key_prefix: Vec::new(),
10574 dirty: false,
10575 is_delta: false,
10576 last_full_lsn: NULL_LSN,
10577 last_delta_lsn: NULL_LSN,
10578 generation: 0,
10579 parent: None,
10580 expiration_in_hours: true,
10581 cursor_count: 0,
10582 prohibit_next_delta: false,
10583 lsn_rep: LsnRep::Empty,
10584 keys: KeyRep::new(),
10585 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10586 };
10587
10588 for k in [b"myapp:user:1".as_ref(), b"myapp:user:2".as_ref()] {
10589 bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10590 }
10591
10592 assert!(!bin.key_prefix.is_empty());
10593
10594 // Manually compress a full key and then decompress it.
10595 let full_key = b"myapp:user:3";
10596 let suffix = bin.compress_key(full_key);
10597 let recovered = bin.decompress_key(&suffix);
10598 assert_eq!(
10599 recovered.as_slice(),
10600 full_key,
10601 "compress→decompress must be identity"
10602 );
10603 }
10604
10605 /// get_next_bin correctly navigates a 3-level tree.
10606 #[test]
10607 fn test_get_next_bin_three_level_tree() {
10608 // With fanout 4, inserting 20 keys forces a root split → 3 levels.
10609 let tree = Tree::new(1, 4);
10610 for i in 0u32..20 {
10611 let key = format!("t{:04}", i).into_bytes();
10612 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10613 }
10614 assert!(tree.get_root_splits() > 0, "tree must have grown to 3 levels");
10615
10616 // Starting from t0000, iterating via get_next_bin must visit every BIN.
10617 let mut visited: Vec<Vec<u8>> = Vec::new();
10618 // Collect the first BIN's keys by searching for t0000.
10619 if let Some(first_entries) = {
10620 // Get the leftmost BIN by using get_first_node result.
10621 // get_first_node returns SearchResult at index 0 in the leftmost BIN.
10622 // We approximate by reading the root's leftmost BIN directly.
10623 tree.get_next_bin(b"t0000")
10624 } {
10625 for (_, _, k) in first_entries {
10626 visited.push(k);
10627 }
10628 }
10629
10630 // visited should contain at least one key from the second BIN.
10631 assert!(
10632 !visited.is_empty(),
10633 "should have visited at least one key via get_next_bin in 3-level tree"
10634 );
10635 }
10636
    // ========================================================================
    // Tests: tree invariants ported from JE (insert, delete, scan, split)
    // ========================================================================
10639
10640 /// insert a small set of keys
10641 /// with varying lengths and verify each is findable immediately after insert.
10642 #[test]
10643 fn test_je_simple_tree_creation() {
10644 let tree = Tree::new(1, 128);
10645
10646 let keys: &[&[u8]] = &[b"aaaaa", b"aaaab", b"aaaa", b"aaa"];
10647 for (i, &k) in keys.iter().enumerate() {
10648 tree.insert(k.to_vec(), vec![i as u8], Lsn::new(1, i as u32))
10649 .unwrap();
10650
10651 // Every key inserted so far must be findable.
10652 for &prev in &keys[..=i] {
10653 let sr = tree.search(prev);
10654 assert!(
10655 sr.is_some() && sr.unwrap().exact_parent_found,
10656 "key {:?} must be findable after {} inserts",
10657 std::str::from_utf8(prev).unwrap_or("?"),
10658 i + 1
10659 );
10660 }
10661 }
10662 }
10663
10664 /// insert N keys, verify
10665 /// all are found; delete the even-indexed keys, verify even are gone and
10666 /// odd remain.
10667 #[test]
10668 fn test_je_insert_then_delete_then_search() {
10669 let tree = Tree::new(1, 8);
10670 let n = 20usize;
10671
10672 let keys: Vec<Vec<u8>> =
10673 (0..n).map(|i| format!("key{:04}", i).into_bytes()).collect();
10674
10675 // Insert all.
10676 for (i, k) in keys.iter().enumerate() {
10677 tree.insert(k.clone(), vec![i as u8], Lsn::new(1, i as u32))
10678 .unwrap();
10679 }
10680
10681 // All must be findable.
10682 for k in &keys {
10683 let sr = tree.search(k);
10684 assert!(
10685 sr.is_some() && sr.unwrap().exact_parent_found,
10686 "key {:?} must be found after insert",
10687 std::str::from_utf8(k).unwrap_or("?")
10688 );
10689 }
10690
10691 // Delete even-indexed keys.
10692 for i in (0..n).step_by(2) {
10693 tree.delete(&keys[i]);
10694 }
10695
10696 // Even keys must no longer be found; odd keys must still be found.
10697 for (i, key) in keys.iter().enumerate() {
10698 let sr = tree.search(key);
10699 let found = sr.is_some() && sr.unwrap().exact_parent_found;
10700 if i % 2 == 0 {
10701 assert!(!found, "deleted key {:?} must not be found", i);
10702 } else {
10703 assert!(found, "kept key {:?} must still be found", i);
10704 }
10705 }
10706 }
10707
10708 /// insert N keys in reverse
10709 /// order, then verify every key is directly findable and the keys are in
10710 /// sorted ascending order (B-tree ordering invariant).
10711 #[test]
10712 fn test_je_range_scan_sorted_ascending() {
10713 let n = 40usize;
10714 let tree = Tree::new(1, 4);
10715
10716 // Insert in reverse order to stress the B-tree.
10717 for i in (0..n).rev() {
10718 let key = format!("scan{:04}", i).into_bytes();
10719 tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10720 }
10721
10722 // Collect all expected keys in sorted order.
10723 let mut expected: Vec<Vec<u8>> =
10724 (0..n).map(|i| format!("scan{:04}", i).into_bytes()).collect();
10725 expected.sort();
10726
10727 // Every key must be individually findable.
10728 for key in &expected {
10729 let sr = tree.search(key);
10730 assert!(
10731 sr.is_some() && sr.unwrap().exact_parent_found,
10732 "key {:?} must be findable",
10733 std::str::from_utf8(key).unwrap_or("?")
10734 );
10735 }
10736
10737 // Verify sorted ordering invariant: expected keys are already sorted
10738 // (lexicographic order = insertion order for "scan{:04}" keys).
10739 for w in expected.windows(2) {
10740 assert!(
10741 w[0] < w[1],
10742 "keys must be in strict ascending order: {:?} < {:?}",
10743 std::str::from_utf8(&w[0]).unwrap_or("?"),
10744 std::str::from_utf8(&w[1]).unwrap_or("?")
10745 );
10746 }
10747
10748 // Use get_next_bin to scan at least a portion of the tree and verify
10749 // ordering of returned BIN entries.
10750 let first_key = format!("scan{:04}", 0).into_bytes();
10751 if let Some(entries) = tree.get_next_bin(&first_key) {
10752 let entry_keys: Vec<&[u8]> =
10753 entries.iter().map(|(_, _, k)| k.as_slice()).collect();
10754 for w in entry_keys.windows(2) {
10755 assert!(
10756 w[0] <= w[1],
10757 "BIN entries from get_next_bin must be in ascending order"
10758 );
10759 }
10760 }
10761 }
10762
10763 /// insert N keys in
10764 /// ascending order and verify the tree height stays bounded (≤ 10 levels)
10765 /// and all keys are findable.
10766 #[test]
10767 fn test_je_ascending_insert_balance() {
10768 let n = 128usize;
10769 let tree = Tree::new(1, 8);
10770
10771 for i in 0..n {
10772 let key = format!("asc{:06}", i).into_bytes();
10773 tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10774 .unwrap();
10775 }
10776
10777 let stats = tree.collect_stats();
10778 assert!(
10779 stats.height <= 10,
10780 "tree height after {} ascending inserts with fanout 8 must be <= 10, got {}",
10781 n,
10782 stats.height
10783 );
10784
10785 for i in 0..n {
10786 let key = format!("asc{:06}", i).into_bytes();
10787 let sr = tree.search(&key);
10788 assert!(
10789 sr.is_some() && sr.unwrap().exact_parent_found,
10790 "key asc{:06} must be findable after ascending inserts",
10791 i
10792 );
10793 }
10794 }
10795
10796 /// insert N keys in
10797 /// descending order and verify the tree height stays bounded (≤ 10 levels)
10798 /// and all keys are findable.
10799 #[test]
10800 fn test_je_descending_insert_balance() {
10801 let n = 128usize;
10802 let tree = Tree::new(1, 8);
10803
10804 for i in (0..n).rev() {
10805 let key = format!("dsc{:06}", i).into_bytes();
10806 tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10807 .unwrap();
10808 }
10809
10810 let stats = tree.collect_stats();
10811 assert!(
10812 stats.height <= 10,
10813 "tree height after {} descending inserts with fanout 8 must be <= 10, got {}",
10814 n,
10815 stats.height
10816 );
10817
10818 for i in 0..n {
10819 let key = format!("dsc{:06}", i).into_bytes();
10820 let sr = tree.search(&key);
10821 assert!(
10822 sr.is_some() && sr.unwrap().exact_parent_found,
10823 "key dsc{:06} must be findable after descending inserts",
10824 i
10825 );
10826 }
10827 }
10828
10829 /// SplitTest invariant: after many splits induced by a small
10830 /// fanout no key is lost.
10831 #[test]
10832 fn test_je_split_no_key_lost() {
10833 let tree = Tree::new(1, 4);
10834 let n = 20usize;
10835
10836 for i in 0..n {
10837 let key = format!("sp{:04}", i).into_bytes();
10838 tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10839 }
10840
10841 for i in 0..n {
10842 let key = format!("sp{:04}", i).into_bytes();
10843 let sr = tree.search(&key);
10844 assert!(
10845 sr.is_some() && sr.unwrap().exact_parent_found,
10846 "key sp{:04} must survive all splits",
10847 i
10848 );
10849 }
10850 }
10851
10852 /// SplitTest invariant: after a BIN split both halves exist and
10853 /// all original keys are findable.
10854 #[test]
10855 fn test_je_split_produces_two_halves() {
10856 // fanout=4: fill one BIN then overflow it to force a split.
10857 let tree = Tree::new(1, 4);
10858 let n = 5usize; // one more than fanout → forces at least one split
10859
10860 for i in 0..n {
10861 let key = format!("half{:04}", i).into_bytes();
10862 tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10863 }
10864
10865 let stats = tree.collect_stats();
10866 assert!(
10867 stats.n_bins >= 2,
10868 "after splitting a full BIN there must be >= 2 BINs, got {}",
10869 stats.n_bins
10870 );
10871
10872 for i in 0..n {
10873 let key = format!("half{:04}", i).into_bytes();
10874 let sr = tree.search(&key);
10875 assert!(
10876 sr.is_some() && sr.unwrap().exact_parent_found,
10877 "key half{:04} must be findable in one of the two halves",
10878 i
10879 );
10880 }
10881 }
10882
10883 /// SplitTest invariant: root splits are tracked and the tree
10884 /// grows in height as keys accumulate.
10885 #[test]
10886 fn test_je_root_split_creates_new_root() {
10887 // fanout=4, 20 keys: forces multiple root splits.
10888 let tree = Tree::new(1, 4);
10889
10890 for i in 0u32..20 {
10891 let key = format!("rs{:04}", i).into_bytes();
10892 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10893 }
10894
10895 assert!(
10896 tree.get_root_splits() > 0,
10897 "expected at least one root split after 20 inserts with fanout 4"
10898 );
10899
10900 let stats = tree.collect_stats();
10901 assert!(
10902 stats.height >= 3,
10903 "tree must be at least 3 levels tall after root splits, got {}",
10904 stats.height
10905 );
10906
10907 // Every inserted key must still be findable.
10908 for i in 0u32..20 {
10909 let key = format!("rs{:04}", i).into_bytes();
10910 let sr = tree.search(&key);
10911 assert!(
10912 sr.is_some() && sr.unwrap().exact_parent_found,
10913 "key rs{:04} must be findable after root splits",
10914 i
10915 );
10916 }
10917 }
10918
10919 // ========================================================================
10920 // Tests: compress_bin / maybe_compress_bin_and_parent
10921 // INCompressor.compressBin / lazyCompress tests
10922 // ========================================================================
10923
/// `compress_bin` removes known-deleted slots from a BIN.
///
/// INCompressor.compressBin(): after compression, slots with
/// `known_deleted = true` must be gone and the BIN must be dirty.
///
/// The fixture is a hand-built BIN with two live slots and two tombstones,
/// attached to a single-entry parent IN that is installed as the tree root.
#[test]
fn test_compress_bin_removes_deleted_slots() {
    /// Clean, live slot carrying `payload`.
    fn live_slot(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    /// Clean tombstone slot: no data, `known_deleted` set.
    fn tombstone() -> BinEntry {
        BinEntry {
            data: None,
            known_deleted: true,
            dirty: false,
            expiration_time: 0,
        }
    }

    // Keys "a" and "c" are live; keys "b" and "d" are tombstones.
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![
            live_slot(b"live"),
            tombstone(),
            live_slot(b"live2"),
            tombstone(),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![
            b"a".to_vec(),
            b"b".to_vec(),
            b"c".to_vec(),
            b"d".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Wire a minimal parent IN so compress_bin can prune if needed.
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let compressed = tree.compress_bin(&bin_arc);
    assert!(
        compressed,
        "compress_bin must return true when slots were removed"
    );

    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(
        bin.entries.len(),
        2,
        "2 live entries must remain after compress"
    );
    assert!(
        bin.entries.iter().all(|slot| !slot.known_deleted),
        "no deleted slots must remain"
    );
    assert!(bin.dirty, "BIN must be dirty after compression");
}
11022
/// IC-3 HEADLINE (fail-pre / pass-post): a `known_deleted` slot that an
/// in-flight txn still holds a write lock on must be SKIPPED by the
/// compressor, while committed/unlocked `known_deleted` slots in the SAME BIN
/// are removed. Mirrors JE `BIN.compress` (BIN.java:1141-1172), which calls
/// `lockManager.isLockUncontended(lsn)` and does `continue` on a contended
/// slot.
///
/// Pre-fix: `compress_bin` had no lock check, so a write-locked tombstone
/// would have been physically removed (the slot a live txn references is
/// gone -> corruption). Post-fix: the `is_locked` predicate keeps it.
#[test]
fn test_ic3_compress_skips_write_locked_slot() {
    /// Clean, live slot carrying `payload`.
    fn live_slot(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    /// Clean tombstone slot: no data, `known_deleted` set.
    fn tombstone() -> BinEntry {
        BinEntry {
            data: None,
            known_deleted: true,
            dirty: false,
            expiration_time: 0,
        }
    }

    // Slot layout (key, lsn):
    //   0: "a", 1:100  live
    //   1: "b", 1:200  tombstone, write-locked -> KEEP
    //   2: "c", 1:300  live
    //   3: "d", 1:400  tombstone, committed    -> REMOVE
    let locked_lsn = Lsn::new(1, 200);
    let unlocked_lsn = Lsn::new(1, 400);
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![
            live_slot(b"live"),
            tombstone(),
            live_slot(b"live2"),
            tombstone(),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::from_lsns(&[
            Lsn::new(1, 100),
            locked_lsn,
            Lsn::new(1, 300),
            unlocked_lsn,
        ]),
        keys: KeyRep::from_keys(vec![
            b"a".to_vec(),
            b"b".to_vec(),
            b"c".to_vec(),
            b"d".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    // Stand-in for the LockManager: only `locked_lsn` counts as write-locked.
    let locked_u64 = locked_lsn.as_u64();
    let is_locked = move |lsn: u64| lsn == locked_u64;

    let compressed = tree.compress_bin_with_lock_check(&bin_arc, Some(&is_locked));
    assert!(compressed, "compress removed the unlocked tombstone -> true");

    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };

    // Expected survivors: both live slots plus the write-locked tombstone.
    // The committed tombstone (lsn 1:400) is the only slot removed.
    assert_eq!(
        bin.entries.len(),
        3,
        "write-locked tombstone must be KEPT; only the unlocked one removed"
    );

    let kept_locked = bin
        .entries
        .iter()
        .enumerate()
        .any(|(idx, slot)| slot.known_deleted && bin.get_lsn(idx) == locked_lsn);
    assert!(kept_locked, "the write-locked tombstone must remain");

    let unlocked_gone = (0..bin.entries.len()).all(|idx| bin.get_lsn(idx) != unlocked_lsn);
    assert!(unlocked_gone, "the unlocked tombstone must be removed");
}
11142
/// IC-3 (no predicate): when `compress_bin` is used (the `is_locked = None`
/// path), behavior is unchanged — ALL `known_deleted` slots are removed (the
/// historical safe path).
///
/// The BIN starts with one live slot and two tombstones; only the live slot
/// may be left afterwards.
#[test]
fn test_ic3_compress_no_predicate_removes_all_tombstones() {
    /// Clean tombstone slot: no data, `known_deleted` set.
    fn tombstone() -> BinEntry {
        BinEntry {
            data: None,
            known_deleted: true,
            dirty: false,
            expiration_time: 0,
        }
    }

    let only_live = BinEntry {
        data: Some(b"live".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![only_live, tombstone(), tombstone()],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::from_lsns(&[
            Lsn::new(1, 100),
            Lsn::new(1, 200),
            Lsn::new(1, 300),
        ]),
        keys: KeyRep::from_keys(vec![
            b"a".to_vec(),
            b"b".to_vec(),
            b"c".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    // Plain compress_bin: the path without a lock predicate.
    let compressed = tree.compress_bin(&bin_arc);
    assert!(compressed, "all tombstones removed -> true");

    let guard = bin_arc.read();
    let TreeNode::Bottom(b) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(b.entries.len(), 1, "only the live slot remains");
    assert!(b.entries.iter().all(|e| !e.known_deleted));
}
11220
/// `compress_bin` on a BIN with no deleted slots returns false.
///
/// INCompressor: if no slots were removed, compression made no
/// progress and returns false.
///
/// The BIN here holds a single live slot and is not attached to any parent;
/// the tree it is passed to is freshly created.
#[test]
fn test_compress_bin_no_deleted_slots_returns_false() {
    let sole_live_slot = BinEntry {
        data: Some(b"d".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![sole_live_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"x".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    let compressed = tree.compress_bin(&bin_arc);
    assert!(
        !compressed,
        "compress_bin must return false when no slots were removed"
    );
}
11259
/// `compress_bin` on a BIN-delta is a no-op.
///
/// INCompressor.compressBin(): "if (bin.isBINDelta()) return".
///
/// The BIN carries one tombstone and has `is_delta` set, so the call must
/// report no progress and leave the slot in place.
#[test]
fn test_compress_bin_skips_delta() {
    let tombstone = BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![tombstone],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: true, // delta BIN — must be skipped
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"k".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    let compressed = tree.compress_bin(&bin_arc);
    assert!(!compressed, "compress_bin must not compress a BIN-delta");

    // The tombstone has to survive untouched.
    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(
        bin.entries.len(),
        1,
        "slot must not be removed from delta"
    );
}
11305
/// `compress_bin` empties a BIN whose only slot is a tombstone.
///
/// INCompressor.pruneBIN(): when all slots are deleted and
/// compression empties the BIN, it must be removed from the parent IN.
///
/// NOTE(review): this test only asserts that `compress_bin` returns true and
/// that the BIN ends up with zero entries. It does not inspect the parent IN,
/// so removal from the parent is not verified here.
#[test]
fn test_compress_bin_prunes_empty_bin() {
    // The BIN's single slot is a tombstone, so compression leaves it empty.
    let tombstone = BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![tombstone],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"only".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Single-entry parent IN, installed below as the tree root.
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let compressed = tree.compress_bin(&bin_arc);
    assert!(compressed, "compress_bin must return true when pruning");

    // BIN must be empty after compression.
    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 0, "all slots must be removed");
}
11368
/// `maybe_compress_bin_and_parent` returns false when no deleted slots exist.
///
/// INCompressor.lazyCompress(): skip BINs with no defunct slots.
///
/// The BIN holds one live slot and has no parent wired up.
#[test]
fn test_maybe_compress_skips_clean_bin() {
    let sole_live_slot = BinEntry {
        data: Some(b"v".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![sole_live_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"live".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    let compressed = tree.maybe_compress_bin_and_parent(&bin_arc);
    assert!(
        !compressed,
        "maybe_compress must return false when no deleted slots exist"
    );
}
11406
/// `maybe_compress_bin_and_parent` triggers compression when deleted slots
/// exist.
///
/// INCompressor.lazyCompress(): when defunct slots are found,
/// call bin.compress() to remove them.
///
/// The BIN starts with key "live" (live slot) and key "dead" (tombstone);
/// afterwards only "live" may remain.
#[test]
fn test_maybe_compress_triggers_when_deleted_slots_exist() {
    let live_slot = BinEntry {
        data: Some(b"v".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let tombstone = BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![live_slot, tombstone],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"live".to_vec(), b"dead".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    let compressed = tree.maybe_compress_bin_and_parent(&bin_arc);
    assert!(
        compressed,
        "maybe_compress must return true when deleted slots were removed"
    );

    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 1, "only live entry must remain");
    assert_eq!(bin.get_full_key(0).unwrap(), b"live");
}
11462
11463 // ========================================================================
11464 // Tests: INCompressorTest / EmptyBINTest ports
11465 // INCompressorTest (compress_bin semantics, prefix recompute, live-slot preservation)
11466 // EmptyBINTest (empty-BIN scan, all-deleted compress, search returns NotFound)
11467 // ========================================================================
11468
/// Live slots survive `compress_bin`; only the tombstone is dropped.
///
/// A BIN with two live slots and one known-deleted slot is wired into a tree
/// next to a sibling BIN. After `compress_bin` the deleted slot must be gone,
/// the live slots must remain, the BIN must be dirty, and the parent IN must
/// still have both of its entries.
#[test]
fn test_incompressor_live_slots_preserved_after_compress() {
    /// Clean, live slot carrying `payload`.
    fn live_slot(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    // BIN under test: keys 0x00 and 0x01 are live, key 0x02 is a tombstone.
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![
            live_slot(b"d0"),
            live_slot(b"d1"),
            BinEntry {
                data: None,
                known_deleted: true,
                dirty: false,
                expiration_time: 0,
            },
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![
            b"\x00".to_vec(),
            b"\x01".to_vec(),
            b"\x02".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Placeholder sibling BIN holding the single key 0x40.
    let sibling_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![live_slot(b"s")],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"\x40".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Parent IN with two children: slot 0 -> BIN under test, slot 1 -> sibling.
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![
            InEntry { key: vec![] },
            InEntry { key: b"\x40".to_vec() },
        ],
        targets: TargetRep::Sparse(vec![
            (0, bin_arc.clone()),
            (1, sibling_arc.clone()),
        ]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
    sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc.clone());

    let compressed = tree.compress_bin(&bin_arc);
    assert!(
        compressed,
        "compress_bin must return true when a deleted slot was removed"
    );

    // Exactly 2 live entries must remain. The read guard is scoped so it is
    // released before the parent is inspected.
    {
        let bin_guard = bin_arc.read();
        let TreeNode::Bottom(bin) = &*bin_guard else {
            panic!("expected BIN")
        };
        assert_eq!(bin.entries.len(), 2, "2 live slots must remain");
        assert!(
            bin.entries.iter().all(|slot| !slot.known_deleted),
            "no deleted slots may remain"
        );
        assert!(bin.dirty, "BIN must be dirty after compression");
    }

    // Parent IN must still have 2 entries (BIN was not emptied).
    let root_guard = root_arc.read();
    let TreeNode::Internal(parent) = &*root_guard else {
        panic!("expected IN")
    };
    assert_eq!(
        parent.entries.len(),
        2,
        "parent IN must still have 2 entries"
    );
}
11601
/// An emptied BIN is pruned from its parent IN (pruneBIN path).
///
/// After all slots in a BIN are deleted and `compress()` is called, the
/// empty BIN must be removed from its parent IN. This goes through
/// `tree.compress()`, which invokes the pruneBIN / merge logic, and is
/// observed here as a drop in the reported BIN count.
#[test]
fn test_incompressor_empty_bin_pruned_from_parent() {
    let key_for = |n: u32| format!("prune{:04}", n).into_bytes();

    // A small node size means a modest number of inserts already yields
    // several BINs that can be pruned once emptied.
    let tree = Tree::new(1, 4);

    // Twelve keys: enough to end up with at least 2 BINs (asserted below).
    for n in 0u32..12 {
        tree.insert(key_for(n), vec![n as u8], Lsn::new(1, n)).unwrap();
    }

    let bins_before = tree.collect_stats().n_bins;
    assert!(bins_before >= 2, "need multiple BINs to test pruning");

    // Delete the four lexicographically smallest keys. The intent is to empty
    // the first BIN so compress() has something to prune from the parent.
    for n in 0u32..4 {
        tree.delete(&key_for(n));
    }

    // compress() triggers pruneBIN for the now-empty BIN.
    tree.compress();

    let bins_after = tree.collect_stats().n_bins;
    assert!(
        bins_after < bins_before,
        "compress must reduce BIN count after emptying a BIN (pruneBIN path)"
    );

    // The keys that were not deleted must still be exact matches.
    for n in 4u32..12 {
        let found = tree
            .search(&key_for(n))
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "key prune{:04} must survive after compress", n);
    }
}
11650
/// BIN-delta is skipped by `maybe_compress_bin_and_parent`.
///
/// INCompressor.lazyCompress() short-circuits for BIN-deltas:
/// "if (in.isBINDelta()) return false".
///
/// The BIN has a single tombstone and `is_delta` set; the call must return
/// false and the slot must still be present and still known-deleted.
#[test]
fn test_incompressor_maybe_compress_skips_bin_delta() {
    let tombstone = BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![tombstone],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: true, // BIN-delta — must be skipped
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"k".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    // maybe_compress must return false without touching the BIN.
    assert!(
        !tree.maybe_compress_bin_and_parent(&bin_arc),
        "maybe_compress must return false for BIN-deltas"
    );

    // Slot must still be present and still known-deleted.
    let guard = bin_arc.read();
    let TreeNode::Bottom(b) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(
        b.entries.len(),
        1,
        "slot must not be removed from delta BIN"
    );
    assert!(b.entries[0].known_deleted);
}
11703
/// Clean BIN (no deleted slots) is not compressed.
///
/// INCompressor.lazyCompress() skips BINs that have no defunct slots.
///
/// The BIN holds two live slots; `maybe_compress_bin_and_parent` must return
/// false and both entries must still be there afterwards.
#[test]
fn test_incompressor_clean_bin_not_compressed() {
    /// Clean, live slot carrying `payload`.
    fn live_slot(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![live_slot(b"a"), live_slot(b"b")],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"\x00".to_vec(), b"\x01".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    assert!(
        !tree.maybe_compress_bin_and_parent(&bin_arc),
        "maybe_compress must return false when no deleted slots exist"
    );

    // Both entries must remain untouched.
    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 2, "no entries should be removed");
}
11757
/// Keys stay intact when a BIN with a shared key prefix is compressed.
///
/// When keys share a common prefix (e.g. "pfx:a", "pfx:b", "pfx:c") and
/// one is deleted, after compress_bin the remaining keys must share the
/// correct (potentially longer) prefix.
///
/// After BIN.compress() the BIN calls recalcKeyPrefix() so the
/// shorter remaining key set may expose a longer common prefix.
///
/// NOTE(review): the assertions go through `get_full_key` only. They confirm
/// that the surviving full keys are "pfx:b" and "pfx:c"; they do not inspect
/// the stored `key_prefix` or the key suffixes.
#[test]
fn test_incompressor_prefix_recomputed_after_compress() {
    /// Clean, live slot carrying `payload`.
    fn live_slot(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    // Three keys that all start with "pfx:". The slot for "pfx:a" is a
    // tombstone; "pfx:b" and "pfx:c" are live. The keys are supplied as raw
    // full keys and `key_prefix` starts out empty.
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![
            BinEntry {
                data: None,
                known_deleted: true,
                dirty: false,
                expiration_time: 0,
            },
            live_slot(b"B"),
            live_slot(b"C"),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![
            b"pfx:a".to_vec(),
            b"pfx:b".to_vec(),
            b"pfx:c".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Wire up a parent so compress_bin can run normally.
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let compressed = tree.compress_bin(&bin_arc);
    assert!(
        compressed,
        "compress_bin must return true when one slot was removed"
    );

    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 2, "2 live slots must remain");

    // The survivors are "pfx:b" and "pfx:c". Check the reconstructed full
    // keys rather than the BIN's internal prefix/suffix storage.
    let k0 = bin.get_full_key(0).expect("slot 0 must exist");
    let k1 = bin.get_full_key(1).expect("slot 1 must exist");
    assert!(
        (k0 == b"pfx:b" && k1 == b"pfx:c")
            || (k0 == b"pfx:c" && k1 == b"pfx:b"),
        "remaining keys must be pfx:b and pfx:c, got {:?} {:?}",
        k0,
        k1
    );
}
11857
/// Search distinguishes inserted keys from keys that were never inserted.
///
/// Related to the EmptyBINTest invariant "Tree search for any deleted key
/// returns NotFound".
///
/// NOTE(review): despite the test name, nothing is deleted or compressed
/// here. The test inserts the one-byte keys 0..8 into a tree created with
/// `max_entries = 4`, then checks that three never-inserted keys are reported
/// as not found and that every inserted key is an exact match. Covering the
/// deleted-key case would need real deletions followed by compression.
#[test]
fn test_emptybin_search_after_all_deleted_returns_not_found() {
    let lsn = Lsn::new(1, 1);

    // max_entries=4 matches NODE_MAX=4 from EmptyBINTest; it is chosen so
    // that eight inserts overflow a single BIN.
    let tree = Tree::new(1, 4);

    // Insert keys 0..7 (byte values).
    for byte in 0u8..8 {
        tree.insert(vec![byte], vec![byte + 100], lsn)
            .expect("insert must succeed");
    }

    // These keys were never inserted, so they must always be absent.
    let absent = [b"\xF0".as_ref(), b"\xF1".as_ref(), b"\xF2".as_ref()];
    for key in absent {
        // Not found means either no result at all or a non-exact result.
        let not_found = tree.search(key).is_none_or(|r| !r.exact_parent_found);
        assert!(not_found, "absent key {:?} must not be found", key);
    }

    // Keys that were inserted must still be findable.
    for byte in 0u8..8 {
        let found = tree
            .search(&[byte])
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "inserted key {} must be found", byte);
    }
}
11902
/// Point lookups on a multi-BIN tree: inserted keys hit, others miss.
///
/// Named after the EmptyBINTest scenario of scanning a tree that has an
/// empty BIN in the middle.
///
/// NOTE(review): this test does not delete anything, does not call
/// `compress_bin`, and does not perform a scan. It inserts the one-byte keys
/// 0..12 into a tree created with `max_entries = 4`, then uses `Tree::search`
/// to check that every inserted key is an exact match and that keys 200..210,
/// which were never inserted, are not found.
#[test]
fn test_emptybin_forward_scan_skips_empty_bin() {
    let lsn = Lsn::new(1, 1);

    // A very small max_entries (4) forces splits quickly; twelve keys are
    // meant to spread the data across several BINs.
    let tree = Tree::new(1, 4);
    for byte in 0u8..12 {
        tree.insert(vec![byte], vec![byte + 10], lsn)
            .expect("insert must succeed");
    }

    // All keys 0..12 must be findable.
    for byte in 0u8..12 {
        let found = tree
            .search(&[byte])
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "key {} must be found before any deletions", byte);
    }

    // Keys that were never inserted must not be found.
    for byte in 200u8..210 {
        let not_found = tree
            .search(&[byte])
            .is_none_or(|r| !r.exact_parent_found);
        assert!(
            not_found,
            "key {} was never inserted and must not be found",
            byte
        );
    }
}
11943
11944 /// After a bin is emptied by
11945 /// compression and its queue entry is on the compressor queue, re-inserting
11946 /// a key into that BIN prevents the prune.
11947 ///
11948 /// We simulate the re-insert by checking that compress_bin on a BIN that
11949 /// still has a live entry after partial deletion does NOT remove the BIN
11950 /// from the parent.
11951 #[test]
11952 fn test_incompressor_node_not_empty_prevents_prune() {
11953 let _lsn = Lsn::new(1, 1);
11954
11955 // BIN with one deleted and one live entry.
11956 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11957 node_id: generate_node_id(),
11958 level: BIN_LEVEL,
11959 entries: vec![
11960 BinEntry {
11961 data: None,
11962 known_deleted: true,
11963 dirty: false,
11964 expiration_time: 0,
11965 },
11966 BinEntry {
11967 data: Some(b"v".to_vec()),
11968 known_deleted: false,
11969 dirty: false,
11970 expiration_time: 0,
11971 },
11972 ],
11973 key_prefix: Vec::new(),
11974 dirty: false,
11975 is_delta: false,
11976 last_full_lsn: NULL_LSN,
11977 last_delta_lsn: NULL_LSN,
11978 generation: 0,
11979 parent: None,
11980 expiration_in_hours: true,
11981 cursor_count: 0,
11982 prohibit_next_delta: false,
11983 lsn_rep: LsnRep::Empty,
11984 keys: KeyRep::from_keys(vec![b"\x00".to_vec(), b"\x01".to_vec()]),
11985 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11986 })));
11987
11988 let sibling_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11989 node_id: generate_node_id(),
11990 level: BIN_LEVEL,
11991 entries: vec![BinEntry {
11992 data: Some(b"s".to_vec()),
11993 known_deleted: false,
11994 dirty: false,
11995 expiration_time: 0,
11996 }],
11997 key_prefix: Vec::new(),
11998 dirty: false,
11999 is_delta: false,
12000 last_full_lsn: NULL_LSN,
12001 last_delta_lsn: NULL_LSN,
12002 generation: 0,
12003 parent: None,
12004 expiration_in_hours: true,
12005 cursor_count: 0,
12006 prohibit_next_delta: false,
12007 lsn_rep: LsnRep::Empty,
12008 keys: KeyRep::from_keys(vec![b"\x40".to_vec()]),
12009 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12010 })));
12011
12012 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
12013 node_id: generate_node_id(),
12014 level: MAIN_LEVEL | 2,
12015 entries: vec![
12016 InEntry { key: vec![] },
12017 InEntry { key: b"\x40".to_vec() },
12018 ],
12019 targets: TargetRep::Sparse(vec![
12020 (0, bin_arc.clone()),
12021 (1, sibling_arc.clone()),
12022 ]),
12023 dirty: false,
12024 generation: 0,
12025 parent: None,
12026 lsn_rep: LsnRep::Empty,
12027 })));
12028 bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
12029 sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
12030
12031 let tree = Tree::new(1, 128);
12032 *tree.root.write() = Some(root_arc.clone());
12033
12034 let result = tree.compress_bin(&bin_arc);
12035 assert!(
12036 result,
12037 "compress_bin must return true when one slot was removed"
12038 );
12039
12040 // The live entry must remain.
12041 let bg = bin_arc.read();
12042 match &*bg {
12043 TreeNode::Bottom(b) => {
12044 assert_eq!(b.entries.len(), 1, "one live slot must remain");
12045 assert_eq!(b.get_full_key(0).unwrap(), b"\x01");
12046 }
12047 _ => panic!("expected BIN"),
12048 }
12049 drop(bg);
12050
12051 // Parent IN must NOT have lost the BIN entry — the BIN is still non-empty.
12052 let rg = root_arc.read();
12053 match &*rg {
12054 TreeNode::Internal(n) => {
12055 assert_eq!(
12056 n.entries.len(),
12057 2,
12058 "parent IN must still have 2 entries (BIN was not emptied)"
12059 );
12060 }
12061 _ => panic!("expected IN"),
12062 }
12063 }
12064
12065 /// Compressing a BIN with a mix of known-deleted
12066 /// and pending-deleted slots removes both kinds.
12067 ///
12068 /// BIN.isDefunct(i) returns true for both KNOWN_DELETED and
12069 /// PENDING_DELETED. compress_bin must remove all defunct slots.
12070 #[test]
12071 fn test_incompressor_known_and_pending_deleted_removed() {
12072 let _lsn = Lsn::new(1, 1);
12073
12074 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
12075 node_id: generate_node_id(),
12076 level: BIN_LEVEL,
12077 entries: vec![
12078 // slot 0: live
12079 BinEntry {
12080 data: Some(b"live".to_vec()),
12081 known_deleted: false,
12082 dirty: false,
12083 expiration_time: 0,
12084 },
12085 // slot 1: known-deleted
12086 BinEntry {
12087 data: None,
12088 known_deleted: true,
12089 dirty: false,
12090 expiration_time: 0,
12091 },
12092 // slot 2: live
12093 BinEntry {
12094 data: Some(b"also-live".to_vec()),
12095 known_deleted: false,
12096 dirty: false,
12097 expiration_time: 0,
12098 },
12099 // slot 3: known-deleted
12100 BinEntry {
12101 data: None,
12102 known_deleted: true,
12103 dirty: false,
12104 expiration_time: 0,
12105 },
12106 ],
12107 key_prefix: Vec::new(),
12108 dirty: false,
12109 is_delta: false,
12110 last_full_lsn: NULL_LSN,
12111 last_delta_lsn: NULL_LSN,
12112 generation: 0,
12113 parent: None,
12114 expiration_in_hours: true,
12115 cursor_count: 0,
12116 prohibit_next_delta: false,
12117 lsn_rep: LsnRep::Empty,
12118 keys: KeyRep::from_keys(vec![
12119 b"\x00".to_vec(),
12120 b"\x01".to_vec(),
12121 b"\x02".to_vec(),
12122 b"\x03".to_vec(),
12123 ]),
12124 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12125 })));
12126
12127 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
12128 node_id: generate_node_id(),
12129 level: MAIN_LEVEL | 2,
12130 entries: vec![InEntry { key: vec![] }],
12131 targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
12132 dirty: false,
12133 generation: 0,
12134 parent: None,
12135 lsn_rep: LsnRep::Empty,
12136 })));
12137 bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
12138
12139 let tree = Tree::new(1, 128);
12140 *tree.root.write() = Some(root_arc);
12141
12142 let result = tree.compress_bin(&bin_arc);
12143 assert!(result, "compress_bin must return true");
12144
12145 let g = bin_arc.read();
12146 match &*g {
12147 TreeNode::Bottom(b) => {
12148 assert_eq!(
12149 b.entries.len(),
12150 2,
12151 "only the 2 live entries must remain"
12152 );
12153 assert!(
12154 b.entries.iter().all(|e| !e.known_deleted),
12155 "no deleted entries must remain after compression"
12156 );
12157 }
12158 _ => panic!("expected BIN"),
12159 }
12160 }
12161
12162 // =========================================================================
12163 // P1: Concurrent stress tests for single-pass latch-coupling in search()
12164 // =========================================================================
12165
12166 /// Verify that concurrent readers and a writer do not panic or deadlock.
12167 ///
12168 /// 4 reader threads search all pre-populated keys while 1 writer thread
12169 /// inserts additional keys. This exercises the single-pass latch-coupling
12170 /// path under genuine concurrent load.
12171 #[test]
12172 fn test_concurrent_search_while_inserting() {
12173 use std::sync::{Arc, Barrier};
12174 use std::thread;
12175
12176 // Tree is wrapped in std::sync::RwLock to match the DatabaseImpl
12177 // usage pattern (DatabaseImpl holds Tree behind an RwLock).
12178 let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));
12179
12180 // Pre-populate with 50 entries so the tree has multiple BINs.
12181 {
12182 let t = tree.write().unwrap();
12183 for i in 0u32..50 {
12184 let key = format!("{:08}", i).into_bytes();
12185 t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
12186 }
12187 }
12188
12189 // Barrier synchronises start: 4 readers + 1 writer.
12190 let barrier = Arc::new(Barrier::new(5));
12191
12192 let mut handles = vec![];
12193
12194 // 4 concurrent reader threads — each searches the 50 pre-populated keys.
12195 for _ in 0..4 {
12196 let tree_clone = Arc::clone(&tree);
12197 let barrier_clone = Arc::clone(&barrier);
12198 handles.push(thread::spawn(move || {
12199 barrier_clone.wait();
12200 for i in 0u32..50 {
12201 let key = format!("{:08}", i).into_bytes();
12202 let t = tree_clone.read().unwrap();
12203 // Must not panic. The key was pre-populated so search()
12204 // should always return Some(_); we assert on that below
12205 // (after joining) rather than inside the thread to keep
12206 // the panic message clean.
12207 let _ = t.search(&key);
12208 }
12209 }));
12210 }
12211
12212 // 1 concurrent writer thread — inserts keys 50–99.
12213 {
12214 let tree_clone = Arc::clone(&tree);
12215 let barrier_clone = Arc::clone(&barrier);
12216 handles.push(thread::spawn(move || {
12217 barrier_clone.wait();
12218 let t = tree_clone.write().unwrap();
12219 for i in 50u32..100 {
12220 let key = format!("{:08}", i).into_bytes();
12221 t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
12222 }
12223 }));
12224 }
12225
12226 for h in handles {
12227 h.join().expect("thread panicked");
12228 }
12229
12230 // After all threads finish, all 100 keys must be present.
12231 let t = tree.read().unwrap();
12232 for i in 0u32..100 {
12233 let key = format!("{:08}", i).into_bytes();
12234 let result = t.search(&key);
12235 assert!(
12236 result.is_some_and(|r| r.exact_parent_found),
12237 "key {:08} should be found after concurrent insert",
12238 i,
12239 );
12240 }
12241 }
12242
12243 /// Verify that 8 concurrent reader threads searching the same tree do not
12244 /// panic. Pure read concurrency should be safe with or without the
12245 /// single-pass fix; this test acts as a regression guard.
12246 #[test]
12247 fn test_concurrent_searches_no_panic() {
12248 use std::sync::Arc;
12249 use std::thread;
12250
12251 let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));
12252 {
12253 let t = tree.write().unwrap();
12254 for i in 0u32..100 {
12255 let key = format!("{:08}", i).into_bytes();
12256 t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
12257 }
12258 }
12259
12260 let handles: Vec<_> = (0..8)
12261 .map(|_| {
12262 let tree_clone = Arc::clone(&tree);
12263 thread::spawn(move || {
12264 for i in 0u32..100 {
12265 let key = format!("{:08}", i).into_bytes();
12266 let t = tree_clone.read().unwrap();
12267 let _ = t.search(&key);
12268 }
12269 })
12270 })
12271 .collect();
12272
12273 for h in handles {
12274 h.join().expect("thread panicked");
12275 }
12276 }
12277
12278 // ========================================================================
12279 // Tests: BIN-delta — dirty tracking, serialise, collect
12280 // ========================================================================
12281
12282 #[test]
12283 fn test_dirty_count_zero_on_fresh_bin() {
12284 let bin = make_bin_for_delta_tests(vec![
12285 (b"a".to_vec(), Lsn::new(1, 1), Some(b"v1".to_vec())),
12286 (b"b".to_vec(), Lsn::new(1, 2), Some(b"v2".to_vec())),
12287 ]);
12288 assert_eq!(bin.dirty_count(), 0);
12289 }
12290
12291 #[test]
12292 fn test_insert_marks_slot_dirty() {
12293 let lsn = Lsn::new(1, 10);
12294 let mut bin = BinStub {
12295 node_id: 1,
12296 level: BIN_LEVEL,
12297 entries: vec![],
12298 key_prefix: Vec::new(),
12299 dirty: false,
12300 is_delta: false,
12301 last_full_lsn: NULL_LSN,
12302 last_delta_lsn: NULL_LSN,
12303 generation: 0,
12304 parent: None,
12305 expiration_in_hours: true,
12306 cursor_count: 0,
12307 prohibit_next_delta: false,
12308 lsn_rep: LsnRep::Empty,
12309 keys: KeyRep::new(),
12310 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12311 };
12312 bin.insert_with_prefix(b"key".to_vec(), lsn, Some(b"val".to_vec()));
12313 assert_eq!(bin.dirty_count(), 1, "new slot should be dirty");
12314 assert!(bin.entries[0].dirty);
12315 }
12316
12317 #[test]
12318 fn test_update_marks_slot_dirty() {
12319 let _lsn = Lsn::new(1, 10);
12320 let mut bin = BinStub {
12321 node_id: 2,
12322 level: BIN_LEVEL,
12323 entries: vec![BinEntry {
12324 data: Some(b"old".to_vec()),
12325 known_deleted: false,
12326 dirty: false,
12327 expiration_time: 0,
12328 }],
12329 key_prefix: Vec::new(),
12330 dirty: false,
12331 is_delta: false,
12332 last_full_lsn: NULL_LSN,
12333 last_delta_lsn: NULL_LSN,
12334 generation: 0,
12335 parent: None,
12336 expiration_in_hours: true,
12337 cursor_count: 0,
12338 prohibit_next_delta: false,
12339 lsn_rep: LsnRep::Empty,
12340 keys: KeyRep::from_keys(vec![b"key".to_vec()]),
12341 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12342 };
12343 bin.insert_with_prefix(
12344 b"key".to_vec(),
12345 Lsn::new(1, 20),
12346 Some(b"new".to_vec()),
12347 );
12348 assert!(bin.entries[0].dirty, "updated slot should be dirty");
12349 assert_eq!(bin.dirty_count(), 1);
12350 }
12351
12352 #[test]
12353 fn test_serialize_full_roundtrip() {
12354 let mut bin = BinStub {
12355 node_id: 42,
12356 level: BIN_LEVEL,
12357 entries: vec![
12358 BinEntry {
12359 data: Some(b"d1".to_vec()),
12360 known_deleted: false,
12361 dirty: true,
12362 expiration_time: 0,
12363 },
12364 BinEntry {
12365 data: None,
12366 known_deleted: true,
12367 dirty: false,
12368 expiration_time: 0,
12369 },
12370 ],
12371 key_prefix: Vec::new(),
12372 dirty: true,
12373 is_delta: false,
12374 last_full_lsn: NULL_LSN,
12375 last_delta_lsn: NULL_LSN,
12376 generation: 0,
12377 parent: None,
12378 expiration_in_hours: true,
12379 cursor_count: 0,
12380 prohibit_next_delta: false,
12381 lsn_rep: LsnRep::Empty,
12382 keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
12383 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12384 };
12385 let bytes = bin.serialize_full();
12386 let node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
12387 let n_entries = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
12388 assert_eq!(node_id, 42);
12389 assert_eq!(n_entries, 2);
12390 bin.clear_dirty_after_full_log(Lsn::new(2, 1));
12391 assert_eq!(bin.dirty_count(), 0);
12392 assert_eq!(bin.last_full_lsn, Lsn::new(2, 1));
12393 assert!(!bin.dirty);
12394 }
12395
12396 #[test]
12397 fn test_serialize_delta_only_dirty_slots() {
12398 let mut bin = BinStub {
12399 node_id: 7,
12400 level: BIN_LEVEL,
12401 entries: vec![
12402 BinEntry {
12403 data: Some(b"v1".to_vec()),
12404 known_deleted: false,
12405 dirty: false,
12406 expiration_time: 0,
12407 },
12408 BinEntry {
12409 data: Some(b"v2".to_vec()),
12410 known_deleted: false,
12411 dirty: true,
12412 expiration_time: 0,
12413 },
12414 BinEntry {
12415 data: Some(b"v3".to_vec()),
12416 known_deleted: false,
12417 dirty: false,
12418 expiration_time: 0,
12419 },
12420 ],
12421 key_prefix: Vec::new(),
12422 dirty: true,
12423 is_delta: false,
12424 last_full_lsn: NULL_LSN,
12425 last_delta_lsn: NULL_LSN,
12426 generation: 0,
12427 parent: None,
12428 expiration_in_hours: true,
12429 cursor_count: 0,
12430 prohibit_next_delta: false,
12431 lsn_rep: LsnRep::Empty,
12432 keys: KeyRep::from_keys(vec![
12433 b"a".to_vec(),
12434 b"b".to_vec(),
12435 b"c".to_vec(),
12436 ]),
12437 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12438 };
12439 let bytes = bin.serialize_delta();
12440 let node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
12441 let n_dirty = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
12442 assert_eq!(node_id, 7);
12443 assert_eq!(n_dirty, 1);
12444 let slot_idx = u32::from_be_bytes(bytes[12..16].try_into().unwrap());
12445 assert_eq!(slot_idx, 1);
12446 bin.clear_dirty_after_delta_log();
12447 assert_eq!(bin.dirty_count(), 0);
12448 assert_eq!(
12449 bin.last_full_lsn, NULL_LSN,
12450 "last_full_lsn unchanged by delta"
12451 );
12452 }
12453
12454 #[test]
12455 fn test_collect_dirty_bins_returns_dirty_bins_only() {
12456 let tree = Tree::new(1, 256);
12457 tree.insert(b"k1".to_vec(), b"v1".to_vec(), Lsn::new(1, 1)).unwrap();
12458 tree.insert(b"k2".to_vec(), b"v2".to_vec(), Lsn::new(1, 2)).unwrap();
12459 let dirty = tree.collect_dirty_bins(1);
12460 assert!(!dirty.is_empty(), "should have dirty BINs after inserts");
12461
12462 for (_db_id, bin_arc) in &dirty {
12463 let mut g = bin_arc.write();
12464 if let TreeNode::Bottom(b) = &mut *g {
12465 b.clear_dirty_after_full_log(Lsn::new(1, 100));
12466 }
12467 }
12468 let dirty2 = tree.collect_dirty_bins(1);
12469 assert!(dirty2.is_empty(), "no dirty BINs after clearing");
12470 }
12471
12472 fn make_bin_for_delta_tests(
12473 entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
12474 ) -> BinStub {
12475 let lsns: Vec<Lsn> = entries.iter().map(|(_, l, _)| *l).collect();
12476 let keys: Vec<Vec<u8>> =
12477 entries.iter().map(|(k, _, _)| k.clone()).collect();
12478 BinStub {
12479 node_id: 1,
12480 level: BIN_LEVEL,
12481 entries: entries
12482 .into_iter()
12483 .map(|(_key, _lsn, data)| BinEntry {
12484 data,
12485 known_deleted: false,
12486 dirty: false,
12487 expiration_time: 0,
12488 })
12489 .collect(),
12490 key_prefix: Vec::new(),
12491 dirty: false,
12492 is_delta: false,
12493 last_full_lsn: NULL_LSN,
12494 last_delta_lsn: NULL_LSN,
12495 generation: 0,
12496 parent: None,
12497 expiration_in_hours: true,
12498 cursor_count: 0,
12499 prohibit_next_delta: false,
12500 lsn_rep: LsnRep::from_lsns(&lsns),
12501 keys: KeyRep::from_keys(keys),
12502 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12503 }
12504 }
12505
12506 // ========================================================================
12507 // T-17: BinStub::should_log_delta — faithful JE BIN.shouldLogDelta
12508 // (BIN.java:1892). These pin the COUNT-based decision against the
12509 // CONFIGURABLE percent (not a dirty-fraction-vs-hardcoded-0.25 heuristic),
12510 // plus the isBINDelta fast path, the numDeltas<=0 guard, and the
12511 // isDeltaProhibited / lastFullLsn==NULL bound.
12512 // ========================================================================
12513
12514 /// Build a full (non-delta) BIN with `n` slots, the first `dirty` of them
12515 /// marked dirty, and a non-NULL last_full_lsn (so a delta is permitted).
12516 fn bin_with_dirty(n: usize, dirty: usize) -> BinStub {
12517 let mut bin = make_bin_for_delta_tests(
12518 (0..n)
12519 .map(|i| {
12520 (
12521 format!("{:04}", i).into_bytes(),
12522 Lsn::new(1, i as u32 + 1),
12523 Some(vec![i as u8]),
12524 )
12525 })
12526 .collect(),
12527 );
12528 bin.last_full_lsn = Lsn::new(1, 1); // a prior full exists
12529 for e in bin.entries.iter_mut().take(dirty) {
12530 e.dirty = true;
12531 }
12532 bin
12533 }
12534
12535 /// COUNT-based + CONFIGURABLE percent: with percent=10 and 100 slots, the
12536 /// delta limit is 100*10/100 = 10. 10 dirty slots → delta; 11 dirty → full.
12537 ///
12538 /// This is the core T-17 reproduction: the OLD checkpointer decision used
12539 /// `dirty/total <= 0.25` (hardcoded), so 11/100 = 11% ≤ 25% → it would have
12540 /// (wrongly) logged a DELTA. The faithful count-based decision against the
12541 /// configurable percent=10 logs a FULL BIN.
12542 #[test]
12543 fn should_log_delta_is_count_based_and_configurable() {
12544 // Exactly at the limit → delta.
12545 assert!(
12546 bin_with_dirty(100, 10).should_log_delta(10),
12547 "numDeltas(10) <= limit(100*10/100=10) must be a delta"
12548 );
12549 // One over the limit → full BIN (FAILS on main: 11/100=11% <= 25%).
12550 assert!(
12551 !bin_with_dirty(100, 11).should_log_delta(10),
12552 "numDeltas(11) > limit(10) must be a FULL BIN under percent=10"
12553 );
12554 // The SAME BIN under the default percent=25 (limit 25) is a delta:
12555 // proves the percent is honoured, not hardcoded.
12556 assert!(
12557 bin_with_dirty(100, 11).should_log_delta(25),
12558 "numDeltas(11) <= limit(25) must be a delta under percent=25"
12559 );
12560 // Integer (truncating) math, exactly as JE: 7 slots, percent=25 →
12561 // limit = 7*25/100 = 1. 1 dirty → delta, 2 dirty → full.
12562 assert!(bin_with_dirty(7, 1).should_log_delta(25));
12563 assert!(!bin_with_dirty(7, 2).should_log_delta(25));
12564 }
12565
12566 /// isBINDelta fast path: a BIN already in delta form always re-logs as a
12567 /// delta (JE: `if (isBINDelta()) return true;`).
12568 #[test]
12569 fn should_log_delta_bin_delta_fast_path() {
12570 let mut bin = bin_with_dirty(100, 90); // 90% dirty: way over any limit
12571 bin.is_delta = true;
12572 // Even with a tiny percent that the dirty count blows past, an
12573 // already-delta BIN re-logs as a delta.
12574 assert!(
12575 bin.should_log_delta(1),
12576 "isBINDelta() must short-circuit to true regardless of percent"
12577 );
12578 }
12579
12580 /// numDeltas <= 0 guard: a BIN with no dirty slots logs a full BIN (an
12581 /// empty delta is invalid).
12582 #[test]
12583 fn should_log_delta_zero_dirty_is_full() {
12584 assert!(!bin_with_dirty(100, 0).should_log_delta(25));
12585 }
12586
12587 /// isDeltaProhibited bound: lastFullLsn == NULL (never logged full) and
12588 /// prohibit_next_delta both force a full BIN.
12589 #[test]
12590 fn should_log_delta_prohibited_forces_full() {
12591 // No prior full BIN.
12592 let mut bin = bin_with_dirty(100, 5); // would be a delta otherwise
12593 bin.last_full_lsn = NULL_LSN;
12594 assert!(
12595 !bin.should_log_delta(25),
12596 "lastFullLsn==NULL must force a full BIN"
12597 );
12598
12599 // prohibit_next_delta set (e.g. a dirty slot was removed by compress).
12600 let mut bin = bin_with_dirty(100, 5);
12601 bin.prohibit_next_delta = true;
12602 assert!(
12603 !bin.should_log_delta(25),
12604 "prohibit_next_delta must force a full BIN"
12605 );
12606 }
12607
12608 /// The prohibit flag is cleared after a full BIN is logged
12609 /// (JE IN.afterLog: setProhibitNextDelta(false)), so the NEXT log may once
12610 /// again be a delta — this is the periodic-full chain bound.
12611 #[test]
12612 fn full_log_clears_prohibit_next_delta() {
12613 let mut bin = bin_with_dirty(100, 5);
12614 bin.prohibit_next_delta = true;
12615 assert!(!bin.should_log_delta(25), "prohibited → full");
12616 bin.clear_dirty_after_full_log(Lsn::new(2, 5));
12617 assert!(
12618 !bin.prohibit_next_delta,
12619 "full log must clear prohibit_next_delta"
12620 );
12621 // Re-dirty a few slots; now a delta is allowed again.
12622 for e in bin.entries.iter_mut().take(5) {
12623 e.dirty = true;
12624 }
12625 assert!(
12626 bin.should_log_delta(25),
12627 "after a full log, a small delta is allowed again"
12628 );
12629 }
12630
12631 // ========================================================================
12632 // Tests: Task #82 — 8 new Tree methods
12633 // ========================================================================
12634
12635 // --- is_root_resident ---
12636
12637 #[test]
12638 fn test_is_root_resident_empty_tree() {
12639 let tree = Tree::new(1, 128);
12640 assert!(!tree.is_root_resident(), "empty tree has no resident root");
12641 }
12642
12643 #[test]
12644 fn test_is_root_resident_after_insert() {
12645 let tree = Tree::new(1, 128);
12646 tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12647 assert!(tree.is_root_resident(), "root must be resident after insert");
12648 }
12649
12650 // --- get_resident_root_in ---
12651
12652 #[test]
12653 fn test_get_resident_root_in_empty() {
12654 let tree = Tree::new(1, 128);
12655 assert!(tree.get_resident_root_in().is_none());
12656 }
12657
12658 #[test]
12659 fn test_get_resident_root_in_single_entry() {
12660 let tree = Tree::new(1, 128);
12661 tree.insert(b"hello".to_vec(), b"world".to_vec(), Lsn::new(1, 1))
12662 .unwrap();
12663 let root = tree.get_resident_root_in();
12664 assert!(root.is_some(), "root must be Some after insert");
12665 let root_arc = tree.get_root().unwrap();
12666 assert!(
12667 Arc::ptr_eq(&root_arc, &root.unwrap()),
12668 "get_resident_root_in must return the same Arc as get_root"
12669 );
12670 }
12671
12672 #[test]
12673 fn test_get_resident_root_in_multi_entry() {
12674 let tree = Tree::new(1, 4);
12675 for i in 0u32..20 {
12676 let k = format!("rr{:04}", i).into_bytes();
12677 tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12678 }
12679 assert!(tree.get_resident_root_in().is_some());
12680 }
12681
12682 // --- get_parent_bin_for_child_ln ---
12683
12684 #[test]
12685 fn test_get_parent_bin_for_child_ln_empty_tree() {
12686 let tree = Tree::new(1, 128);
12687 assert!(tree.get_parent_bin_for_child_ln(b"key").is_none());
12688 }
12689
12690 #[test]
12691 fn test_get_parent_bin_for_child_ln_single_entry() {
12692 let tree = Tree::new(1, 128);
12693 tree.insert(b"alpha".to_vec(), b"val".to_vec(), Lsn::new(1, 1))
12694 .unwrap();
12695 let bin = tree.get_parent_bin_for_child_ln(b"alpha");
12696 assert!(bin.is_some(), "must return Some for a present key");
12697 assert!(bin.unwrap().read().is_bin(), "returned node must be a BIN");
12698 }
12699
12700 #[test]
12701 fn test_get_parent_bin_for_child_ln_multi_key() {
12702 let tree = Tree::new(1, 8);
12703 let keys: &[&[u8]] = &[b"aa", b"bb", b"cc", b"dd", b"ee"];
12704 for &k in keys {
12705 tree.insert(k.to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12706 }
12707 for &k in keys {
12708 let bin = tree.get_parent_bin_for_child_ln(k);
12709 assert!(bin.is_some(), "must return Some for {:?}", k);
12710 assert!(bin.unwrap().read().is_bin());
12711 }
12712 }
12713
12714 // --- find_bin_for_insert ---
12715
12716 #[test]
12717 fn test_find_bin_for_insert_empty_tree() {
12718 let tree = Tree::new(1, 128);
12719 assert!(tree.find_bin_for_insert(b"newkey").is_none());
12720 }
12721
12722 #[test]
12723 fn test_find_bin_for_insert_returns_bin() {
12724 let tree = Tree::new(1, 128);
12725 tree.insert(b"existing".to_vec(), b"data".to_vec(), Lsn::new(1, 1))
12726 .unwrap();
12727 let bin = tree.find_bin_for_insert(b"newkey");
12728 assert!(bin.is_some());
12729 assert!(bin.unwrap().read().is_bin());
12730 }
12731
12732 #[test]
12733 fn test_find_bin_for_insert_same_as_parent_bin() {
12734 let tree = Tree::new(1, 128);
12735 tree.insert(b"foo".to_vec(), b"bar".to_vec(), Lsn::new(1, 1)).unwrap();
12736 let a = tree.get_parent_bin_for_child_ln(b"foo").unwrap();
12737 let b_arc = tree.find_bin_for_insert(b"foo").unwrap();
12738 assert!(
12739 Arc::ptr_eq(&a, &b_arc),
12740 "find_bin_for_insert must return the same BIN as get_parent_bin_for_child_ln"
12741 );
12742 }
12743
12744 // --- search_splits_allowed ---
12745
12746 #[test]
12747 fn test_search_splits_allowed_empty_tree() {
12748 let tree = Tree::new(1, 128);
12749 assert!(tree.search_splits_allowed(b"k").is_none());
12750 }
12751
12752 #[test]
12753 fn test_search_splits_allowed_finds_existing_key() {
12754 let tree = Tree::new(1, 8);
12755 for i in 0u32..10 {
12756 let k = format!("sa{:04}", i).into_bytes();
12757 tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12758 }
12759 for i in 0u32..10 {
12760 let k = format!("sa{:04}", i).into_bytes();
12761 let sr = tree.search_splits_allowed(&k);
12762 assert!(
12763 sr.is_some() && sr.unwrap().exact_parent_found,
12764 "search_splits_allowed must find sa{:04}",
12765 i
12766 );
12767 }
12768 }
12769
12770 #[test]
12771 fn test_search_splits_allowed_missing_key() {
12772 let tree = Tree::new(1, 8);
12773 tree.insert(b"present".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
12774 .unwrap();
12775 let sr = tree.search_splits_allowed(b"absent");
12776 assert!(
12777 sr.is_none_or(|r| !r.exact_parent_found),
12778 "search_splits_allowed must not find absent key"
12779 );
12780 }
12781
12782 // --- rebuild_in_list ---
12783
12784 #[test]
12785 fn test_rebuild_in_list_empty_tree() {
12786 let tree = Tree::new(1, 128);
12787 assert!(tree.rebuild_in_list().is_empty());
12788 }
12789
12790 #[test]
12791 fn test_rebuild_in_list_single_entry() {
12792 let tree = Tree::new(1, 128);
12793 tree.insert(b"one".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12794 let list = tree.rebuild_in_list();
12795 // Expect root IN + BIN = 2 nodes.
12796 assert_eq!(
12797 list.len(),
12798 2,
12799 "single-entry tree must have exactly 2 nodes"
12800 );
12801 let has_bin = list.iter().any(|a| a.read().is_bin());
12802 let has_in = list.iter().any(|a| !a.read().is_bin());
12803 assert!(has_bin, "list must contain at least one BIN");
12804 assert!(has_in, "list must contain at least one upper IN");
12805 }
12806
12807 #[test]
12808 fn test_rebuild_in_list_multi_entry() {
12809 let tree = Tree::new(1, 4);
12810 for i in 0u32..20 {
12811 let k = format!("ri{:04}", i).into_bytes();
12812 tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12813 }
12814 let list = tree.rebuild_in_list();
12815 let stats = tree.collect_stats();
12816 let expected_nodes = (stats.n_ins + stats.n_bins) as usize;
12817 assert_eq!(
12818 list.len(),
12819 expected_nodes,
12820 "rebuild_in_list must return all {} nodes",
12821 expected_nodes
12822 );
12823 }
12824
12825 // --- validate_in_list ---
12826
12827 #[test]
12828 fn test_validate_in_list_empty_tree() {
12829 let tree = Tree::new(1, 128);
12830 assert!(tree.validate_in_list(), "empty tree must be valid");
12831 }
12832
12833 #[test]
12834 fn test_validate_in_list_single_entry() {
12835 let tree = Tree::new(1, 128);
12836 tree.insert(b"v".to_vec(), b"data".to_vec(), Lsn::new(1, 1)).unwrap();
12837 assert!(tree.validate_in_list(), "single-entry tree must be valid");
12838 }
12839
12840 #[test]
12841 fn test_validate_in_list_multi_entry() {
12842 let tree = Tree::new(1, 4);
12843 for i in 0u32..20 {
12844 let k = format!("vl{:04}", i).into_bytes();
12845 tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12846 }
12847 assert!(tree.validate_in_list(), "multi-entry tree must be valid");
12848 }
12849
12850 #[test]
12851 fn test_validate_in_list_empty_in_fails() {
12852 // Manually build a tree where the root IN has no entries — invalid.
12853 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
12854 node_id: generate_node_id(),
12855 level: MAIN_LEVEL | 2,
12856 entries: vec![], // empty — structurally invalid
12857 targets: TargetRep::None,
12858 dirty: false,
12859 generation: 0,
12860 parent: None,
12861 lsn_rep: LsnRep::Empty,
12862 })));
12863 let tree = Tree::new(1, 128);
12864 *tree.root.write() = Some(root_arc);
12865 assert!(
12866 !tree.validate_in_list(),
12867 "a tree with an empty Internal node must fail validation"
12868 );
12869 }
12870
12871 // --- get_parent_in_for_child_in ---
12872
12873 #[test]
12874 fn test_get_parent_in_for_child_in_empty_tree() {
12875 let tree = Tree::new(1, 128);
12876 assert!(tree.get_parent_in_for_child_in(999).is_none());
12877 }
12878
12879 #[test]
12880 fn test_get_parent_in_for_child_in_single_entry() {
12881 // A single-insert tree has: root IN → BIN.
12882 // The root IN is the parent of the BIN.
12883 let tree = Tree::new(1, 128);
12884 tree.insert(b"p".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12885
12886 let root_arc = tree.get_root().as_ref().unwrap().clone();
12887 let bin_node_id = {
12888 let g = root_arc.read();
12889 match &*g {
12890 TreeNode::Internal(n) => {
12891 let child = n.child_ref(0).unwrap();
12892 let cg = child.read();
12893 match &*cg {
12894 TreeNode::Bottom(b) => b.node_id,
12895 _ => panic!("expected BIN"),
12896 }
12897 }
12898 _ => panic!("expected Internal root"),
12899 }
12900 };
12901
12902 let result = tree.get_parent_in_for_child_in(bin_node_id);
12903 assert!(result.is_some(), "must find parent of BIN");
12904 let (parent_arc, slot) = result.unwrap();
12905 assert!(Arc::ptr_eq(&parent_arc, &root_arc));
12906 assert_eq!(slot, 0);
12907 }
12908
12909 #[test]
12910 fn test_get_parent_in_for_child_in_not_found() {
12911 let tree = Tree::new(1, 128);
12912 tree.insert(b"x".to_vec(), b"y".to_vec(), Lsn::new(1, 1)).unwrap();
12913 assert!(tree.get_parent_in_for_child_in(u64::MAX).is_none());
12914 }
12915
#[test]
fn test_get_parent_in_for_child_in_multi_level() {
    // A small fan-out (4) with 20 keys forces several splits, so the parent
    // lookup has to descend through more than one level.
    let tree = Tree::new(1, 4);
    for i in 0u32..20 {
        let key = format!("ml{:04}", i).into_bytes();
        let value = vec![i as u8];
        tree.insert(key, value, Lsn::new(1, i)).unwrap();
    }

    // Gather the node id of every BIN reported by rebuild_in_list.
    let nodes = tree.rebuild_in_list();
    let bin_ids: Vec<u64> = nodes
        .iter()
        .filter_map(|node| {
            let guard = node.read();
            match &*guard {
                TreeNode::Bottom(bin) if guard.is_bin() => Some(bin.node_id),
                _ => None,
            }
        })
        .collect();

    // Each BIN must resolve to a parent, and that parent must not be a BIN.
    for bin_id in bin_ids {
        let Some((parent_arc, _slot)) = tree.get_parent_in_for_child_in(bin_id)
        else {
            panic!("every BIN (id={}) must have a parent IN", bin_id);
        };
        let parent_is_bin = parent_arc.read().is_bin();
        assert!(!parent_is_bin, "parent of a BIN must be an Internal node");
    }
}
12954
/// H-9 regression: `BinStub::strip_lns` must really drop the embedded slot
/// data and report the freed byte count, not merely adjust statistics.
///
/// Covers three situations: logged slots (clean and dirty) are stripped, a
/// NULL-LSN slot is kept, and a cursor-pinned BIN is left untouched.
#[test]
fn test_h9_strip_lns_actually_frees_data() {
    use crate::tree::{BinEntry, BinStub};
    use noxu_util::lsn::Lsn;

    // Slot factory: `len` zero bytes of embedded data, not known-deleted,
    // no expiration, with the requested dirty bit.
    let embedded = |len: usize, dirty: bool| BinEntry {
        data: Some(vec![0u8; len]),
        known_deleted: false,
        dirty,
        expiration_time: 0,
    };

    // Three slots with embedded data; the third is dirty. JE-faithful rule:
    // a slot with a valid LSN is strippable regardless of the dirty bit
    // (its value is recoverable from the log); only a NULL-LSN
    // (never-logged / deferred-write) slot is preserved.
    // T-2: the key rep is built with one key per slot so it stays aligned.
    let mut bin = BinStub {
        node_id: 1,
        level: 1,
        entries: vec![
            embedded(64, false),
            embedded(32, false),
            // dirty BUT logged -> still strippable (EVICTOR-RECLAIM-1)
            embedded(16, true),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: Lsn::from_u64(0),
        last_delta_lsn: Lsn::from_u64(0),
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![
            b"a".to_vec(),
            b"b".to_vec(),
            b"c".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Give all three slots VALID (non-NULL) LSNs so they count as logged.
    for (slot, offset) in [100, 200, 300].into_iter().enumerate() {
        bin.set_lsn(slot, Lsn::new(1, offset));
    }

    let freed = bin.strip_lns();
    assert_eq!(
        freed,
        64 + 32 + 16,
        "all logged slots stripped regardless of dirty (JE evictLNs)"
    );
    for clean_slot in &bin.entries[..2] {
        assert!(clean_slot.data.is_none(), "logged slot data dropped");
    }
    assert!(
        bin.entries[2].data.is_none(),
        "dirty-but-logged slot data dropped (recoverable from log)"
    );

    // A NULL-LSN slot (never logged) must be preserved: its only copy is
    // the in-memory value.
    bin.entries[0].data = Some(vec![0u8; 64]);
    bin.set_lsn(0, noxu_util::NULL_LSN);
    let freed_null = bin.strip_lns();
    assert_eq!(
        freed_null, 0,
        "NULL-LSN (unlogged) slot must NOT be stripped"
    );
    assert!(bin.entries[0].data.is_some(), "unlogged slot data preserved");

    // Cursor pin prevents stripping, even once the slot has a valid LSN again.
    bin.set_lsn(0, Lsn::new(1, 100));
    bin.cursor_count = 1;
    let freed_with_cursor = bin.strip_lns();
    assert_eq!(
        freed_with_cursor, 0,
        "strip_lns must skip when cursor pinned"
    );
    assert!(
        bin.entries[0].data.is_some(),
        "data preserved while cursor pinned"
    );
}
13050
// St-H4: `upper_in_floor_index` (binary search) must pick the same slot as a
// reference linear floor scan for every probe key: before all keys, after
// all keys, between two keys, and exactly on a key.
#[test]
fn test_upper_in_floor_index_matches_linear_scan() {
    // Reference linear floor scan (the pre-St-H4 algorithm). Slot 0 is the
    // virtual −∞ key and always qualifies; the answer is therefore the number
    // of consecutive later slots whose key is ≤ the probe key.
    fn linear_floor(entries: &[InEntry], key: &[u8]) -> usize {
        entries
            .iter()
            .skip(1)
            .take_while(|entry| entry.key.as_slice() <= key)
            .count()
    }

    let tree = Tree::new(1, 256);
    for n_slots in 1usize..40 {
        // Slot 0 carries the empty key (sorts first); the remaining slots get
        // strictly ascending big-endian two-byte keys spaced 4 apart so that
        // probes can land on, between, before, and after them.
        let entries: Vec<InEntry> = std::iter::once(Vec::new())
            .chain((1..n_slots).map(|i| ((i as u16) * 4).to_be_bytes().to_vec()))
            .map(|key| InEntry { key })
            .collect();

        let last_probe = n_slots as u16 * 4 + 4;
        for probe in 0u16..=last_probe {
            let key = probe.to_be_bytes().to_vec();
            let expected = linear_floor(&entries, &key);
            let actual = tree.upper_in_floor_index(&entries, &key);
            assert_eq!(
                actual, expected,
                "floor mismatch: n_slots={n_slots}, key={key:?}"
            );
        }
    }
}
13096}
13097
13098// ─────────────────────────────────────────────────────────────────────────
13099// St-H6: BIN split inherits expiration_in_hours from the splitting BIN.
13100// ─────────────────────────────────────────────────────────────────────────
13101
/// Unit test for the St-H6 fix: the right-half sibling created by
/// `split_child` inherits `expiration_in_hours` from the splitting BIN.
///
/// Before the fix, the sibling was always created with
/// `expiration_in_hours = false`, causing hours-granularity TTL entries
/// to be compared against `current_time_secs()` and treated as expired.
///
/// This test:
/// 1. Creates a tree with max_entries = 4 and installs a BIN with 4 entries
///    directly (bypassing `update_key_expiration`) with non-zero
///    `expiration_time` and `expiration_in_hours = true` on the BIN.
/// 2. Triggers a split.
/// 3. Asserts that the right-half sibling has `expiration_in_hours = true`
///    (inherited, not hardcoded false).
///
/// The expiration value is derived from the current clock (now + 1 000 h,
/// in hours since the Unix epoch) rather than hard-coded, so the "not yet
/// expired" assertion does not start failing once a fixed date has passed.
#[test]
fn test_split_child_sibling_inherits_expiration_in_hours() {
    use crate::tree::{BIN_LEVEL, BinEntry, BinStub, MAIN_LEVEL, TreeNode};
    use noxu_util::{Lsn, NULL_LSN};
    use parking_lot::RwLock;
    use std::sync::Arc;

    // Hours-since-epoch expiration 1 000 h in the future.
    // NOTE(review): assumes `noxu_util::ttl::is_expired` reads the system
    // wall clock, as the surrounding St-H6 commentary describes.
    let now_secs = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("system clock must be after the Unix epoch")
        .as_secs();
    let exp_hours = u32::try_from(now_secs / 3600 + 1_000)
        .expect("hours-since-epoch must fit in u32");

    // Manually build a tree with one BIN (4 entries, expiration_in_hours=true).
    let tree = Tree::new(99, 4);

    // Pre-populate the tree root for the test.
    let entries: Vec<BinEntry> = (0u8..4u8)
        .map(|k| BinEntry {
            data: Some(vec![k, k]),
            known_deleted: false,
            dirty: true,
            expiration_time: exp_hours, // hours-since-epoch, still in the future
        })
        .collect();
    let bin_keys: Vec<Vec<u8>> = (0u8..4u8).map(|k| vec![k]).collect();
    let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true, // hours-granularity entries
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(bin_keys),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let root = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![], // virtual key for slot 0 (-infinity)
        }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&bin))]),
        dirty: true,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    {
        // Link the BIN back to its parent before installing the root.
        let mut b = bin.write();
        b.set_parent(Some(Arc::downgrade(&root)));
    }
    *tree.root.write() = Some(Arc::clone(&root));

    // Trigger split_child on the root.
    Tree::split_child(
        &root,
        0,
        4,
        Lsn::new(1, 500),
        SplitHint::Normal,
        &[],
        None,
        false,
        None,
    )
    .expect("split_child should succeed");

    // After the split: root has two children — left BIN and right sibling.
    let root_guard = root.read();
    let TreeNode::Internal(ref in_node) = *root_guard else {
        panic!("root should be Internal after split");
    };
    assert_eq!(
        in_node.entries.len(),
        2,
        "root should have 2 entries (children) after split"
    );

    // Right-half sibling is at slot 1.
    let sibling_arc = in_node
        .get_child(1)
        .expect("right-half sibling should exist at slot 1");
    let sibling_guard = sibling_arc.read();
    let TreeNode::Bottom(ref sibling) = *sibling_guard else {
        panic!("right sibling should be a BIN");
    };

    assert!(
        sibling.expiration_in_hours,
        "St-H6: right-half sibling expiration_in_hours must be true \
         (inherited from splitting BIN); got false"
    );

    // Verify the sibling's entries have the expected expiration_time.
    for e in &sibling.entries {
        assert_eq!(
            e.expiration_time, exp_hours,
            "sibling entry expiration_time should be preserved: got {}",
            e.expiration_time
        );
        // With in_hours=true, is_expired should return false (future).
        assert!(
            !noxu_util::ttl::is_expired(
                e.expiration_time,
                sibling.expiration_in_hours
            ),
            "St-H6: sibling TTL entry ({}) should NOT appear expired \
             with expiration_in_hours={}",
            e.expiration_time,
            sibling.expiration_in_hours
        );
    }
}
13234
/// Regression confirmation: `is_expired` with the wrong `in_hours = false`
/// flag falsely expires an hours-granularity value, while the correct
/// `in_hours = true` flag does not.
///
/// The hours value is computed as "now + 1 000 h" (hours since the Unix
/// epoch) instead of a hard-coded constant, so the test keeps passing after
/// any fixed calendar date. Read as seconds, that value is far in the past,
/// which is what the second assertion relies on.
#[test]
fn test_hours_value_is_expired_only_with_false_flag() {
    // NOTE(review): assumes `noxu_util::ttl::is_expired` reads the system
    // wall clock — confirm if a mockable clock is ever introduced.
    let now_secs = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("system clock must be after the Unix epoch")
        .as_secs();
    // Hours-since-epoch value for "now" + 1 000 h TTL.
    let exp_hours = u32::try_from(now_secs / 3600 + 1_000)
        .expect("hours-since-epoch must fit in u32");

    // Correctly treated as hours: not expired.
    assert!(
        !noxu_util::ttl::is_expired(exp_hours, true),
        "exp_hours={exp_hours} should NOT be expired when in_hours=true"
    );
    // Incorrectly treated as seconds (pre-fix right sibling): expired.
    assert!(
        noxu_util::ttl::is_expired(exp_hours, false),
        "exp_hours={exp_hours} should be expired when in_hours=false \
         (St-H6 demonstrates the wrong-flag scenario)"
    );
}
13253
13254// =============================================================================
13255// IN-redo unit tests (DRIFT-1 / Stage 1)
13256// =============================================================================
13257
#[cfg(test)]
mod in_redo_tests {
    use super::*;

    /// Build a full BIN with `n` slots and return its serialised bytes.
    ///
    /// Slot `i` has key `[i as u8]`, embedded data `[i as u8]`, and LSN
    /// `Lsn::new(1, i + 1)`.
    fn make_bin_bytes(node_id: u64, n: usize) -> Vec<u8> {
        let mut bin = BinStub {
            node_id,
            level: BIN_LEVEL,
            entries: Vec::new(),
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: noxu_util::NULL_LSN,
            last_delta_lsn: noxu_util::NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
            lsn_rep: LsnRep::Empty,
            keys: KeyRep::new(),
            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        };
        // T-2/T-3: go through the insert path so entries/keys/lsn_rep stay
        // aligned with each other.
        for slot in 0..n {
            let byte = slot as u8;
            let slot_lsn = Lsn::new(1, (slot + 1) as u32);
            bin.insert_with_prefix(vec![byte], slot_lsn, Some(vec![byte]));
        }
        bin.serialize_full()
    }

    /// Replaying a root BIN into an empty tree installs it as the root.
    ///
    /// JE RecoveryManager.recoverRootIN: `root == null` path.
    #[test]
    fn test_recover_in_redo_root_bin_inserted_into_empty_tree() {
        let tree = Tree::new(42, 128);
        assert!(tree.is_empty());

        let bytes = make_bin_bytes(1, 3);
        let is_root = true;
        let is_bin = true;
        let result =
            tree.recover_in_redo(Lsn::new(1, 100), is_root, is_bin, &bytes);

        assert_eq!(result, InRedoResult::Inserted, "expected Inserted");
        // The three serialised slots are now visible through the tree.
        assert_eq!(tree.count_entries(), 3);
    }

    /// A logged root BIN with a newer LSN replaces the in-memory root.
    ///
    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) < 0` path.
    #[test]
    fn test_recover_in_redo_root_bin_replaced_when_log_newer() {
        let tree = Tree::new(42, 128);

        // First replay: a 2-entry root at the older LSN.
        let stale_bytes = make_bin_bytes(1, 2);
        tree.recover_in_redo(Lsn::new(1, 50), true, true, &stale_bytes);
        assert_eq!(tree.count_entries(), 2);

        // Second replay: a 4-entry root at a newer LSN wins.
        let fresh_bytes = make_bin_bytes(1, 4);
        let outcome =
            tree.recover_in_redo(Lsn::new(1, 100), true, true, &fresh_bytes);
        assert_eq!(outcome, InRedoResult::Replaced);
        assert_eq!(tree.count_entries(), 4);
    }

    /// A logged root BIN with an older LSN leaves the newer root in place.
    ///
    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) >= 0` skip path.
    #[test]
    fn test_recover_in_redo_root_bin_skipped_when_tree_newer() {
        let tree = Tree::new(42, 128);

        // Install the newer, 4-entry root first.
        let fresh_bytes = make_bin_bytes(1, 4);
        tree.recover_in_redo(Lsn::new(1, 200), true, true, &fresh_bytes);

        // Replaying the older, 2-entry version must be a no-op.
        let stale_bytes = make_bin_bytes(1, 2);
        let outcome =
            tree.recover_in_redo(Lsn::new(1, 100), true, true, &stale_bytes);
        assert_eq!(outcome, InRedoResult::Skipped);
        // Tree still holds the newer 4-entry version.
        assert_eq!(tree.count_entries(), 4);
    }

    /// `serialize_full` output decodes back through `deserialize_bin` with
    /// the node id and every full key intact.
    #[test]
    fn test_deserialize_bin_round_trip() {
        let bytes = make_bin_bytes(99, 5);
        let bin = Tree::deserialize_bin(&bytes).expect("must deserialize");

        assert_eq!(bin.node_id, 99);
        assert_eq!(bin.entries.len(), 5);
        for (slot, _) in bin.entries.iter().enumerate() {
            let full_key = bin.get_full_key(slot).unwrap();
            assert_eq!(full_key, vec![slot as u8]);
        }
    }

    /// `write_to_bytes` output for an Internal node decodes back through
    /// `deserialize_upper_in` with id, level and slot keys intact.
    #[test]
    fn test_deserialize_upper_in_round_trip() {
        let expected_keys: [Vec<u8>; 2] = [vec![1, 2, 3], vec![4, 5, 6]];

        // Two-slot upper IN with no resident children.
        let original = TreeNode::Internal(InNodeStub {
            node_id: 77,
            level: 0x10002,
            entries: expected_keys
                .iter()
                .cloned()
                .map(|key| InEntry { key })
                .collect(),
            targets: TargetRep::None,
            dirty: false,
            generation: 0,
            parent: None,
            lsn_rep: LsnRep::Empty,
        });

        let bytes = original.write_to_bytes();
        let restored =
            Tree::deserialize_upper_in(&bytes).expect("must deserialize");

        assert_eq!(restored.node_id, 77);
        assert_eq!(restored.level, 0x10002);
        assert_eq!(restored.entries.len(), 2);
        assert_eq!(restored.entries[0].key, expected_keys[0]);
        assert_eq!(restored.entries[1].key, expected_keys[1]);
    }
}
13389
13390// --- Part 2 acceptance tests: key_prefixing flag (DRIFT-3) ---
13391//
13392// JE `IN.computeKeyPrefix` returns null when `databaseImpl.getKeyPrefixing()`
13393// is false, so no prefix compression is ever applied to those BINs. Noxu was
13394// always applying prefix compression. This checks that the flag is honoured.
13395//
13396// Ref: `IN.java computeKeyPrefix` ~line 2456,
13397// `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
#[cfg(test)]
mod key_prefixing_tests {
    use super::*;

    /// Walk down slot 0 of every Internal node starting at `node` and return
    /// the first (leftmost) BIN reached.
    fn find_first_bin(node: &Arc<RwLock<TreeNode>>) -> Arc<RwLock<TreeNode>> {
        let mut current = Arc::clone(node);
        loop {
            let child = match &*current.read() {
                TreeNode::Bottom(_) => None,
                TreeNode::Internal(n) => {
                    Some(Arc::clone(n.child_ref(0).expect("child")))
                }
            };
            match child {
                Some(next) => current = next,
                None => return current,
            }
        }
    }

    /// The key `record:` followed by the single byte `i`.
    fn record_key(i: u8) -> Vec<u8> {
        let mut key = b"record:".to_vec();
        key.push(i);
        key
    }

    /// Insert eight keys that share the long common prefix `record:`.
    fn insert_record_keys(tree: &Tree) {
        let lsn = noxu_util::Lsn::new(1, 10);
        for i in 0u8..8 {
            tree.insert(record_key(i), vec![i], lsn).expect("insert");
        }
    }

    /// Leftmost BIN of `tree`; panics if the tree has no root.
    fn first_bin_of(tree: &Tree) -> Arc<RwLock<TreeNode>> {
        let root = tree.get_root().expect("root");
        find_first_bin(&root)
    }

    /// With `key_prefixing = false` (the default) the BIN's `key_prefix`
    /// stays empty after inserts and keys read back as full keys.
    #[test]
    fn test_key_prefixing_false_stores_full_keys() {
        // Default is key_prefixing = false.
        let tree = Tree::new(1, 16);
        assert!(!tree.key_prefixing, "default must be false");

        insert_record_keys(&tree);

        let bin_arc = first_bin_of(&tree);
        let guard = bin_arc.read();
        let bin = match &*guard {
            TreeNode::Bottom(b) => b,
            TreeNode::Internal(_) => panic!("must be a BIN"),
        };
        assert!(
            bin.key_prefix.is_empty(),
            "key_prefix must be empty when key_prefixing=false, got {:?}",
            bin.key_prefix
        );
        assert_eq!(bin.entries.len(), 8);
        // Keys must be stored as full keys.
        assert_eq!(bin.get_full_key(0).unwrap(), record_key(0));
    }

    /// With `key_prefixing = true`, keys sharing a common prefix are
    /// compressed: the BIN's `key_prefix` becomes exactly that prefix.
    #[test]
    fn test_key_prefixing_true_compresses_keys() {
        let mut tree = Tree::new(1, 16);
        tree.set_key_prefixing(true);

        insert_record_keys(&tree);

        let bin_arc = first_bin_of(&tree);
        let guard = bin_arc.read();
        let bin = match &*guard {
            TreeNode::Bottom(b) => b,
            TreeNode::Internal(_) => panic!("must be a BIN"),
        };
        // Prefix compression must kick in: all keys share "record:".
        assert!(
            !bin.key_prefix.is_empty(),
            "key_prefix must be non-empty when key_prefixing=true"
        );
        assert_eq!(
            bin.key_prefix,
            b"record:".to_vec(),
            "prefix must be the common prefix of all inserted keys"
        );
    }

    /// A tree built with a custom comparator leaves `key_prefix` empty even
    /// when `key_prefixing` is switched on: the comparator insert path
    /// (`insert_cmp`) does not touch the prefix.
    #[test]
    fn test_key_prefixing_custom_comparator_no_prefix() {
        let cmp: KeyComparatorFn = Arc::new(|a: &[u8], b: &[u8]| a.cmp(b));
        let mut tree = Tree::new_with_comparator(1, 16, cmp);
        // Enable key_prefixing — should have no effect via insert_cmp path.
        tree.set_key_prefixing(true);

        insert_record_keys(&tree);

        let bin_arc = first_bin_of(&tree);
        let guard = bin_arc.read();
        let bin = match &*guard {
            TreeNode::Bottom(b) => b,
            TreeNode::Internal(_) => panic!("must be a BIN"),
        };
        // Custom-comparator path (insert_cmp) does not set key_prefix.
        assert!(
            bin.key_prefix.is_empty(),
            "custom-comparator path must not set key_prefix"
        );
    }
}
13512
13513// --- Part 1 acceptance tests: splitSpecial heuristic (DRIFT-1) ---
13514//
13515// JE `IN.splitSpecial` / `Tree.forceSplit`: when all routing decisions during
13516// descent are leftmost (`AllLeft`) or rightmost (`AllRight`), the split index
13517// is forced to 1 or `n-1` respectively instead of `n/2`. This halves the
13518// number of splits for monotonically increasing / decreasing key workloads
13519// (sequential append / prepend) because each split leaves the BIN near-full.
13520//
13521// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
#[cfg(test)]
mod split_special_tests {
    use super::*;

    /// Test helper: descend from `node_arc` to the BIN that holds (or would
    /// hold) `key` and return its arc.
    ///
    /// At each Internal node the routing slot is the last slot of the leading
    /// run in which slot 0 always qualifies and later slots qualify while
    /// their key is ≤ `key`. Returns `None` for an Internal node with no
    /// entries or when the chosen child is not available.
    fn find_bin_arc_for_key(
        node_arc: &Arc<RwLock<TreeNode>>,
        key: &[u8],
    ) -> Option<Arc<RwLock<TreeNode>>> {
        let mut cursor = Arc::clone(node_arc);
        loop {
            let child = match &*cursor.read() {
                TreeNode::Bottom(_) => None,
                TreeNode::Internal(n) => {
                    if n.entries.is_empty() {
                        return None;
                    }
                    // Slot 0 is implicit; count the following slots that
                    // still sort at or before the key.
                    let slot = n
                        .entries
                        .iter()
                        .skip(1)
                        .take_while(|e| e.key.as_slice() <= key)
                        .count();
                    Some(n.get_child(slot)?)
                }
            };
            match child {
                Some(next) => cursor = next,
                None => return Some(cursor),
            }
        }
    }

    /// Count the BINs reachable from `node` through resident children (DFS).
    fn count_bins(node: &Arc<RwLock<TreeNode>>) -> usize {
        let guard = node.read();
        let TreeNode::Internal(n) = &*guard else {
            return 1;
        };
        let mut total = 0usize;
        for child in n.resident_children().iter() {
            total += count_bins(child);
        }
        total
    }

    /// Sum the entry counts of every BIN reachable through resident children.
    fn count_keys(node: &Arc<RwLock<TreeNode>>) -> usize {
        let guard = node.read();
        let n = match &*guard {
            TreeNode::Bottom(b) => return b.entries.len(),
            TreeNode::Internal(n) => n,
        };
        let mut total = 0usize;
        for child in n.resident_children().iter() {
            total += count_keys(child);
        }
        total
    }

    /// Number of entries in the leftmost BIN (always following slot 0).
    fn leftmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
        let mut current = Arc::clone(node);
        loop {
            let next = match &*current.read() {
                TreeNode::Bottom(b) => return b.entries.len(),
                TreeNode::Internal(n) => {
                    Arc::clone(n.child_ref(0).expect("child"))
                }
            };
            current = next;
        }
    }

    /// Number of entries in the rightmost BIN (always following the last slot).
    fn rightmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
        let mut current = Arc::clone(node);
        loop {
            let next = match &*current.read() {
                TreeNode::Bottom(b) => return b.entries.len(),
                TreeNode::Internal(n) => {
                    let last_slot = n.entries.len().saturating_sub(1);
                    Arc::clone(n.child_ref(last_slot).expect("child"))
                }
            };
            current = next;
        }
    }

    /// Build a tree (database id 1) with the given fan-out and insert every
    /// key, big-endian encoded, in iteration order with a one-byte value.
    fn build_tree_from_keys(
        max_entries: usize,
        keys: impl Iterator<Item = u32>,
    ) -> Tree {
        let tree = Tree::new(1, max_entries);
        let lsn = noxu_util::Lsn::new(1, 100);
        for k in keys {
            tree.insert(k.to_be_bytes().to_vec(), vec![0u8], lsn)
                .expect("insert");
        }
        tree
    }

    /// `splitSpecial` ascending: each right-side split leaves the left BIN
    /// near-full, so sequential appends should need fewer BINs (and a higher
    /// average fill) than plain midpoint splitting would.
    ///
    /// JE criterion: `allRightSideDescent` → `splitIndex = nEntries - 1`.
    /// Only one entry moves to the new right sibling, which then absorbs the
    /// following inserts and fills normally.
    #[test]
    fn test_split_special_ascending_fewer_bins_than_midpoint() {
        let max_entries = 8usize;
        let n_keys = 200usize;

        // Ascending keys route to the rightmost slot at every level (AllRight).
        let tree_special =
            build_tree_from_keys(max_entries, 0u32..n_keys as u32);

        let root_special = tree_special.get_root().expect("root must exist");
        let bins_special = count_bins(&root_special);
        let keys_special = count_keys(&root_special);

        // All keys must be present.
        assert_eq!(keys_special, n_keys, "all keys must be stored");

        // Midpoint splitting leaves about max_entries / 2 keys per BIN, i.e.
        // up to ceil(n_keys / (max_entries / 2)) BINs. splitSpecial keeps
        // n-1 entries on the left, so it must land strictly below that.
        let midpoint_upper_bound = n_keys.div_ceil(max_entries / 2);
        assert!(
            bins_special < midpoint_upper_bound,
            "splitSpecial should produce fewer BINs than midpoint split: \
             got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
        );

        // Overall key density must beat the midpoint baseline as well.
        let avg_fill = keys_special as f64 / bins_special as f64;
        let midpoint_fill = (max_entries / 2) as f64;
        assert!(
            avg_fill > midpoint_fill,
            "average fill per BIN with splitSpecial ({avg_fill:.1}) should \
             exceed midpoint baseline ({midpoint_fill})"
        );
    }

    /// `splitSpecial` descending: every routing decision is at slot 0
    /// (`AllLeft`), so the split index is forced to 1 and the right sibling
    /// keeps almost all entries while the left node keeps just one.
    ///
    /// JE criterion: `allLeftSideDescent` → `splitIndex = 1`.
    #[test]
    fn test_split_special_descending_fewer_bins_than_midpoint() {
        let max_entries = 8usize;
        let n_keys = 200usize;

        let tree_special =
            build_tree_from_keys(max_entries, (0u32..n_keys as u32).rev());

        let root_special = tree_special.get_root().expect("root must exist");
        let bins_special = count_bins(&root_special);
        let keys_special = count_keys(&root_special);

        assert_eq!(keys_special, n_keys, "all keys must be stored");

        let midpoint_upper_bound = n_keys.div_ceil(max_entries / 2);
        assert!(
            bins_special < midpoint_upper_bound,
            "splitSpecial descending should produce fewer BINs: \
             got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
        );
    }

    /// Randomly ordered inserts must be unaffected by splitSpecial: descent
    /// is rarely all-left or all-right, so splits default to the midpoint.
    /// The test checks that every key is stored and findable afterwards.
    #[test]
    fn test_split_special_random_inserts_stay_balanced() {
        use std::collections::BTreeSet;

        let max_entries = 8usize;

        // Deterministic permutation of 0..200: Knuth shuffle driven by a
        // fixed-seed linear congruential generator.
        let mut keys: Vec<u32> = (0u32..200).collect();
        let mut rng: u64 = 0xdeadbeef_cafebabe;
        for i in (1..keys.len()).rev() {
            rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1);
            let j = (rng >> 33) as usize % (i + 1);
            keys.swap(i, j);
        }

        let tree = build_tree_from_keys(max_entries, keys.iter().copied());
        let inserted: BTreeSet<u32> = keys.iter().copied().collect();

        let root = tree.get_root().expect("root");
        assert_eq!(
            count_keys(&root),
            inserted.len(),
            "all random keys must be stored"
        );

        // Verify every key is findable.
        for k in &inserted {
            let key = k.to_be_bytes().to_vec();
            let hit = tree.search(&key).map_or(false, |r| r.is_exact_match());
            assert!(hit, "random key {k} must be findable after insert");
        }
    }

    /// TREE-F1: a `known_deleted` BIN slot must read as ABSENT on an exact
    /// lookup and must be SKIPPED by scans, matching JE.
    ///
    /// JE contract:
    ///   * `IN.findEntry` (IN.java:3197): an exact match that lands on a
    ///     known-deleted slot returns -1 (ABSENT).
    ///   * `CursorImpl.lockAndGetCurrent` (CursorImpl.java:2062-2064): a
    ///     step that lands on `isEntryKnownDeleted(index)` returns null, so
    ///     the `getNext` loop advances past it (the slot is skipped).
    ///
    /// KD slots legitimately exist in live BINs during BIN-delta
    /// reconstitution (`mutate_to_full_bin` applies delta KD slots) until
    /// the compressor reclaims them. The test reaches that state directly by
    /// flagging a slot in the BIN arc, then asserts that the user-facing
    /// read/scan paths do not surface it.
    #[test]
    fn test_tree_f1_known_deleted_slot_is_absent_and_skipped() {
        let tree = Tree::new(1, 8);
        // Populate a BIN with several live slots.
        for i in 0..6u32 {
            let key = format!("kd{i:04}").into_bytes();
            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
        }

        // Flag a middle key's slot known_deleted directly in the BIN,
        // modelling a delta-applied tombstone not yet reclaimed.
        let kd_key = b"kd0003".to_vec();
        {
            let root = tree.get_root().expect("root");
            let bin_arc = find_bin_arc_for_key(&root, &kd_key).expect("bin");
            let mut guard = bin_arc.write();
            let TreeNode::Bottom(bin) = &mut *guard else {
                panic!("expected BIN");
            };
            let kd_slot = (0..bin.entries.len())
                .find(|&i| {
                    bin.get_full_key(i).as_deref() == Some(kd_key.as_slice())
                })
                .expect("kd key slot");
            bin.entries[kd_slot].known_deleted = true;
        }

        // (a) exact lookup via Tree::search must report NOT found.
        let search_hit =
            tree.search(&kd_key).map_or(false, |r| r.is_exact_match());
        assert!(
            !search_hit,
            "TREE-F1: Tree::search must report a known_deleted slot as absent \
             (IN.findEntry IN.java:3197)"
        );

        // (a) exact lookup via Tree::search_with_data must report NOT found.
        let fetched = tree.search_with_data(&kd_key).expect("slot fetch");
        assert!(
            !fetched.found,
            "TREE-F1: Tree::search_with_data must report a known_deleted slot \
             as absent (IN.findEntry IN.java:3197)"
        );

        // Live neighbours must still be found.
        for live in [b"kd0002".to_vec(), b"kd0004".to_vec()] {
            let live_hit =
                tree.search(&live).map_or(false, |r| r.is_exact_match());
            assert!(live_hit, "live neighbour must remain findable");
        }

        // (b) a scan-facing BIN dump (descend_to_edge_bin / get_next_bin /
        // get_prev_bin) returns slots verbatim WITH the known_deleted flag
        // set, so the cursor can skip them (CursorImpl.java:2062-2064). The
        // contract here is: the KD slot is never reported as a LIVE entry,
        // i.e. every slot either has a different key or is flagged.
        let root = tree.get_root().expect("root");
        let edge = Tree::descend_to_edge_bin(&root, true).expect("edge bin");
        assert!(
            edge.iter().all(|(e, _, k)| k != &kd_key || e.known_deleted),
            "TREE-F1: scan must not surface a known_deleted slot as live \
             (CursorImpl.java:2062-2064)"
        );
        for anchor in [b"kd0000".to_vec(), b"kd0005".to_vec()] {
            let neighbours = tree
                .get_next_bin(&anchor)
                .into_iter()
                .chain(tree.get_prev_bin(&anchor));
            for entries in neighbours {
                assert!(
                    entries
                        .iter()
                        .all(|(e, _, k)| k != &kd_key || e.known_deleted),
                    "TREE-F1: get_next_bin/get_prev_bin must not surface a \
                     known_deleted slot as live"
                );
            }
        }

        // first_entry_at_or_after must skip a KD slot at the boundary.
        if let Some((k, _, _)) = tree.first_entry_at_or_after(&kd_key) {
            assert_ne!(
                k, kd_key,
                "TREE-F1: first_entry_at_or_after must skip a known_deleted \
                 slot (CursorImpl.java:2062-2064)"
            );
        }

        // The compressor KD-iteration path must STILL see the slot — the fix
        // only changes the user-facing read predicate, not the maintenance
        // iteration that exists to reclaim KD slots.
        let kd_bins = tree.collect_bins_with_known_deleted();
        assert!(
            !kd_bins.is_empty(),
            "TREE-F1: collect_bins_with_known_deleted must still observe the \
             KD slot so the compressor can reclaim it"
        );
    }
}
13852}