noxu_tree/tree.rs
1//! B+tree implementation.
2//!
3//!
4//! Tree implements the B+tree. It provides search, insert, and delete
5//! operations on the tree structure. The tree uses latch-coupling for
6//! concurrent access: when traversing down the tree, the parent latch
7//! is released after the child latch is acquired.
8//!
9//! # Architecture
10//!
11//! The tree has a hierarchical structure:
12//! - Internal Nodes (IN) at levels 2 and above
13//! - Bottom Internal Nodes (BIN) at level 1
14//! - Leaf Nodes (LN) containing actual data
15//!
16//! # Locking Strategy
17//!
18//! - Root latch protects the root pointer itself
19//! - Each node has its own latch for concurrent access
20//! - Search uses latch-coupling: acquire child, release parent
21//! - Modifications may require exclusive latches
22
23use crate::error::TreeError;
24use crate::key::{create_key_prefix, get_key_prefix_length};
25use crate::search_result::SearchResult;
26use noxu_latch::{LatchContext, SharedLatch};
27use noxu_util::{Lsn, NULL_LSN};
28use parking_lot::RwLock;
29use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
30use std::sync::{Arc, Weak};
31
/// Observer that mirrors JE's `INList` feeding the evictor's `LRUList`s.
///
/// The tree owns no eviction policy of its own; instead it notifies a
/// registered listener whenever an IN/BIN node enters the resident cache, is
/// accessed, or is removed. The `Evictor` (in `noxu-evictor`) implements this
/// trait, but the dependency is one-way (`noxu-evictor` → `noxu-tree`), so the
/// tree refers to the listener only through this trait object — avoiding a
/// circular crate dependency.
///
/// Implementations must be `Send + Sync` (supertrait bounds), so a listener
/// can be shared across threads behind an `Arc<dyn InListListener>`.
///
/// JE reference: `IN.fetchTarget` / split / `rebuildINList` call
/// `Evictor.addBack`; node access calls `Evictor.moveBack`; node removal
/// calls `Evictor.remove`.
pub trait InListListener: Send + Sync {
    /// A node has just become resident in the cache (JE `Evictor.addBack`).
    ///
    /// `node_id` identifies the node that was added.
    fn note_ins_added(&self, node_id: u64);

    /// A resident node was accessed (JE `Evictor.moveBack` — LRU touch).
    ///
    /// `node_id` identifies the node that was accessed.
    fn note_ins_accessed(&self, node_id: u64);

    /// A node was removed from the cache (JE `Evictor.remove`).
    ///
    /// `node_id` identifies the node that was removed.
    fn note_ins_removed(&self, node_id: u64);
}
52
// Level and flag constants re-exported here for tree-internal use.
//
// NOTE(review): the meanings below are inferred from the names and bit
// values (they look like JE's `IN` level constants) — confirm against the
// code that consumes them.

/// Level flag bit `0x20000`; by name, tags levels of the DB-mapping tree.
pub const DBMAP_LEVEL: i32 = 0x20000;
/// Level flag bit `0x10000`; by name, tags levels of a main (user) tree.
pub const MAIN_LEVEL: i32 = 0x10000;
/// Mask selecting the low 16 bits, i.e. the level number without flag bits.
pub const LEVEL_MASK: i32 = 0x0ffff;
/// Sentinel `-1`; by name, a value below every real level.
pub const MIN_LEVEL: i32 = -1;
/// Level 1 tagged with the `MAIN_LEVEL` bit (the module docs place BINs at
/// level 1).
pub const BIN_LEVEL: i32 = MAIN_LEVEL | 1;
/// Flag bit 16. Numerically equal to `MAIN_LEVEL` (`1 << 16 == 0x10000`), so
/// the two must never be OR-ed into the same value — presumably this one is
/// used on search results rather than levels; confirm.
pub const EXACT_MATCH: i32 = 1 << 16;
/// Flag bit 17; by name, signals that an insert succeeded.
pub const INSERT_SUCCESS: i32 = 1 << 17;
61
/// Per-slot fixed memory overhead for a BIN entry, in bytes (DBI-23).
///
/// Computed as `size_of::<BinEntry>() + LsnRep::BYTES_PER_LSN_ENTRY`.
///
/// The first term is the heap footprint of one `BinEntry` *struct* as it
/// lives inside the BIN's `Vec<BinEntry>` buffer — NOT counting the
/// variable-length key and data bytes, which are separate heap allocations
/// counted on top of this.
///
/// Faithful to JE `IN.getEntryInMemorySize` + the per-slot `entryStates` /
/// LSN-array overhead folded into `IN.computeMemorySize` (IN.java ~4632):
/// JE measures the slot's fixed cost with `Sizeof` on the JVM; Rust has a
/// fixed struct layout so `size_of::<BinEntry>()` is exact.
///
/// T-2/T-3: the per-slot `key` (`Vec<u8>` header) and `lsn` (`u64`) were
/// hoisted out of `BinEntry` into the node-level `KeyRep`/`LsnRep`. The
/// `size_of::<BinEntry>()` therefore shrank; the second term adds back the
/// packed per-slot LSN-rep cost (`LsnRep::BYTES_PER_LSN_ENTRY`, 4 bytes) so
/// the incremental live counter still approximates the walked heap (the key
/// bytes are charged separately as `key.len()` at the call site, matching the
/// compact key rep).
///
/// Derived (not hard-coded) so a layout change to `BinEntry` is tracked
/// automatically — see `bin_stub_conformance` for the drift guard.
pub const BIN_ENTRY_OVERHEAD: usize =
    std::mem::size_of::<BinEntry>() + LsnRep::BYTES_PER_LSN_ENTRY;
84
/// Per-slot fixed memory overhead for an IN entry, in bytes (DBI-23).
///
/// Equal to `size_of::<InEntry>()`: the heap footprint of one `InEntry`
/// struct inside the IN's `Vec<InEntry>` buffer (key bytes are counted
/// separately). Unlike `BIN_ENTRY_OVERHEAD`, no per-slot LSN-rep term is
/// added here.
///
/// JE reference: `IN.getEntryInMemorySize` for an upper IN plus the per-slot
/// state/LSN/target overhead from `IN.computeMemorySize`.
pub const IN_ENTRY_OVERHEAD: usize = std::mem::size_of::<InEntry>();
92
/// Type alias for the key comparator used by sorted-duplicate databases.
///
/// The comparator takes two full (uncompressed) keys and returns their
/// relative ordering. For sorted-dup databases this is `DupKeyData::compare`,
/// which splits each key into primary + data parts and applies separate
/// comparators to each. For normal databases the tree's comparator field is
/// `None` and lexicographic byte comparison is used.
///
/// The closure is reference-counted (`Arc`) and `Send + Sync`, so one
/// comparator can be shared between threads.
///
/// JE reference: `DatabaseImpl.btreeComparator` /
/// `DatabaseImpl.dupComparator`.
pub type KeyComparatorFn =
    Arc<dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering + Send + Sync>;
104
/// Combined search result carrying slot data and the BIN arc, returned by
/// [`Tree::search_with_data`].
///
/// Avoids the double-descent pattern where `Tree::search` checked key
/// existence and a second call re-descended to fetch the actual slot bytes.
/// One descent now serves both purposes (Wave-11-I optimisation).
pub struct SlotFetch {
    /// `true` if an exact key match was found and is not expired.
    pub found: bool,
    /// Data bytes for the slot (`None` when `found` is `false`).
    pub data: Option<Vec<u8>>,
    /// Raw slot LSN as `u64`; zero when `found` is `false`.
    pub lsn: u64,
    /// Slot index within the BIN. Set to the actual BIN slot index when
    /// `found` is `true`; `0` otherwise.
    ///
    /// Used by `CursorImpl` to set `current_index` correctly so that
    /// `retrieve_next` advances to the right slot after a search.
    pub slot_index: usize,
    /// Arc to the BIN that the descent reached. The field is not an
    /// `Option`: a `SlotFetch` always carries a BIN, regardless of whether
    /// `found` is `true`.
    pub bin_arc: Arc<RwLock<TreeNode>>,
}
128
/// The B+tree.
///
/// This is the main tree structure that manages the B+tree nodes and
/// provides operations for search, insert, delete, and tree maintenance.
pub struct Tree {
    /// Database ID this tree belongs to.
    database_id: u64,

    /// Maximum entries per node (from config).
    max_entries_per_node: usize,

    /// Root of the tree. None if tree is empty.
    ///
    /// Wrapped in `RwLock` so that `insert`, `delete`, and other mutating
    /// operations can take `&self` (interior mutability), enabling concurrent
    /// access to different BIN nodes without requiring a global `&mut Tree`
    /// borrow. The root pointer itself is only written during root splits
    /// and initial creation; all other access is read-only.
    ///
    /// JE reference: `Tree.root`, protected by the root latch.
    root: RwLock<Option<Arc<RwLock<TreeNode>>>>,

    /// Latch protecting the root reference itself.
    /// Must be held when changing the root pointer.
    root_latch: SharedLatch,

    /// LSN at which the current root IN/BIN was last logged.
    ///
    /// Used by the IN-redo currency check (`recover_root_bin` /
    /// `recover_root_upper_in`) to decide whether a logged root replaces the
    /// in-memory one. Updated whenever a new root is installed via
    /// `set_root_with_lsn` or the IN-redo recover-root path.
    ///
    /// JE `RootUpdater.originalLsn` / `ChildReference.getLsn()` for the root.
    root_log_lsn: RwLock<noxu_util::Lsn>,

    /// Statistics: number of times the root has been split.
    root_splits: AtomicU64,

    /// Statistics: number of latch upgrades from shared to exclusive.
    relatches_required: AtomicU64,

    /// Optional custom key comparator for sorted-duplicate databases.
    ///
    /// When `Some`, all key comparisons in tree traversal (upper IN routing
    /// and BIN entry search/insert/delete) use this comparator instead of
    /// lexicographic byte comparison.
    ///
    /// JE reference: `DatabaseImpl.btreeComparator` / `dupComparator`, stored
    /// on the database and consulted at every `IN.findEntry()` call.
    pub key_comparator: Option<KeyComparatorFn>,

    /// Shared memory counter for the evictor / MemoryBudget.
    ///
    /// Updated on every BIN entry insert (+key+data+overhead) and delete
    /// (-key+overhead) so the evictor sees real cache pressure.
    ///
    /// JE reference: the `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
    /// call in `IN.updateMemorySize()`. In Noxu the counter is an
    /// `Arc<AtomicI64>` shared with the `Arbiter` (and later `MemoryBudget`)
    /// to avoid a circular crate dependency (`noxu-tree` → `noxu-dbi`).
    pub memory_counter: Option<Arc<AtomicI64>>,

    /// Optional listener fed on node add/access/remove, mirroring JE's
    /// `INList` feeding the evictor's `LRUList`s.
    ///
    /// When `None` (the default — used by unit tests with no environment),
    /// the notifications are no-ops. `EnvironmentImpl` installs the
    /// `Evictor` here so production inserts/accesses populate the LRU lists
    /// the evictor drains.
    ///
    /// JE reference: `IN.fetchTarget`/split/`rebuildINList` → `addBack`,
    /// access → `moveBack`, removal → `remove`.
    pub in_list_listener: Option<Arc<dyn InListListener>>,

    /// Optional log manager so an evicted root IN can be re-materialized from
    /// its persisted `root_log_lsn` on the next access (EV-14, piece B).
    ///
    /// JE's `Tree` reaches the log via `database.getEnv().getLogManager()`;
    /// `Tree.getRootINRootAlreadyLatched` calls `root.fetchTarget(...)` which
    /// reads the root IN back from its `ChildReference` LSN when the in-memory
    /// target is null (Tree.java:477-516, ChildReference.fetchTarget). Noxu
    /// has no env back-reference here, so the log manager is installed
    /// directly (the same one-way wiring as `in_list_listener`). When `None`
    /// (unit tests with no environment), an evicted root cannot be re-fetched
    /// — but `evict_root` refuses to evict without a log manager, so the root
    /// is never made non-resident in that configuration.
    pub log_manager: Option<Arc<noxu_log::LogManager>>,

    /// Capacity hint for the recovery redo path.
    ///
    /// When non-zero, the first BIN created by `redo_insert` (the first-key
    /// path) pre-allocates its `entries` Vec with this capacity so that
    /// redo insertions proceed without Vec-resize doublings. The value is
    /// clamped to `max_entries_per_node` at use.
    ///
    /// Set by `hint_redo_capacity` before the redo loop.
    /// Wave 11-K optimisation (Fix 3).
    redo_capacity_hint: usize,

    /// Whether key-prefix compression is enabled for this tree's BINs.
    ///
    /// JE `DatabaseImpl.getKeyPrefixing()` / `DatabaseConfig.setKeyPrefixing()`.
    /// When `false`, `IN.computeKeyPrefix` returns `null` in JE — no prefix
    /// is ever set. Noxu mirrors this: `insert_with_prefix` is skipped in
    /// favour of `insert_raw`, and `recompute_key_prefix` is not called on
    /// BIN halves after a split.
    ///
    /// Default: `false` (matches JE's `DatabaseConfig.KEY_PREFIXING_DEFAULT`).
    ///
    /// Ref: `IN.java computeKeyPrefix` ~line 2456.
    pub key_prefixing: bool,

    /// T-5: maximum post-prefix key length (bytes) for the compact key rep
    /// (`INKeyRep.MaxKeySize`). A node packs all its keys into one fixed-width
    /// byte array when every post-prefix key is `<=` this length; a longer key
    /// inflates the node to the `Default` rep. `<= 0` disables the compact
    /// rep entirely.
    ///
    /// Default 16 (`TREE_COMPACT_MAX_KEY_LENGTH` /
    /// `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH`). Wired from
    /// `EnvironmentConfig` via `Tree::set_compact_max_key_length`
    /// (`IN.getCompactMaxKeyLength`, IN.java:4929).
    pub compact_max_key_length: i32,
}
255
/// A node in the tree.
///
/// `TreeNode` wraps either an upper IN or a BIN. Each variant carries a
/// lightweight stub whose fields mirror the persistent IN/BIN structure. The
/// stubs will be replaced with full InNode/Bin types as the implementation
/// matures; the API surface here is intentionally minimal.
#[derive(Debug)]
pub enum TreeNode {
    /// Internal Node (IN) — a non-leaf node; per the module docs these sit at
    /// levels 2 and above.
    Internal(InNodeStub),

    /// Bottom Internal Node (BIN) — the leaf-level internal node; per the
    /// module docs these sit at level 1.
    Bottom(BinStub),
}
270
/// Type alias for a resident child pointer: a shared (`Arc`) handle to a
/// `TreeNode` guarded by its own `RwLock`.
pub type ChildArc = Arc<RwLock<TreeNode>>;
273
/// T-4: per-node representation of the resident-child-pointer array.
///
/// Faithful to JE `INTargetRep` (`INTargetRep.java`), the abstract array of
/// target pointers to an IN's cached children. These arrays are usually
/// sparse — most upper INs have NO resident children — so JE never stores a
/// full per-slot `Node[]` until many children are actually cached:
///
/// * `None`   — `INTargetRep.None`: a shared singleton, 0 child-pointer
///   bytes, used when no children are cached (the common case for upper
///   INs). `get` returns null for every slot.
/// * `Sparse` — `INTargetRep.Sparse`: a small parallel `(index, target)[]`
///   for 1..=`SPARSE_MAX_ENTRIES` cached children (JE caps at 4). `get(j)`
///   is a linear scan of the index array.
/// * `Default`— `INTargetRep.Default`: the full `Vec<Option<Arc>>`, one
///   slot per entry, used once more than `SPARSE_MAX_ENTRIES` children are
///   resident.
///
/// A node starts `None` and grows `None → Sparse → Default`. JE does not
/// shrink back when entries are nulled (it only compacts on IN-stripping) to
/// avoid transitionary rep churn; we follow the same policy — `set` never
/// moves to a smaller representation, and `compact()` (called on
/// eviction/stripping) collapses an empty/small `Default`/`Sparse` back
/// toward `None`.
#[derive(Debug)]
pub enum TargetRep {
    /// `INTargetRep.None` — no children cached (shared-singleton semantics).
    None,
    /// `INTargetRep.Sparse` — a few cached children, `(slot_index, child)`.
    /// Invariant: `len() <= SPARSE_MAX_ENTRIES`. Slot indices are stored as
    /// `u16`, so this rep assumes slot indices fit in 16 bits.
    Sparse(Vec<(u16, ChildArc)>),
    /// `INTargetRep.Default` — full parallel array, one slot per entry.
    Default(Vec<Option<ChildArc>>),
}
306
307impl TargetRep {
308 /// `INTargetRep.Sparse.MAX_ENTRIES` (INTargetRep.java) — the maximum
309 /// number of cached children the `Sparse` rep holds before inflating to
310 /// `Default`.
311 pub const SPARSE_MAX_ENTRIES: usize = 4;
312
313 /// `INTargetRep.get(idx)` — the cached child for slot `idx`, or `None`.
314 #[inline]
315 pub fn get(&self, idx: usize) -> Option<&ChildArc> {
316 match self {
317 TargetRep::None => None,
318 TargetRep::Sparse(v) => {
319 v.iter().find(|(i, _)| *i as usize == idx).map(|(_, c)| c)
320 }
321 TargetRep::Default(v) => v.get(idx).and_then(|o| o.as_ref()),
322 }
323 }
324
325 /// `INTargetRep.set(idx, node, parent)` — set (or clear, when `node` is
326 /// `None`) the cached child for slot `idx`, mutating the representation
327 /// upward (`None → Sparse → Default`) as needed.
328 pub fn set(&mut self, idx: usize, node: Option<ChildArc>) {
329 match self {
330 TargetRep::None => {
331 // INTargetRep.None.set: clearing stays None; setting mutates
332 // to a Sparse rep and sets there.
333 if let Some(child) = node {
334 *self = TargetRep::Sparse(vec![(idx as u16, child)]);
335 }
336 }
337 TargetRep::Sparse(v) => {
338 // Update existing slot in place.
339 if let Some(pos) =
340 v.iter().position(|(i, _)| *i as usize == idx)
341 {
342 match node {
343 Some(child) => v[pos].1 = child,
344 None => {
345 v.swap_remove(pos);
346 }
347 }
348 return;
349 }
350 // New child: clearing a non-present slot is a no-op.
351 let Some(child) = node else { return };
352 if v.len() < Self::SPARSE_MAX_ENTRIES {
353 v.push((idx as u16, child));
354 return;
355 }
356 // Full — INTargetRep.Sparse.set mutates to Default.
357 let cap = v.iter().map(|(i, _)| *i as usize).max().unwrap_or(0);
358 let cap = cap.max(idx) + 1;
359 let mut def: Vec<Option<ChildArc>> = vec![None; cap];
360 for (i, c) in v.drain(..) {
361 def[i as usize] = Some(c);
362 }
363 def[idx] = Some(child);
364 *self = TargetRep::Default(def);
365 }
366 TargetRep::Default(v) => {
367 if idx >= v.len() {
368 if node.is_none() {
369 return;
370 }
371 v.resize_with(idx + 1, || None);
372 }
373 v[idx] = node;
374 }
375 }
376 }
377
378 /// `INTargetRep.None`-aware take: remove and return the cached child for
379 /// slot `idx`, leaving the slot empty (JE `IN.setTarget(idx, null)` plus
380 /// returning the old target).
381 pub fn take(&mut self, idx: usize) -> Option<ChildArc> {
382 match self {
383 TargetRep::None => None,
384 TargetRep::Sparse(v) => v
385 .iter()
386 .position(|(i, _)| *i as usize == idx)
387 .map(|pos| v.swap_remove(pos).1),
388 TargetRep::Default(v) => v.get_mut(idx).and_then(|o| o.take()),
389 }
390 }
391
392 /// JE `INArrayRep.copy(from, to, n, parent)` adapted to slice ops: shift
393 /// the child mapping when an entry is INSERTED at `idx` (all children at
394 /// slots `>= idx` move up by one). Mirrors how `Vec::insert` shifts the
395 /// parallel `entries` array.
396 pub fn insert_shift(&mut self, idx: usize) {
397 match self {
398 TargetRep::None => {}
399 TargetRep::Sparse(v) => {
400 for (i, _) in v.iter_mut() {
401 if (*i as usize) >= idx {
402 *i += 1;
403 }
404 }
405 }
406 TargetRep::Default(v) => {
407 if idx <= v.len() {
408 v.insert(idx, None);
409 }
410 }
411 }
412 }
413
414 /// JE `INArrayRep.copy` adapted: shift the child mapping when the entry at
415 /// `idx` is REMOVED (all children at slots `> idx` move down by one; the
416 /// child at `idx` itself is dropped). Mirrors `Vec::remove`.
417 pub fn remove_shift(&mut self, idx: usize) {
418 match self {
419 TargetRep::None => {}
420 TargetRep::Sparse(v) => {
421 v.retain(|(i, _)| *i as usize != idx);
422 for (i, _) in v.iter_mut() {
423 if (*i as usize) > idx {
424 *i -= 1;
425 }
426 }
427 }
428 TargetRep::Default(v) => {
429 if idx < v.len() {
430 v.remove(idx);
431 }
432 }
433 }
434 }
435
436 /// `INTargetRep.compact(parent)` — collapse toward the most compact rep:
437 /// an empty rep becomes `None`; a `Default` with `<= MAX_ENTRIES` children
438 /// becomes `Sparse` (or `None`). Called when an IN is stripped/evicted.
439 pub fn compact(&mut self) {
440 let count = self.resident_count();
441 if count == 0 {
442 *self = TargetRep::None;
443 return;
444 }
445 if count <= Self::SPARSE_MAX_ENTRIES
446 && let TargetRep::Default(v) = self
447 {
448 let sparse: Vec<(u16, ChildArc)> = v
449 .iter()
450 .enumerate()
451 .filter_map(|(i, o)| o.as_ref().map(|c| (i as u16, c.clone())))
452 .collect();
453 *self = TargetRep::Sparse(sparse);
454 }
455 }
456
457 /// Number of resident (non-null) children.
458 pub fn resident_count(&self) -> usize {
459 match self {
460 TargetRep::None => 0,
461 TargetRep::Sparse(v) => v.len(),
462 TargetRep::Default(v) => v.iter().filter(|o| o.is_some()).count(),
463 }
464 }
465
466 /// True if no children are cached (`INTargetRep.None` or empty).
467 pub fn is_empty(&self) -> bool {
468 self.resident_count() == 0
469 }
470
471 /// Iterate every resident child (in unspecified order).
472 pub fn iter_children(&self) -> Box<dyn Iterator<Item = ChildArc> + '_> {
473 match self {
474 TargetRep::None => Box::new(std::iter::empty()),
475 TargetRep::Sparse(v) => Box::new(v.iter().map(|(_, c)| c.clone())),
476 TargetRep::Default(v) => {
477 Box::new(v.iter().filter_map(|o| o.clone()))
478 }
479 }
480 }
481
482 /// `INTargetRep.calculateMemorySize()` — heap bytes of the rep itself
483 /// (excluding the children it points at). `None` is 0 (shared singleton),
484 /// matching `INTargetRep.None.calculateMemorySize() == 0`.
485 pub fn memory_size(&self) -> usize {
486 use std::mem::size_of;
487 match self {
488 TargetRep::None => 0,
489 TargetRep::Sparse(v) => v.capacity() * size_of::<(u16, ChildArc)>(),
490 TargetRep::Default(v) => {
491 v.capacity() * size_of::<Option<ChildArc>>()
492 }
493 }
494 }
495}
496
/// T-3: node-level packed LSN array — `IN.entryLsnByteArray` /
/// `IN.entryLsnLongArray` (IN.java:251-289, getLsn/setLsnInternal
/// IN.java:1752-1935).
///
/// JE stores one LSN per slot. A naive `Lsn` (u64) costs 8 bytes/slot even
/// though most LSNs in a node share a file number and have a file offset that
/// fits in 3 bytes. JE's compact rep is a single `byte[]` with
/// `BYTES_PER_LSN_ENTRY == 4` bytes per slot:
///
/// * `base_file_number` starts as the file number of the first non-NULL LSN
///   stored and is only ever lowered by `set` when a smaller file number
///   arrives (it is not recomputed when slots are nulled or removed);
/// * byte 0 of each slot = `file_number - base_file_number` (0..=127,
///   `Byte.MAX_VALUE`);
/// * bytes 1..4 = the 3-byte little-endian file offset (max
///   `MAX_FILE_OFFSET == 0xff_fffe`).
///
/// The NULL_LSN blocker (Noxu `NULL_LSN == u64::MAX`) is solved EXACTLY as JE
/// does it: NULL is NOT stored as the raw u64; the slot's 3 file-offset bytes
/// are set to `0xff_ffff` (`THREE_BYTE_NEGATIVE_ONE`), a value `MAX_FILE_OFFSET`
/// can never reach, and `get` maps it back to `NULL_LSN`.
///
/// If a file-number difference exceeds 127 or a file offset exceeds
/// `MAX_FILE_OFFSET`, the rep mutates to `Long` (one `Lsn` per slot), matching
/// JE's `mutateToLongArray` (IN.java:1924). An all-NULL node uses `Empty`
/// (0 bytes), matching the EMPTY_REP/initial-capacity-free state.
#[derive(Debug)]
pub enum LsnRep {
    /// All slots NULL — 0 heap bytes (the `byteArray == null` initial state).
    Empty,
    /// `IN.entryLsnByteArray` — 4 bytes/slot, `base_file_number`-relative.
    /// `bytes.len()` is always a multiple of `BYTES_PER_LSN_ENTRY`.
    Compact { base_file_number: u32, bytes: Vec<u8> },
    /// `IN.entryLsnLongArray` — 8 bytes/slot fallback after `mutateToLongArray`.
    Long(Vec<Lsn>),
}
531
532impl LsnRep {
533 /// `IN.BYTES_PER_LSN_ENTRY` (IN.java:151).
534 pub const BYTES_PER_LSN_ENTRY: usize = 4;
535 /// `IN.MAX_FILE_OFFSET` (IN.java:152) — max file offset the 3-byte form holds.
536 const MAX_FILE_OFFSET: u32 = 0x00ff_fffe;
537 /// `IN.THREE_BYTE_NEGATIVE_ONE` (IN.java:153) — the NULL sentinel in the
538 /// 3 file-offset bytes.
539 const THREE_BYTE_NEGATIVE_ONE: u32 = 0x00ff_ffff;
540 /// `Byte.MAX_VALUE` — max file-number difference the 1-byte offset holds.
541 const MAX_FILE_NUMBER_OFFSET: u32 = 127;
542
543 /// A rep sized for `n` slots, all NULL. Returns `Empty` (0 bytes); the
544 /// Compact byte array is lazily allocated by the first non-NULL `set_lsn`
545 /// — `base_file_number` is unknown until then (IN.java:1820, the
546 /// `baseFileNumber == -1` first-entry case).
547 #[inline]
548 pub fn new(_n: usize) -> Self {
549 LsnRep::Empty
550 }
551
552 /// Build a rep from a per-slot `Lsn` slice (used by node construction and
553 /// split, where slots arrive together). Equivalent to `new(lsns.len())`
554 /// followed by `set(i, lsns[i])` for each slot.
555 pub fn from_lsns(lsns: &[Lsn]) -> Self {
556 let mut rep = LsnRep::Empty;
557 let n = lsns.len();
558 for (i, &lsn) in lsns.iter().enumerate() {
559 rep.set(i, lsn, n);
560 }
561 rep
562 }
563
564 /// `IN.getLsn(idx)` (IN.java:1752).
565 pub fn get(&self, idx: usize) -> Lsn {
566 match self {
567 LsnRep::Empty => NULL_LSN,
568 LsnRep::Long(v) => v.get(idx).copied().unwrap_or(NULL_LSN),
569 LsnRep::Compact { base_file_number, bytes } => {
570 let off = idx * Self::BYTES_PER_LSN_ENTRY;
571 if off + Self::BYTES_PER_LSN_ENTRY > bytes.len() {
572 return NULL_LSN;
573 }
574 let file_offset = Self::get_3byte(bytes, off + 1);
575 if file_offset == Self::THREE_BYTE_NEGATIVE_ONE {
576 NULL_LSN
577 } else {
578 let file_number = base_file_number + bytes[off] as u32;
579 Lsn::new(file_number, file_offset)
580 }
581 }
582 }
583 }
584
585 /// `IN.setLsnInternal(idx, value)` (IN.java:1801) — set the LSN of slot
586 /// `idx`, mutating Empty→Compact→Long as necessary. `n` is the node's
587 /// slot count (sizes a freshly-allocated Compact array).
588 pub fn set(&mut self, idx: usize, lsn: Lsn, n: usize) {
589 // Empty: first non-NULL value allocates the Compact array; a NULL set
590 // on an Empty rep is a no-op (all slots already read NULL).
591 if let LsnRep::Empty = self {
592 if lsn.is_null() {
593 return;
594 }
595 let cap = n.max(idx + 1);
596 *self = LsnRep::Compact {
597 base_file_number: lsn.file_number(),
598 bytes: vec![0u8; cap * Self::BYTES_PER_LSN_ENTRY],
599 };
600 // Mark every other slot NULL (3-byte offset = 0xffffff).
601 if let LsnRep::Compact { bytes, .. } = self {
602 for s in 0..cap {
603 if s != idx {
604 Self::put_3byte(
605 bytes,
606 s * Self::BYTES_PER_LSN_ENTRY + 1,
607 Self::THREE_BYTE_NEGATIVE_ONE,
608 );
609 }
610 }
611 }
612 self.set(idx, lsn, n);
613 return;
614 }
615
616 if let LsnRep::Long(v) = self {
617 if idx >= v.len() {
618 v.resize(idx + 1, NULL_LSN);
619 }
620 v[idx] = lsn;
621 return;
622 }
623
624 // Compact path.
625 let LsnRep::Compact { base_file_number, bytes } = self else {
626 unreachable!()
627 };
628 let need = (idx + 1) * Self::BYTES_PER_LSN_ENTRY;
629 if need > bytes.len() {
630 let old = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
631 bytes.resize(need, 0);
632 for s in old..(idx + 1) {
633 Self::put_3byte(
634 bytes,
635 s * Self::BYTES_PER_LSN_ENTRY + 1,
636 Self::THREE_BYTE_NEGATIVE_ONE,
637 );
638 }
639 }
640 let off = idx * Self::BYTES_PER_LSN_ENTRY;
641
642 if lsn.is_null() {
643 // IN.java:1812 — file-number offset 0, file offset -1 (0xffffff).
644 bytes[off] = 0;
645 Self::put_3byte(bytes, off + 1, Self::THREE_BYTE_NEGATIVE_ONE);
646 return;
647 }
648
649 let this_file_number = lsn.file_number();
650 let this_file_offset = lsn.file_offset();
651
652 // Whether to fall back to the Long rep.
653 let mutate = this_file_offset > Self::MAX_FILE_OFFSET || {
654 if this_file_number < *base_file_number {
655 // IN.java:1827 — try to re-base downward; bail if any existing
656 // slot would then exceed the 1-byte file-number offset.
657 !Self::adjust_file_numbers(
658 bytes,
659 *base_file_number,
660 this_file_number,
661 )
662 } else {
663 this_file_number - *base_file_number
664 > Self::MAX_FILE_NUMBER_OFFSET
665 }
666 };
667
668 if mutate {
669 // IN.java:1924 mutateToLongArray.
670 let nelts = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
671 let mut longs = vec![NULL_LSN; nelts.max(idx + 1)];
672 for (s, slot) in longs.iter_mut().enumerate().take(nelts) {
673 *slot = self_get_compact(*base_file_number, bytes, s);
674 }
675 longs[idx] = lsn;
676 *self = LsnRep::Long(longs);
677 return;
678 }
679
680 if this_file_number < *base_file_number {
681 *base_file_number = this_file_number;
682 }
683 bytes[off] = (this_file_number - *base_file_number) as u8;
684 Self::put_3byte(bytes, off + 1, this_file_offset);
685 }
686
687 /// `IN.adjustFileNumbers` (IN.java:1855) — re-base to a lower file number,
688 /// rewriting every existing slot's 1-byte offset. Returns false (and
689 /// leaves `bytes` unchanged) if any slot would overflow the 1-byte offset.
690 fn adjust_file_numbers(
691 bytes: &mut [u8],
692 old_base: u32,
693 new_base: u32,
694 ) -> bool {
695 let stride = Self::BYTES_PER_LSN_ENTRY;
696 // First pass: verify none overflow.
697 let mut i = 0;
698 while i < bytes.len() {
699 if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
700 let cur_fn = old_base + bytes[i] as u32;
701 if cur_fn - new_base > Self::MAX_FILE_NUMBER_OFFSET {
702 return false;
703 }
704 }
705 i += stride;
706 }
707 // Second pass: apply.
708 let mut i = 0;
709 while i < bytes.len() {
710 if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
711 let cur_fn = old_base + bytes[i] as u32;
712 bytes[i] = (cur_fn - new_base) as u8;
713 }
714 i += stride;
715 }
716 true
717 }
718
719 /// `INArrayRep.copy` analogue: shift LSNs when an entry is inserted at
720 /// `idx` (slots `>= idx` move up one). Mirrors `targets.insert_shift`.
721 pub fn insert_shift(&mut self, idx: usize, n: usize) {
722 match self {
723 LsnRep::Empty => {}
724 LsnRep::Long(v) => {
725 if idx <= v.len() {
726 v.insert(idx, NULL_LSN);
727 }
728 }
729 LsnRep::Compact { bytes, .. } => {
730 let stride = Self::BYTES_PER_LSN_ENTRY;
731 let cap = (n.max((bytes.len() / stride) + 1)) * stride;
732 bytes.resize(cap, 0);
733 let at = idx * stride;
734 // Shift the tail up by one slot.
735 bytes.copy_within(at..cap - stride, at + stride);
736 // The new slot reads NULL.
737 Self::put_3byte(bytes, at + 1, Self::THREE_BYTE_NEGATIVE_ONE);
738 }
739 }
740 }
741
742 /// `INArrayRep.copy` analogue: shift LSNs when entry `idx` is removed
743 /// (slots `> idx` move down one). Mirrors `targets.remove_shift`.
744 pub fn remove_shift(&mut self, idx: usize) {
745 match self {
746 LsnRep::Empty => {}
747 LsnRep::Long(v) => {
748 if idx < v.len() {
749 v.remove(idx);
750 }
751 }
752 LsnRep::Compact { bytes, .. } => {
753 let stride = Self::BYTES_PER_LSN_ENTRY;
754 let at = idx * stride;
755 if at + stride <= bytes.len() {
756 bytes.copy_within(at + stride.., at);
757 let newlen = bytes.len() - stride;
758 bytes.truncate(newlen);
759 }
760 }
761 }
762 }
763
764 /// `IN.computeLsnOverhead` analogue: heap bytes of the rep itself.
765 pub fn memory_size(&self) -> usize {
766 use std::mem::size_of;
767 match self {
768 LsnRep::Empty => 0,
769 LsnRep::Compact { bytes, .. } => bytes.capacity(),
770 LsnRep::Long(v) => v.capacity() * size_of::<Lsn>(),
771 }
772 }
773
774 fn put_3byte(bytes: &mut [u8], offset: usize, value: u32) {
775 bytes[offset] = (value & 0xFF) as u8;
776 bytes[offset + 1] = ((value >> 8) & 0xFF) as u8;
777 bytes[offset + 2] = ((value >> 16) & 0xFF) as u8;
778 }
779
780 fn get_3byte(bytes: &[u8], offset: usize) -> u32 {
781 (bytes[offset] as u32)
782 | ((bytes[offset + 1] as u32) << 8)
783 | ((bytes[offset + 2] as u32) << 16)
784 }
785}
786
787/// Helper used by `LsnRep::set` during `mutateToLongArray` to read an existing
788/// Compact slot without borrowing `self` (which is mid-mutation).
789fn self_get_compact(base_file_number: u32, bytes: &[u8], idx: usize) -> Lsn {
790 let off = idx * LsnRep::BYTES_PER_LSN_ENTRY;
791 let file_offset = LsnRep::get_3byte(bytes, off + 1);
792 if file_offset == LsnRep::THREE_BYTE_NEGATIVE_ONE {
793 NULL_LSN
794 } else {
795 Lsn::new(base_file_number + bytes[off] as u32, file_offset)
796 }
797}
798
/// Default maximum post-prefix key length (16 bytes) for the compact key rep:
/// `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH` (INKeyRep.java) and the
/// `TREE_COMPACT_MAX_KEY_LENGTH` config default.
// The lint is silenced because the name is not SCREAMING_SNAKE_CASE; it
// appears to deliberately echo the JE class name `INKeyRep`.
#[allow(non_upper_case_globals)]
pub const INKeyRep_DEFAULT_MAX_KEY_LENGTH: i32 = 16;
803
/// T-2: node-level key array — `INKeyRep.{Default,MaxKeySize}` (INKeyRep.java).
///
/// The per-slot key that used to live in `BinEntry`/`InEntry` as a `Vec<u8>`
/// (24-byte header + a separate heap allocation per key) is hoisted here as a
/// node-level rep. When every (post-prefix) key in the node is `<=`
/// `TREE_COMPACT_MAX_KEY_LENGTH` (default 16) the keys pack into ONE
/// fixed-width byte buffer (`MaxKeySize`): `slot_width` bytes per slot, with a
/// parallel `lengths` vector tracking the actual length of each key. A key
/// longer than the threshold inflates the whole node to the `Default` rep
/// (one `Vec<u8>` per slot), matching JE's `Default.compact` /
/// `MaxKeySize.expandToDefaultRep`.
///
/// As in JE, this stores the UNPREFIXED suffix (key prefixing strips the
/// common prefix first), so the compact rep is the smaller post-prefix bytes.
#[derive(Debug, Clone)]
pub enum KeyRep {
    /// `INKeyRep.Default` — one owned key per slot (any length).
    Default(Vec<Vec<u8>>),
    /// `INKeyRep.MaxKeySize` — all keys packed into one fixed-width buffer.
    /// `buf.len() == slot_width * lengths.len()`; slot `i` occupies
    /// `buf[i*slot_width .. i*slot_width + lengths[i]]`. Bytes of a slot past
    /// `lengths[i]` are padding and carry no meaning.
    Compact { buf: Vec<u8>, slot_width: usize, lengths: Vec<u16> },
}
827
828impl KeyRep {
829 /// An empty `Default` rep.
830 #[inline]
831 pub fn new() -> Self {
832 KeyRep::Default(Vec::new())
833 }
834
835 /// Build a `Default` rep from owned keys (callers may later `compact`).
836 #[inline]
837 pub fn from_keys(keys: Vec<Vec<u8>>) -> Self {
838 KeyRep::Default(keys)
839 }
840
841 /// Number of slots.
842 #[inline]
843 pub fn len(&self) -> usize {
844 match self {
845 KeyRep::Default(v) => v.len(),
846 KeyRep::Compact { lengths, .. } => lengths.len(),
847 }
848 }
849
850 #[inline]
851 pub fn is_empty(&self) -> bool {
852 self.len() == 0
853 }
854
855 /// `INKeyRep.get(idx)` / `getKey` — borrow the (post-prefix) key at slot
856 /// `idx` without allocating.
857 #[inline]
858 pub fn get(&self, idx: usize) -> &[u8] {
859 match self {
860 KeyRep::Default(v) => v[idx].as_slice(),
861 KeyRep::Compact { buf, slot_width, lengths } => {
862 let off = idx * slot_width;
863 &buf[off..off + lengths[idx] as usize]
864 }
865 }
866 }
867
868 /// Set the key at slot `idx`. A key longer than a Compact rep's
869 /// `slot_width` inflates the rep to `Default` first
870 /// (`MaxKeySize.expandToDefaultRep`).
871 pub fn set(&mut self, idx: usize, key: Vec<u8>) {
872 match self {
873 KeyRep::Default(v) => v[idx] = key,
874 KeyRep::Compact { slot_width, .. } if key.len() > *slot_width => {
875 self.inflate_to_default();
876 self.set(idx, key);
877 }
878 KeyRep::Compact { buf, slot_width, lengths } => {
879 let off = idx * *slot_width;
880 buf[off..off + key.len()].copy_from_slice(&key);
881 lengths[idx] = key.len() as u16;
882 }
883 }
884 }
885
886 /// Insert a key at slot `idx`, shifting later slots up (mirrors
887 /// `Vec::insert` + `INArrayRep.copy`).
888 pub fn insert(&mut self, idx: usize, key: Vec<u8>) {
889 match self {
890 KeyRep::Default(v) => v.insert(idx, key),
891 KeyRep::Compact { slot_width, .. } if key.len() > *slot_width => {
892 self.inflate_to_default();
893 self.insert(idx, key);
894 }
895 KeyRep::Compact { buf, slot_width, lengths } => {
896 let sw = *slot_width;
897 let at = idx * sw;
898 buf.splice(at..at, std::iter::repeat_n(0u8, sw));
899 buf[at..at + key.len()].copy_from_slice(&key);
900 lengths.insert(idx, key.len() as u16);
901 }
902 }
903 }
904
905 /// Remove the key at slot `idx`, shifting later slots down.
906 pub fn remove(&mut self, idx: usize) -> Vec<u8> {
907 match self {
908 KeyRep::Default(v) => v.remove(idx),
909 KeyRep::Compact { buf, slot_width, lengths } => {
910 let sw = *slot_width;
911 let len = lengths[idx] as usize;
912 let at = idx * sw;
913 let out = buf[at..at + len].to_vec();
914 buf.drain(at..at + sw);
915 lengths.remove(idx);
916 out
917 }
918 }
919 }
920
921 /// `INKeyRep.MaxKeySize.expandToDefaultRep` — mutate a Compact rep to a
922 /// Default rep (one owned `Vec<u8>` per slot).
923 fn inflate_to_default(&mut self) {
924 if let KeyRep::Compact { .. } = self {
925 let keys: Vec<Vec<u8>> =
926 (0..self.len()).map(|i| self.get(i).to_vec()).collect();
927 *self = KeyRep::Default(keys);
928 }
929 }
930
931 /// `INKeyRep.Default.compact(parent)` (INKeyRep.java) — if every key in a
932 /// `Default` rep fits `compact_max_key_length`, pack them into a
933 /// `MaxKeySize` (`Compact`) rep. `compact_max_key_length <= 0` disables
934 /// compaction. No-op when already Compact.
935 pub fn compact(&mut self, compact_max_key_length: i32) {
936 if compact_max_key_length <= 0 {
937 return;
938 }
939 let KeyRep::Default(keys) = self else {
940 return; // already Compact
941 };
942 if keys.is_empty() {
943 return;
944 }
945 let max_len = keys.iter().map(|k| k.len()).max().unwrap_or(0);
946 if max_len > compact_max_key_length as usize {
947 return; // a key exceeds the threshold — stay Default
948 }
949 let slot_width = max_len.max(1);
950 let mut buf = vec![0u8; slot_width * keys.len()];
951 let mut lengths = Vec::with_capacity(keys.len());
952 for (i, k) in keys.iter().enumerate() {
953 let off = i * slot_width;
954 buf[off..off + k.len()].copy_from_slice(k);
955 lengths.push(k.len() as u16);
956 }
957 *self = KeyRep::Compact { buf, slot_width, lengths };
958 }
959
960 /// True when key-byte memory is accounted for inside this rep (Compact),
961 /// vs per-slot `Vec` allocations (Default).
962 /// `INKeyRep.accountsForKeyByteMemUsage`.
963 #[inline]
964 pub fn is_compact(&self) -> bool {
965 matches!(self, KeyRep::Compact { .. })
966 }
967
968 /// Heap bytes of the rep itself (`INKeyRep.calculateMemorySize` +
969 /// key-byte accounting). For Default this is the `Vec<Vec<u8>>` header
970 /// plus each key's heap allocation; for Compact it is the single buffer
971 /// plus the lengths vector.
972 pub fn memory_size(&self) -> usize {
973 use std::mem::size_of;
974 match self {
975 KeyRep::Default(v) => {
976 v.capacity() * size_of::<Vec<u8>>()
977 + v.iter().map(|k| k.capacity()).sum::<usize>()
978 }
979 KeyRep::Compact { buf, lengths, .. } => {
980 buf.capacity() + lengths.capacity() * size_of::<u16>()
981 }
982 }
983 }
984}
985
986impl Default for KeyRep {
987 fn default() -> Self {
988 KeyRep::new()
989 }
990}
991
/// Lightweight upper-IN representation used by the tree traversal layer.
///
/// Counterpart of JE's `IN`: carries the dirty flag (IN_DIRTY_BIT), the LRU
/// generation counter, and a weak back-pointer to the parent so that dirty
/// state can be propagated upward. Per-slot state is split across parallel
/// structures indexed by slot: `entries` (keys), `lsn_rep` (LSNs) and
/// `targets` (resident children).
#[derive(Debug)]
pub struct InNodeStub {
    /// Node ID.
    pub node_id: u64,
    /// Level in tree.
    pub level: i32,
    /// Child entries, one per slot. Each entry holds only the slot key; the
    /// slot's LSN lives in `lsn_rep` and its cached child in `targets`.
    pub entries: Vec<InEntry>,
    /// T-4: per-node resident-child-pointer representation.
    ///
    /// `IN.entryTargets` (`INTargetRep`). The cached child pointer is no
    /// longer a per-`InEntry` `Option<Arc>` (which cost a pointer-sized slot
    /// even when no child was resident); it lives here as a compact
    /// node-level rep that starts `None` (0 child-pointer bytes — most upper
    /// INs have no resident children), grows to `Sparse` for a few cached
    /// children, and inflates to `Default` (the full parallel array) once
    /// many children are resident. See `INTargetRep.{None,Sparse,Default}`.
    pub targets: TargetRep,
    /// Dirty flag — set whenever this node is modified.
    /// `IN.dirty` (IN_DIRTY_BIT).
    pub dirty: bool,
    /// LRU generation counter for the evictor.
    /// `IN.generation`.
    pub generation: u64,
    /// Weak back-pointer to parent IN.
    /// Enables dirty-propagation and latch-coupling validation.
    /// `IN.parent` reference used during splits and logging.
    pub parent: Option<Weak<RwLock<TreeNode>>>,
    /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`). The per-slot
    /// `lsn` (8 bytes) that used to live in `InEntry` is hoisted here as a
    /// `base_file_number`-relative 4-byte-per-slot rep, falling back to a
    /// `u64`-per-slot `Long` rep only when a node's LSN range exceeds the
    /// compact form. Access via `get_lsn(slot)` / `set_lsn(slot, lsn)`.
    pub lsn_rep: LsnRep,
}
1032
/// Entry in an IN node: the per-slot state that is still stored inline,
/// which is just the slot key.
///
/// T-4: the resident-child pointer that used to live here (`Option<Arc>`) was
/// hoisted to the node-level `InNodeStub.targets` (`INTargetRep`); access the
/// child for slot `i` via `InNodeStub::get_child(i)` / `set_child` / etc.
///
/// T-3: the per-slot `lsn` (8 bytes) that used to live here was hoisted to the
/// node-level `InNodeStub.lsn_rep` (`IN.entryLsnByteArray`); access the LSN for
/// slot `i` via `InNodeStub::get_lsn(i)` / `set_lsn(i, lsn)`.
#[derive(Debug, Clone)]
pub struct InEntry {
    /// Key for this entry (owned bytes).
    pub key: Vec<u8>,
}
1047
/// Lightweight BIN representation used by the tree traversal layer.
///
/// Counterpart of JE's `BIN` (which extends `IN`): carries the dirty flag,
/// LRU generation counter, and a weak back-pointer to the parent IN.
/// Per-slot state is split across parallel structures indexed by slot:
/// `entries` (data and flags), `keys` (key suffixes) and `lsn_rep` (LSNs).
///
/// # Key Prefix Compression
///
/// BINs support key prefix compression. When `key_prefix` is non-empty the
/// node-level `keys` rep stores only the *suffix* of every key — the bytes
/// after stripping the common leading bytes. The full key is reconstructed
/// by prepending `key_prefix` to the stored suffix.
///
/// This is transparent to callers through the `get_full_key` /
/// `find_entry_compressed` helpers on `BinStub`. The insert paths
/// (`insert_with_prefix` / `insert_with_prefix_slice`) adjust the prefix when
/// an incoming key does not share it, and `recompute_key_prefix` rebuilds it
/// from scratch.
#[derive(Debug)]
pub struct BinStub {
    /// Node ID.
    pub node_id: u64,
    /// Level (always BIN_LEVEL).
    pub level: i32,
    /// Per-slot data and flags. The slot key is NOT stored here: it lives in
    /// `keys`, and the slot LSN lives in `lsn_rep`.
    pub entries: Vec<BinEntry>,
    /// Common prefix shared by every key in this BIN.
    /// Empty slice means no prefix compression is active.
    /// `IN.keyPrefix`.
    pub key_prefix: Vec<u8>,
    /// Dirty flag — set whenever this BIN is modified.
    /// `IN.dirty` (IN_DIRTY_BIT).
    pub dirty: bool,
    /// BIN-delta flag — true when this BIN contains only dirty (delta) slots
    /// rather than a complete set of entries.
    /// `IN.IN_DELTA_BIT` (the IN_DELTA_BIT flag inside `flags`).
    pub is_delta: bool,
    /// LSN at which this BIN was last logged as a full (non-delta) BIN.
    ///
    /// Used by the checkpoint path to construct `BINDeltaLogEntry.prev_full_lsn`
    /// and to compare against `prev_delta_lsn` when deciding whether to write
    /// a delta or a full BIN.
    ///
    /// `BIN.lastFullLsn`.
    pub last_full_lsn: Lsn,
    /// LSN at which this BIN was last logged as a BIN-delta.
    ///
    /// Written as `prev_delta_lsn` into the next `BINDeltaLogEntry` so the
    /// cleaner's utilization tracker can mark the superseded delta obsolete.
    /// Reset to `NULL_LSN` whenever a full BIN is written.
    ///
    /// `BIN.lastDeltaVersion` / `BIN.getLastDeltaLsn()`.
    pub last_delta_lsn: Lsn,
    /// LRU generation counter for the evictor.
    /// `IN.generation`.
    pub generation: u64,
    /// Weak back-pointer to parent IN.
    /// Enables dirty-propagation and latch-coupling validation.
    pub parent: Option<Weak<RwLock<TreeNode>>>,
    /// If true, `BinEntry.expiration_time` values in this BIN are packed hours
    /// since epoch; if false, they are packed seconds since epoch.
    ///
    /// Default: `true` (hours, matching TTL resolution).
    ///
    /// `BIN.expirationInHours`.
    pub expiration_in_hours: bool,
    /// Number of cursors currently positioned on this BIN.
    ///
    /// The evictor skips BINs with a non-zero cursor count to avoid evicting
    /// a node that a cursor is actively traversing. CursorImpl increments
    /// this when positioning on a BIN and decrements it on reposition/close.
    /// `strip_lns` also declines to strip while this is non-zero.
    ///
    /// `IN.cursorSet.size()` used by `Evictor.selectIN()`.
    pub cursor_count: i32,
    /// When true, the NEXT log of this BIN must be a full BIN, not a delta.
    ///
    /// Set after a dirty slot is removed (a delta would silently lose that
    /// removal) and cleared after a full BIN is written. This is the
    /// delta-chain bound: it forces a periodic full BIN so a delta never
    /// references stale state.
    ///
    /// `IN.prohibitNextDelta` / `IN.setProhibitNextDelta` (IN.java:5013) /
    /// `IN.getProhibitNextDelta`.
    pub prohibit_next_delta: bool,
    /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`). The per-slot
    /// `lsn` (8 bytes) that used to live in `BinEntry` is hoisted here as a
    /// `base_file_number`-relative 4-byte-per-slot rep. Access via
    /// `get_lsn(slot)` / `set_lsn(slot, lsn)`.
    pub lsn_rep: LsnRep,
    /// T-2: per-node key array (`INKeyRep.{Default,MaxKeySize}`). The per-slot
    /// `key` (`Vec<u8>`, 24-byte header + heap alloc) that used to live in
    /// `BinEntry` is hoisted here. Stores the post-prefix SUFFIX (key
    /// prefixing strips the common prefix first). Packs into one fixed-width
    /// buffer (`Compact`) when every suffix is `<= compact_max_key_length`,
    /// else one `Vec<u8>` per slot (`Default`). `keys.len()` is kept in lock
    /// step with `entries.len()`. Access via `get_key(slot)` /
    /// `get_full_key(slot)`.
    pub keys: KeyRep,
    /// T-5: the node's compact-key threshold (`IN.getCompactMaxKeyLength`),
    /// copied from the owning `Tree` at construction so `apply_new_prefix` can
    /// decide whether the suffixes now fit `MaxKeySize`. Default 16.
    pub compact_max_key_length: i32,
}
1150
/// Entry in a BIN node: the per-slot data and flags that are still stored
/// inline.
///
/// T-3: the per-slot `lsn` (8 bytes) that used to live here was hoisted to the
/// node-level `BinStub.lsn_rep` (`IN.entryLsnByteArray`); access the LSN for
/// slot `i` via `BinStub::get_lsn(i)` / `set_lsn(i, lsn)`.
///
/// T-2: the per-slot key likewise lives in the node-level `BinStub.keys`;
/// access it via `BinStub::get_key(i)` / `get_full_key(i)`.
#[derive(Debug, Clone)]
pub struct BinEntry {
    /// Optional embedded data (for small records) or cached LN.
    /// `None` when no value is held in memory for this slot.
    pub data: Option<Vec<u8>>,
    /// True when this slot has been marked known-deleted (analogous to the
    /// KNOWN_DELETED_BIT in `IN.entryStates`). The slot is eligible for
    /// removal by `compress_bin()`.
    pub known_deleted: bool,
    /// True when this slot has been modified since the last full BIN log write.
    ///
    /// `IN.entryStates[i] & IN_DIRTY_BIT`. Used by the checkpoint
    /// path to decide whether to write a BIN-delta (few dirty slots) or a
    /// full BIN (many dirty slots).
    pub dirty: bool,
    /// Packed expiration time (0 = no expiration).
    ///
    /// When the owning `BinStub.expiration_in_hours` is true, this value is
    /// hours since Unix epoch; otherwise it is seconds since Unix epoch.
    ///
    /// `IN.entryExpiration`.
    pub expiration_time: u32,
}
1178
1179impl InNodeStub {
1180 /// `IN.getTarget(idx)` — the resident child cached for slot `idx`, cloned
1181 /// (a strong `Arc`), or `None` if the child is not cached. Routes through
1182 /// the node-level `INTargetRep` (T-4).
1183 #[inline]
1184 pub fn get_child(&self, idx: usize) -> Option<ChildArc> {
1185 self.targets.get(idx).cloned()
1186 }
1187
1188 /// Borrow the resident child for slot `idx` without cloning.
1189 #[inline]
1190 pub fn child_ref(&self, idx: usize) -> Option<&ChildArc> {
1191 self.targets.get(idx)
1192 }
1193
1194 /// True if slot `idx` has no resident (cached) child.
1195 /// `IN.getTarget(idx) == null`.
1196 #[inline]
1197 pub fn child_is_none(&self, idx: usize) -> bool {
1198 self.targets.get(idx).is_none()
1199 }
1200
1201 /// `IN.setTarget(idx, node)` — set (or clear) the cached child for slot
1202 /// `idx`, mutating the `INTargetRep` upward as needed.
1203 #[inline]
1204 pub fn set_child(&mut self, idx: usize, node: Option<ChildArc>) {
1205 self.targets.set(idx, node);
1206 }
1207
1208 /// `IN.detachNode` helper — remove and return the cached child for slot
1209 /// `idx`, leaving the slot's key/LSN intact for re-fetch.
1210 #[inline]
1211 pub fn take_child(&mut self, idx: usize) -> Option<ChildArc> {
1212 self.targets.take(idx)
1213 }
1214
1215 /// `IN.getLsn(idx)` (IN.java:1752) — the LSN of slot `idx` via the
1216 /// node-level packed `LsnRep` (T-3).
1217 #[inline]
1218 pub fn get_lsn(&self, idx: usize) -> Lsn {
1219 self.lsn_rep.get(idx)
1220 }
1221
1222 /// `IN.setLsn(idx, lsn)` (IN.java:1773) — set the LSN of slot `idx` via
1223 /// the node-level packed `LsnRep` (T-3).
1224 #[inline]
1225 pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
1226 let n = self.entries.len();
1227 self.lsn_rep.set(idx, lsn, n);
1228 }
1229
1230 /// Insert an entry at `idx`, shifting the child mapping to stay aligned
1231 /// (`INArrayRep.copy`), then set the new slot's cached child. Mirrors the
1232 /// old `entries.insert(idx, InEntry{ child: ..})` in one call.
1233 pub fn insert_entry(
1234 &mut self,
1235 idx: usize,
1236 key: Vec<u8>,
1237 lsn: Lsn,
1238 child: Option<ChildArc>,
1239 ) {
1240 self.entries.insert(idx, InEntry { key });
1241 let n = self.entries.len();
1242 self.lsn_rep.insert_shift(idx, n);
1243 self.lsn_rep.set(idx, lsn, n);
1244 self.targets.insert_shift(idx);
1245 if child.is_some() {
1246 self.targets.set(idx, child);
1247 }
1248 }
1249
1250 /// Remove the entry at `idx`, shifting the child mapping to stay aligned
1251 /// (`INArrayRep.copy`). Returns the removed `InEntry` (key).
1252 pub fn remove_entry(&mut self, idx: usize) -> InEntry {
1253 let e = self.entries.remove(idx);
1254 self.lsn_rep.remove_shift(idx);
1255 self.targets.remove_shift(idx);
1256 e
1257 }
1258
1259 /// All resident children (cloned `Arc`s), in unspecified order.
1260 /// Replaces `entries.iter().filter_map(|e| e.child.clone())`.
1261 pub fn resident_children(&self) -> Vec<ChildArc> {
1262 self.targets.iter_children().collect()
1263 }
1264
1265 /// `(slot_index, child)` of the first resident child, if any.
1266 pub fn first_resident_child(&self) -> Option<(usize, ChildArc)> {
1267 (0..self.entries.len())
1268 .find_map(|i| self.targets.get(i).map(|c| (i, c.clone())))
1269 }
1270}
1271
1272impl BinStub {
    /// Returns the LSN recorded for slot `idx`, read from the node-level
    /// packed `LsnRep` (T-3) rather than from the `BinEntry` itself.
    ///
    /// `IN.getLsn(idx)` (IN.java:1752).
    #[inline]
    pub fn get_lsn(&self, idx: usize) -> Lsn {
        self.lsn_rep.get(idx)
    }
1279
    /// Stores `lsn` as the LSN of slot `idx` in the node-level packed
    /// `LsnRep` (T-3).
    ///
    /// `IN.setLsn(idx, lsn)` (IN.java:1773).
    #[inline]
    pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
        // The rep is also handed the current slot count (`entries.len()`).
        // NOTE(review): presumably so it can size or re-encode its backing
        // array — confirm against `LsnRep::set`.
        let n = self.entries.len();
        self.lsn_rep.set(idx, lsn, n);
    }
1287
1288 /// TREE-F1: the single user-facing liveness predicate for a BIN slot.
1289 ///
1290 /// A slot is LIVE for reads/scans iff it is neither `known_deleted` nor
1291 /// TTL-expired. This mirrors the two ways JE makes a slot read as ABSENT:
1292 /// * `IN.findEntry` (IN.java:3197) returns -1 for a `known_deleted`
1293 /// exact match;
1294 /// * `CursorImpl.isProbablyExpired` / `lockAndGetCurrent`
1295 /// (CursorImpl.java:2062-2064) skip `isEntryKnownDeleted` (and
1296 /// expired) slots while stepping.
1297 ///
1298 /// KD slots legitimately exist in live BINs during BIN-delta
1299 /// reconstitution until the compressor reclaims them; the maintenance
1300 /// paths (compressor / recovery undo) iterate them on purpose and do NOT
1301 /// use this predicate.
1302 #[inline]
1303 pub fn slot_is_live(&self, idx: usize) -> bool {
1304 match self.entries.get(idx) {
1305 Some(e) => {
1306 !(e.known_deleted
1307 || (e.expiration_time != 0
1308 && noxu_util::ttl::is_expired(
1309 e.expiration_time,
1310 self.expiration_in_hours,
1311 )))
1312 }
1313 None => false,
1314 }
1315 }
1316
1317 // ========================================================================
1318 // Key prefix compression helpers
1319 // IN.computeKeyPrefix / IN.recalcSuffixes / IN.getKey
1320 // ========================================================================
1321
1322 /// Strips embedded LN data from non-dirty slots, freeing the heap
1323 /// allocations of the per-slot value bytes while keeping the slot keys
1324 /// and LSNs addressable. Used by the evictor's PartialEvict path: a
1325 /// hot BIN is kept in cache so its descent path stays warm, but the LN
1326 /// data is dropped to make room for hotter content. Subsequent reads
1327 /// re-fetch the data from the log via the slot LSN.
1328 ///
1329 /// Skips slots that are still dirty (their data has not been written
1330 /// to the log yet, so dropping the in-memory copy would lose the
1331 /// update). Returns the number of bytes freed (sum of the lengths
1332 /// of the dropped `Vec<u8>` data fields).
1333 ///
1334 /// Returns 0 if the BIN has any open cursors (the cursor may be
1335 /// reading the data right now).
1336 pub fn strip_lns(&mut self) -> usize {
1337 if self.cursor_count > 0 {
1338 return 0;
1339 }
1340 let mut freed = 0usize;
1341 for idx in 0..self.entries.len() {
1342 // JE BIN.evictLNs / LN.isEvictable (LN.java:263 returns true): an
1343 // LN's in-memory value can be stripped whenever it is recoverable
1344 // from the log — i.e. the slot has a valid (logged) LSN — REGARDLESS
1345 // of the dirty bit. The dirty bit governs whether the BIN's
1346 // *structure* needs re-logging at the next checkpoint (BIN-delta vs
1347 // full BIN), NOT whether the LN *value* is durable: a transactional
1348 // commit logs the LN, so the slot's LSN points at the durable copy
1349 // even while the slot is still dirty. Gating the strip on `!dirty`
1350 // (the previous behaviour) meant a freshly-written, not-yet-
1351 // checkpointed record — the common case under a write/recently-read
1352 // workload — could never be stripped, so eviction reclaimed almost
1353 // nothing under pressure (EVICTOR-RECLAIM-1). A slot with a NULL/
1354 // transient LSN (a deferred-write LN never logged) is NOT
1355 // strippable — its only copy is the in-memory value.
1356 if self.get_lsn(idx) == NULL_LSN {
1357 continue;
1358 }
1359 if let Some(data) = self.entries[idx].data.take() {
1360 freed = freed.saturating_add(data.len());
1361 }
1362 }
1363 freed
1364 }
1365
1366 /// Reconstruct the full key for slot `idx` by prepending the BIN's
1367 /// current prefix to the stored suffix.
1368 ///
1369 /// `IN.getKey(int idx)`.
1370 pub fn get_full_key(&self, idx: usize) -> Option<Vec<u8>> {
1371 if idx >= self.keys.len() {
1372 return None;
1373 }
1374 let suffix = self.keys.get(idx); // T-2
1375 if self.key_prefix.is_empty() {
1376 Some(suffix.to_vec())
1377 } else {
1378 let mut full =
1379 Vec::with_capacity(self.key_prefix.len() + suffix.len());
1380 full.extend_from_slice(&self.key_prefix);
1381 full.extend_from_slice(suffix);
1382 Some(full)
1383 }
1384 }
1385
    /// Borrow the stored (post-prefix) suffix at slot `idx` (`INKeyRep.get`).
    ///
    /// This is the suffix only; use `get_full_key` for the key with
    /// `key_prefix` prepended.
    ///
    /// # Panics
    /// Panics if `idx` is out of range (the key rep indexes directly).
    #[inline]
    pub fn get_key(&self, idx: usize) -> &[u8] {
        self.keys.get(idx)
    }
1391
1392 /// T-2: insert a new slot at `idx` keeping the parallel `entries`, `keys`,
1393 /// and `lsn_rep` arrays in lock step. `suffix` is the post-prefix key.
1394 fn insert_slot(
1395 &mut self,
1396 idx: usize,
1397 suffix: Vec<u8>,
1398 lsn: Lsn,
1399 data: Option<Vec<u8>>,
1400 ) {
1401 self.entries.insert(
1402 idx,
1403 BinEntry {
1404 data,
1405 known_deleted: false,
1406 dirty: true,
1407 expiration_time: 0,
1408 },
1409 );
1410 self.keys.insert(idx, suffix); // T-2
1411 let n = self.entries.len();
1412 self.lsn_rep.insert_shift(idx, n); // T-3
1413 self.lsn_rep.set(idx, lsn, n);
1414 }
1415
1416 /// Decompress a stored suffix back to a full key.
1417 ///
1418 /// `IN.getKey` used from outside: prepend `key_prefix` to
1419 /// `suffix`. If `key_prefix` is empty the suffix *is* the full key.
1420 pub fn decompress_key(&self, suffix: &[u8]) -> Vec<u8> {
1421 if self.key_prefix.is_empty() {
1422 suffix.to_vec()
1423 } else {
1424 let mut full =
1425 Vec::with_capacity(self.key_prefix.len() + suffix.len());
1426 full.extend_from_slice(&self.key_prefix);
1427 full.extend_from_slice(suffix);
1428 full
1429 }
1430 }
1431
1432 /// Strip the current prefix from a full key to obtain the stored suffix.
1433 ///
1434 /// `IN.computeKeySuffix(byte[] prefix, byte[] key)`.
1435 ///
1436 /// # Panics
1437 /// Panics (debug only) if `full_key` does not start with `key_prefix`.
1438 pub fn compress_key(&self, full_key: &[u8]) -> Vec<u8> {
1439 let plen = self.key_prefix.len();
1440 if plen == 0 {
1441 full_key.to_vec()
1442 } else {
1443 debug_assert!(
1444 full_key.starts_with(&self.key_prefix),
1445 "compress_key: key does not start with current prefix"
1446 );
1447 full_key[plen..].to_vec()
1448 }
1449 }
1450
1451 /// Compute the longest common prefix of all full keys currently in this
1452 /// BIN, optionally excluding the entry at `exclude_idx` (used during
1453 /// insertions to ignore the slot that is about to be replaced).
1454 ///
1455 /// Returns an empty `Vec` if the BIN has fewer than 2 entries or if the
1456 /// keys share no common leading bytes.
1457 ///
1458 /// `IN.computeKeyPrefix(int excludeIdx)`.
1459 pub fn compute_key_prefix(&self, exclude_idx: Option<usize>) -> Vec<u8> {
1460 // Need at least 2 entries to find a common prefix.
1461 let n = self.keys.len();
1462 if n < 2 {
1463 return Vec::new();
1464 }
1465
1466 // Pick the first non-excluded index as the seed.
1467 let first_idx = match exclude_idx {
1468 Some(0) => 1,
1469 _ => 0,
1470 };
1471
1472 // The current prefix_len is taken from the seed full key.
1473 let seed_full = match self.get_full_key(first_idx) {
1474 Some(k) => k,
1475 None => return Vec::new(),
1476 };
1477 let mut prefix_len = seed_full.len();
1478
1479 // Compare every other non-excluded entry against the running prefix.
1480 // Iterate all entries (byteOrdered disabled in too).
1481 for i in (first_idx + 1)..n {
1482 if let Some(ex) = exclude_idx
1483 && i == ex
1484 {
1485 continue;
1486 }
1487 let full_key = match self.get_full_key(i) {
1488 Some(k) => k,
1489 None => continue,
1490 };
1491 let new_len =
1492 get_key_prefix_length(&seed_full[..prefix_len], &full_key);
1493 if new_len < prefix_len {
1494 prefix_len = new_len;
1495 }
1496 if prefix_len == 0 {
1497 return Vec::new();
1498 }
1499 }
1500
1501 seed_full[..prefix_len].to_vec()
1502 }
1503
1504 /// Recompute the key prefix from scratch and re-encode every stored suffix.
1505 ///
1506 /// Call this after bulk inserts, splits, or merges.
1507 ///
1508 /// `IN.recalcKeyPrefix()` → `IN.recalcSuffixes(newPrefix, …)`.
1509 pub fn recompute_key_prefix(&mut self) {
1510 let new_prefix = self.compute_key_prefix(None);
1511 self.apply_new_prefix(new_prefix);
1512 }
1513
1514 /// Apply `new_prefix` as the BIN's key prefix, re-encoding all stored
1515 /// suffixes from the old prefix into the new one.
1516 ///
1517 /// This is the Rust.
1518 fn apply_new_prefix(&mut self, new_prefix: Vec<u8>) {
1519 // Reconstruct all full keys (using old prefix), then re-encode with
1520 // the new prefix.
1521 let full_keys: Vec<Vec<u8>> = (0..self.keys.len())
1522 .map(|i| self.get_full_key(i).unwrap_or_default())
1523 .collect();
1524
1525 self.key_prefix = new_prefix;
1526
1527 // T-2: re-encode every suffix into the key rep, then re-attempt
1528 // compaction (a smaller prefix may make all suffixes fit MaxKeySize).
1529 for (i, full_key) in full_keys.into_iter().enumerate() {
1530 let suffix = self.compress_key(&full_key);
1531 self.keys.set(i, suffix);
1532 }
1533 self.keys.compact(self.compact_max_key_length);
1534 }
1535
1536 /// Binary-search this BIN for `full_key` (a full, uncompressed key).
1537 ///
1538 /// The stored suffixes are compared after stripping the current prefix
1539 /// from `full_key`, so the search is done entirely in suffix-space — no
1540 /// heap allocation needed in the happy path.
1541 ///
1542 /// Returns `(idx, exact)` where:
1543 /// - `idx` is the slot index (or insertion point when `exact == false`).
1544 /// - `exact` is `true` when an exact match was found.
1545 ///
1546 /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
1547 pub fn find_entry_compressed(&self, full_key: &[u8]) -> (usize, bool) {
1548 let plen = self.key_prefix.len();
1549 // Check that the key shares the current prefix; if not it cannot be
1550 // present and we return the appropriate insertion point.
1551 if plen > 0
1552 && (full_key.len() < plen
1553 || &full_key[..plen] != self.key_prefix.as_slice())
1554 {
1555 // The key does not share the current prefix.
1556 // Determine insertion point using full-key comparison.
1557 let pos = self.key_partition_point(|s| {
1558 self.decompress_key(s).as_slice() < full_key
1559 });
1560 return (pos, false);
1561 }
1562 let suffix = &full_key[plen..];
1563 // T-2: binary search over the node-level key rep (suffix space).
1564 match self.key_binary_search(suffix) {
1565 Ok(idx) => (idx, true),
1566 Err(idx) => (idx, false),
1567 }
1568 }
1569
1570 /// Binary search the key rep for `suffix` (suffix space, unsigned bytes).
1571 /// Mirrors `Vec::binary_search_by(|e| e.key.cmp(suffix))` over the
1572 /// node-level `KeyRep` (T-2).
1573 #[inline]
1574 fn key_binary_search(&self, suffix: &[u8]) -> Result<usize, usize> {
1575 let mut lo = 0usize;
1576 let mut hi = self.keys.len();
1577 while lo < hi {
1578 let mid = lo + (hi - lo) / 2;
1579 match self.keys.get(mid).cmp(suffix) {
1580 std::cmp::Ordering::Less => lo = mid + 1,
1581 std::cmp::Ordering::Greater => hi = mid,
1582 std::cmp::Ordering::Equal => return Ok(mid),
1583 }
1584 }
1585 Err(lo)
1586 }
1587
1588 /// `slice::partition_point` over the node-level key rep suffixes (T-2):
1589 /// the index of the first slot for which `pred(suffix)` is false.
1590 #[inline]
1591 fn key_partition_point(
1592 &self,
1593 mut pred: impl FnMut(&[u8]) -> bool,
1594 ) -> usize {
1595 let mut lo = 0usize;
1596 let mut hi = self.keys.len();
1597 while lo < hi {
1598 let mid = lo + (hi - lo) / 2;
1599 if pred(self.keys.get(mid)) {
1600 lo = mid + 1;
1601 } else {
1602 hi = mid;
1603 }
1604 }
1605 lo
1606 }
1607
1608 /// Insert or update a full (uncompressed) key in this BIN.
1609 ///
1610 /// After insertion the key prefix is recomputed; if the prefix changes all
1611 /// stored suffixes are re-encoded.
1612 ///
1613 /// Returns `(slot_index, is_new_insert)`.
1614 ///
1615 /// `IN.setKey` / BIN insert path.
1616 pub fn insert_with_prefix(
1617 &mut self,
1618 full_key: Vec<u8>,
1619 lsn: Lsn,
1620 data: Option<Vec<u8>>,
1621 ) -> (usize, bool) {
1622 // Is the current prefix still compatible with this key?
1623 let plen = self.key_prefix.len();
1624 let new_len = if plen > 0 {
1625 get_key_prefix_length(&self.key_prefix, &full_key)
1626 } else {
1627 0
1628 };
1629
1630 // If the new key shrinks the prefix we must re-encode everything first.
1631 if plen > 0 && new_len < plen {
1632 // Compute new prefix considering the incoming key and
1633 // all existing full keys. We pass `None` for exclude_idx because
1634 // the slot for this key does not yet exist.
1635 let mut candidate = self.compute_key_prefix(None);
1636 // Also constrain by the new key itself.
1637 if !candidate.is_empty() {
1638 let cl = get_key_prefix_length(&candidate, &full_key);
1639 candidate.truncate(cl);
1640 } else {
1641 // No existing prefix; try to build one from the new key
1642 // against the existing full keys.
1643 if !self.entries.is_empty()
1644 && let Some(first_full) = self.get_full_key(0)
1645 {
1646 candidate = create_key_prefix(&first_full, &full_key)
1647 .unwrap_or_default();
1648 for i in 1..self.entries.len() {
1649 if candidate.is_empty() {
1650 break;
1651 }
1652 if let Some(fk) = self.get_full_key(i) {
1653 let l = get_key_prefix_length(&candidate, &fk);
1654 candidate.truncate(l);
1655 }
1656 }
1657 }
1658 }
1659 self.apply_new_prefix(candidate);
1660 }
1661
1662 // Compress the new key under the (possibly updated) prefix.
1663 let suffix = self.compress_key(&full_key);
1664
1665 match self.key_binary_search(&suffix) {
1666 Ok(idx) => {
1667 // Key exists — update in place.
1668 self.set_lsn(idx, lsn); // T-3
1669 self.entries[idx].data = data;
1670 // Mark slot dirty: this slot changed since the last full BIN log.
1671 // `IN.setDirtyEntry(idx)`.
1672 self.entries[idx].dirty = true;
1673 (idx, false)
1674 }
1675 Err(idx) => {
1676 // New key — insert in sorted position.
1677 // New slots start dirty: they have never been logged in any BIN.
1678 // `IN.setDirtyEntry(idx)` called after `insertEntry`.
1679 self.insert_slot(idx, suffix, lsn, data);
1680 // After insertion, if there is no prefix yet, try to establish one.
1681 if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1682 self.recompute_key_prefix();
1683 }
1684 (idx, true)
1685 }
1686 }
1687 }
1688
1689 /// Slice-based variant of [`BinStub::insert_with_prefix`] for the recovery redo path.
1690 ///
1691 /// Accepts `key` and `data` as `&[u8]` slices instead of owned `Vec<u8>`,
1692 /// eliminating the intermediate `Vec<u8>` that `redo_ln` would otherwise
1693 /// allocate before crossing the BIN boundary. The compressed suffix and
1694 /// the data bytes are each copied into the `BinEntry` exactly once.
1695 ///
1696 /// Semantics are identical to `insert_with_prefix`:
1697 /// - Updates the slot in place when the key already exists.
1698 /// - Inserts a new sorted entry when absent, recomputing the key prefix.
1699 ///
1700 /// Wave 11-K optimisation (Fix 1).
1701 pub fn insert_with_prefix_slice(
1702 &mut self,
1703 full_key: &[u8],
1704 lsn: Lsn,
1705 data: Option<&[u8]>,
1706 ) -> (usize, bool) {
1707 let plen = self.key_prefix.len();
1708 let new_len = if plen > 0 {
1709 get_key_prefix_length(&self.key_prefix, full_key)
1710 } else {
1711 0
1712 };
1713
1714 if plen > 0 && new_len < plen {
1715 let mut candidate = self.compute_key_prefix(None);
1716 if !candidate.is_empty() {
1717 let cl = get_key_prefix_length(&candidate, full_key);
1718 candidate.truncate(cl);
1719 } else {
1720 if !self.entries.is_empty()
1721 && let Some(first_full) = self.get_full_key(0)
1722 {
1723 candidate = create_key_prefix(&first_full, full_key)
1724 .unwrap_or_default();
1725 for i in 1..self.entries.len() {
1726 if candidate.is_empty() {
1727 break;
1728 }
1729 if let Some(fk) = self.get_full_key(i) {
1730 let l = get_key_prefix_length(&candidate, &fk);
1731 candidate.truncate(l);
1732 }
1733 }
1734 }
1735 }
1736 self.apply_new_prefix(candidate);
1737 }
1738
1739 let suffix = self.compress_key(full_key);
1740
1741 match self.key_binary_search(&suffix) {
1742 Ok(idx) => {
1743 self.set_lsn(idx, lsn); // T-3
1744 self.entries[idx].data = data.map(|d| d.to_vec());
1745 self.entries[idx].dirty = true;
1746 (idx, false)
1747 }
1748 Err(idx) => {
1749 self.insert_slot(idx, suffix, lsn, data.map(|d| d.to_vec()));
1750 if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1751 self.recompute_key_prefix();
1752 }
1753 (idx, true)
1754 }
1755 }
1756 }
1757
1758 /// Returns the number of slots that are marked dirty.
1759 ///
1760 /// `BIN.getNumDirtyEntries()`.
1761 pub fn dirty_count(&self) -> usize {
1762 self.entries.iter().filter(|e| e.dirty).count()
1763 }
1764
1765 /// Decide whether to log this BIN as a delta (true) or a full BIN (false).
1766 ///
1767 /// Faithful port of JE `BIN.shouldLogDelta()` (BIN.java:1892). The
1768 /// decision is COUNT-based (number of would-be delta slots vs a percent of
1769 /// `nEntries`), NOT a dirty-fraction-vs-hardcoded-0.25 heuristic:
1770 ///
1771 /// ```text
1772 /// if (isBINDelta()) { return true; } // already a delta
1773 /// if (isDeltaProhibited()) return false; // prohibit / no prior full
1774 /// numDeltas = getNDeltas();
1775 /// if (numDeltas <= 0) return false; // empty delta is invalid
1776 /// deltaLimit = (getNEntries() * binDeltaPercent) / 100; // INTEGER math
1777 /// return numDeltas <= deltaLimit;
1778 /// ```
1779 ///
1780 /// `numDeltas` (JE `getNDeltas`) is the count of slots that would appear in
1781 /// the delta — i.e. the dirty slots since the last full BIN — which here is
1782 /// `dirty_count()`. `binDeltaPercent` is the CONFIGURABLE `TREE_BIN_DELTA`
1783 /// param (JE `DatabaseImpl.getBinDeltaPercent()`, default 25), threaded in
1784 /// by the checkpointer — NOT a hardcoded constant.
1785 ///
1786 /// `isDeltaProhibited()` (BIN.java:1867) is
1787 /// `getProhibitNextDelta() || isDeferredWriteMode() || lastFullLsn == NULL`.
1788 /// Deferred-write mode is not modelled in the runtime stub; the other two
1789 /// terms are.
1790 ///
1791 /// JE ref: `BIN.shouldLogDelta` (BIN.java:1892), `BIN.isDeltaProhibited`
1792 /// (BIN.java:1867).
1793 pub fn should_log_delta(&self, bin_delta_percent: i32) -> bool {
1794 // Already a delta: re-log as a delta. JE asserts !prohibitNextDelta
1795 // and lastFullLsn != NULL here.
1796 if self.is_delta {
1797 return self.last_full_lsn != NULL_LSN && !self.prohibit_next_delta;
1798 }
1799
1800 // isDeltaProhibited(): cheapest checks first.
1801 if self.prohibit_next_delta || self.last_full_lsn == NULL_LSN {
1802 return false;
1803 }
1804
1805 // numDeltas = getNDeltas(): the dirty slots that would be in the delta.
1806 let num_deltas = self.dirty_count() as i32;
1807
1808 // A delta with zero items is not valid.
1809 if num_deltas <= 0 {
1810 return false;
1811 }
1812
1813 // Configured BinDeltaPercent limit — INTEGER math, exactly as JE.
1814 let delta_limit = (self.entries.len() as i32 * bin_delta_percent) / 100;
1815 num_deltas <= delta_limit
1816 }
1817
1818 /// Comparator-aware binary search: finds `full_key` using `cmp`.
1819 ///
1820 /// Unlike `find_entry_compressed` (which uses suffix-based lexicographic
1821 /// comparison), this decompresses each entry's key to its full form and
1822 /// applies the provided comparator — required for sorted-dup databases
1823 /// where lexicographic suffix comparison would give wrong results when
1824 /// different-length primary keys are in the same BIN.
1825 ///
1826 /// Returns `(idx, exact)`. Does NOT do prefix compression.
1827 ///
1828 /// `IN.findEntry` with btreeComparator active.
1829 pub fn find_entry_cmp(
1830 &self,
1831 full_key: &[u8],
1832 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1833 ) -> (usize, bool) {
1834 // Hot path: avoid per-comparison Vec<u8> allocation.
1835 // When key_prefix is empty the stored suffix IS the full key, so we
1836 // pass the suffix slice directly. When prefix is non-empty we build a
1837 // temporary concatenation only once per comparison using a small
1838 // stack-local Vec that is dropped immediately after the call — this
1839 // still allocates but is limited to O(key_len) bytes per call and
1840 // avoids retaining any heap state between comparisons.
1841 if self.key_prefix.is_empty() {
1842 match self.key_binary_search_by(|s| cmp(s, full_key)) {
1843 Ok(idx) => (idx, true),
1844 Err(idx) => (idx, false),
1845 }
1846 } else {
1847 let prefix = self.key_prefix.as_slice();
1848 match self.key_binary_search_by(|s| {
1849 let mut fk = Vec::with_capacity(prefix.len() + s.len());
1850 fk.extend_from_slice(prefix);
1851 fk.extend_from_slice(s);
1852 cmp(&fk, full_key)
1853 }) {
1854 Ok(idx) => (idx, true),
1855 Err(idx) => (idx, false),
1856 }
1857 }
1858 }
1859
1860 /// Comparator-driven binary search over the node-level key rep (T-2).
1861 /// `cmp(stored_suffix)` returns how the stored slot compares to the
1862 /// search key.
1863 #[inline]
1864 fn key_binary_search_by(
1865 &self,
1866 mut cmp: impl FnMut(&[u8]) -> std::cmp::Ordering,
1867 ) -> Result<usize, usize> {
1868 let mut lo = 0usize;
1869 let mut hi = self.keys.len();
1870 while lo < hi {
1871 let mid = lo + (hi - lo) / 2;
1872 match cmp(self.keys.get(mid)) {
1873 std::cmp::Ordering::Less => lo = mid + 1,
1874 std::cmp::Ordering::Greater => hi = mid,
1875 std::cmp::Ordering::Equal => return Ok(mid),
1876 }
1877 }
1878 Err(lo)
1879 }
1880
1881 /// Returns the LSN of the slot matching `full_key`, if one exists.
1882 ///
1883 /// Used by the recovery LN-redo apply to enforce JE's currency check
1884 /// (`RecoveryManager.redo()` line ~2512): a logged LN is applied only
1885 /// when `logrecLsn > treeLsn`. Returns `None` when the key is absent
1886 /// (always apply). Uses the same lookup variant the matching insert
1887 /// path uses so the comparison is over the right slot.
1888 pub fn redo_slot_lsn(
1889 &self,
1890 full_key: &[u8],
1891 cmp: Option<&dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering>,
1892 key_prefixing: bool,
1893 ) -> Option<Lsn> {
1894 let (idx, found) = match cmp {
1895 Some(c) => self.find_entry_cmp(full_key, c),
1896 None if key_prefixing => self.find_entry_compressed(full_key),
1897 None => {
1898 // insert_raw path: full keys stored verbatim.
1899 match self.key_binary_search(full_key) {
1900 Ok(idx) => (idx, true),
1901 Err(idx) => (idx, false),
1902 }
1903 }
1904 };
1905 if found { Some(self.get_lsn(idx)) } else { None }
1906 }
1907
1908 /// Raw insert (no prefix compression) for databases with
1909 /// `key_prefixing = false`.
1910 ///
1911 /// JE `IN.computeKeyPrefix` returns `null` when
1912 /// `databaseImpl.getKeyPrefixing()` is `false`, so no prefix is ever
1913 /// set on those BINs. Noxu was previously ignoring the flag and always
1914 /// calling `insert_with_prefix`; this method provides the faithful path.
1915 ///
1916 /// The key is stored verbatim (no suffix stripping). An existing
1917 /// `key_prefix` on the BIN is left untouched; callers must ensure it is
1918 /// empty (split_child already guarantees this for new BINs when
1919 /// `key_prefixing = false`).
1920 ///
1921 /// Returns `(slot_index, is_new_insert)`.
1922 ///
1923 /// Ref: `IN.java computeKeyPrefix` ~line 2456,
1924 /// `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
1925 pub fn insert_raw(
1926 &mut self,
1927 full_key: Vec<u8>,
1928 lsn: Lsn,
1929 data: Option<Vec<u8>>,
1930 ) -> (usize, bool) {
1931 // Binary search on the stored (full) keys.
1932 // When key_prefix is empty entries store full keys directly; for
1933 // key_prefixing=false DBs the prefix is always empty.
1934 match self.key_binary_search(full_key.as_slice()) {
1935 Ok(idx) => {
1936 self.set_lsn(idx, lsn); // T-3
1937 self.entries[idx].data = data;
1938 self.entries[idx].dirty = true;
1939 (idx, false)
1940 }
1941 Err(idx) => {
1942 self.insert_slot(idx, full_key, lsn, data);
1943 (idx, true)
1944 }
1945 }
1946 }
1947
1948 /// Comparator-aware insert: inserts `full_key` into the BIN using `cmp`.
1949 ///
1950 /// Prefix compression is DISABLED: the key is stored as-is. This is
1951 /// intentional for sorted-dup databases where the custom comparator
1952 /// requires full-key access at every comparison.
1953 ///
1954 /// Returns `(slot_index, is_new_insert)`.
1955 ///
1956 pub fn insert_cmp(
1957 &mut self,
1958 full_key: Vec<u8>,
1959 lsn: Lsn,
1960 data: Option<Vec<u8>>,
1961 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1962 ) -> (usize, bool) {
1963 if self.key_prefix.is_empty() {
1964 match self.key_binary_search_by(|s| cmp(s, &full_key)) {
1965 Ok(idx) => {
1966 self.set_lsn(idx, lsn); // T-3
1967 self.entries[idx].data = data;
1968 self.entries[idx].dirty = true;
1969 (idx, false)
1970 }
1971 Err(idx) => {
1972 self.insert_slot(idx, full_key, lsn, data);
1973 (idx, true)
1974 }
1975 }
1976 } else {
1977 let prefix = self.key_prefix.clone();
1978 match self.key_binary_search_by(|s| {
1979 let mut fk = Vec::with_capacity(prefix.len() + s.len());
1980 fk.extend_from_slice(&prefix);
1981 fk.extend_from_slice(s);
1982 cmp(&fk, &full_key)
1983 }) {
1984 Ok(idx) => {
1985 // Key exists — update in place.
1986 self.set_lsn(idx, lsn); // T-3
1987 self.entries[idx].data = data;
1988 self.entries[idx].dirty = true;
1989 (idx, false)
1990 }
1991 Err(idx) => {
1992 // New key — insert at sorted position (no prefix compression).
1993 self.insert_slot(idx, full_key, lsn, data);
1994 (idx, true)
1995 }
1996 }
1997 }
1998 }
1999
2000 /// Comparator-aware delete: removes `full_key` from the BIN using `cmp`.
2001 ///
2002 /// Returns `true` if the entry was found and removed.
2003 pub fn delete_cmp(
2004 &mut self,
2005 full_key: &[u8],
2006 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
2007 ) -> bool {
2008 let result = if self.key_prefix.is_empty() {
2009 self.key_binary_search_by(|s| cmp(s, full_key))
2010 } else {
2011 let prefix = self.key_prefix.clone();
2012 self.key_binary_search_by(|s| {
2013 let mut fk = Vec::with_capacity(prefix.len() + s.len());
2014 fk.extend_from_slice(&prefix);
2015 fk.extend_from_slice(s);
2016 cmp(&fk, full_key)
2017 })
2018 };
2019 match result {
2020 Ok(idx) => {
2021 self.entries.remove(idx);
2022 self.keys.remove(idx); // T-2
2023 self.lsn_rep.remove_shift(idx); // T-3
2024 self.dirty = true;
2025 true
2026 }
2027 Err(_) => false,
2028 }
2029 }
2030
2031 /// Serialise ALL entries (full BIN write).
2032 ///
2033 /// Format (per slot): key_len(u32BE) | key | lsn(u64BE) |
2034 /// has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2035 ///
2036 /// Prepended by: node_id(u64BE) | num_entries(u32BE).
2037 ///
2038 /// `BIN.writeToLog()` (non-delta path).
2039 pub fn serialize_full(&self) -> Vec<u8> {
2040 let mut buf = Vec::new();
2041 buf.extend_from_slice(&self.node_id.to_be_bytes());
2042 buf.extend_from_slice(&(self.entries.len() as u32).to_be_bytes());
2043 for i in 0..self.entries.len() {
2044 let full_key = self.get_full_key(i).unwrap_or_default();
2045 buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2046 buf.extend_from_slice(&full_key);
2047 let lsn = self.get_lsn(i); // T-3
2048 let e = &self.entries[i];
2049 buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2050 if let Some(d) = &e.data {
2051 buf.push(1u8);
2052 buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2053 buf.extend_from_slice(d);
2054 } else {
2055 buf.push(0u8);
2056 }
2057 buf.push(e.known_deleted as u8);
2058 }
2059 buf
2060 }
2061
2062 /// Serialise only dirty slots (BIN-delta write).
2063 ///
2064 /// Format (per dirty slot): slot_idx(u32BE) | key_len(u32BE) | key |
2065 /// lsn(u64BE) | has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2066 ///
2067 /// Prepended by: node_id(u64BE) | num_dirty(u32BE).
2068 ///
2069 /// `BIN.writeToLog()` (delta path).
2070 pub fn serialize_delta(&self) -> Vec<u8> {
2071 let dirty: Vec<usize> = (0..self.entries.len())
2072 .filter(|&i| self.entries[i].dirty)
2073 .collect();
2074 let mut buf = Vec::new();
2075 buf.extend_from_slice(&self.node_id.to_be_bytes());
2076 buf.extend_from_slice(&(dirty.len() as u32).to_be_bytes());
2077 for idx in dirty {
2078 buf.extend_from_slice(&(idx as u32).to_be_bytes());
2079 let full_key = self.get_full_key(idx).unwrap_or_default();
2080 buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2081 buf.extend_from_slice(&full_key);
2082 let lsn = self.get_lsn(idx); // T-3
2083 let e = &self.entries[idx];
2084 buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2085 if let Some(d) = &e.data {
2086 buf.push(1u8);
2087 buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2088 buf.extend_from_slice(d);
2089 } else {
2090 buf.push(0u8);
2091 }
2092 buf.push(e.known_deleted as u8);
2093 }
2094 buf
2095 }
2096
2097 /// Deserialise a full BIN from the bytes produced by `serialize_full()`.
2098 ///
2099 /// Returns a `BinStub` with all entries populated and all slots marked
2100 /// clean (they are already on disk at `last_full_lsn`). Returns `None`
2101 /// if the byte slice is malformed.
2102 ///
2103 /// `INLogEntry.readEntry()` / `IN.readFromLog()` (non-delta).
2104 pub fn deserialize_full(bytes: &[u8]) -> Option<BinStub> {
2105 if bytes.len() < 12 {
2106 return None;
2107 }
2108 let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
2109 let num_entries =
2110 u32::from_be_bytes(bytes[8..12].try_into().ok()?) as usize;
2111 let mut pos = 12usize;
2112 let mut entries = Vec::with_capacity(num_entries);
2113 let mut lsns: Vec<Lsn> = Vec::with_capacity(num_entries);
2114 let mut keys: Vec<Vec<u8>> = Vec::with_capacity(num_entries); // T-2
2115 for _ in 0..num_entries {
2116 // key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len(u32BE) | data] | known_deleted(u8)
2117 if pos + 4 > bytes.len() {
2118 return None;
2119 }
2120 let key_len =
2121 u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2122 as usize;
2123 pos += 4;
2124 if pos + key_len > bytes.len() {
2125 return None;
2126 }
2127 let key = bytes[pos..pos + key_len].to_vec();
2128 pos += key_len;
2129 if pos + 8 > bytes.len() {
2130 return None;
2131 }
2132 let lsn = Lsn::from_u64(u64::from_be_bytes(
2133 bytes[pos..pos + 8].try_into().ok()?,
2134 ));
2135 pos += 8;
2136 if pos + 1 > bytes.len() {
2137 return None;
2138 }
2139 let has_data = bytes[pos] != 0;
2140 pos += 1;
2141 let data = if has_data {
2142 if pos + 4 > bytes.len() {
2143 return None;
2144 }
2145 let data_len =
2146 u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2147 as usize;
2148 pos += 4;
2149 if pos + data_len > bytes.len() {
2150 return None;
2151 }
2152 let d = bytes[pos..pos + data_len].to_vec();
2153 pos += data_len;
2154 Some(d)
2155 } else {
2156 None
2157 };
2158 if pos + 1 > bytes.len() {
2159 return None;
2160 }
2161 let known_deleted = bytes[pos] != 0;
2162 pos += 1;
2163 entries.push(BinEntry {
2164 data,
2165 known_deleted,
2166 dirty: false, // freshly loaded from log — clean
2167 expiration_time: 0,
2168 });
2169 keys.push(key); // T-2 (full keys; recompute_key_prefix compresses)
2170 lsns.push(lsn); // T-3
2171 }
2172 // Keys stored in the serialized format are full (uncompressed) keys.
2173 // Re-establish the key prefix after loading so that memory use and
2174 // search performance match an in-memory BIN.
2175 // `IN.readFromLog()` → key prefix is part of the wire
2176 // format in the; in Noxu we store full keys and recompute on load.
2177 let mut bin = BinStub {
2178 node_id,
2179 level: BIN_LEVEL,
2180 entries,
2181 key_prefix: Vec::new(),
2182 dirty: false,
2183 is_delta: false,
2184 last_full_lsn: NULL_LSN, // caller sets this to the logged LSN
2185 last_delta_lsn: NULL_LSN,
2186 generation: 0,
2187 parent: None,
2188 expiration_in_hours: true,
2189 cursor_count: 0,
2190 prohibit_next_delta: false,
2191 lsn_rep: LsnRep::from_lsns(&lsns), // T-3
2192 keys: KeyRep::from_keys(keys), // T-2 (full keys, no prefix yet)
2193 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
2194 };
2195 // Recompute key prefix from the full keys just loaded.
2196 // `IN.recalcKeyPrefix()` called after materializing from log.
2197 if bin.entries.len() >= 2 {
2198 bin.recompute_key_prefix();
2199 } else {
2200 // Even a single-slot BIN should attempt compaction.
2201 bin.keys.compact(bin.compact_max_key_length);
2202 }
2203 Some(bin)
2204 }
2205
2206 /// Deserialise a BIN delta from the bytes produced by `serialize_delta()`.
2207 ///
2208 /// **DO NOT USE for BIN reconstruction.** This helper writes full
2209 /// (uncompressed) keys directly into slots without recomputing the BIN
2210 /// key prefix, so on a prefix-compressed BIN it corrupts the slot keys and
2211 /// breaks the sorted-suffix invariant. It is NOT wired into any live path.
2212 /// The correct delta-reconstruction path is
2213 /// `mutate_to_full_bin` → `apply_delta_to_bin` → `insert_with_prefix`,
2214 /// which recomputes the prefix. This function is retained only for the
2215 /// raw byte-format round-trip and must not be used to reconstitute a BIN.
2216 /// Tracked for removal — see the v3.x review synthesis (storage C-2).
2217 ///
2218 /// Returns `None` if `delta_bytes` is malformed.
2219 pub fn apply_delta(base: &mut BinStub, delta_bytes: &[u8]) -> Option<()> {
2220 if delta_bytes.len() < 12 {
2221 return None;
2222 }
2223 // node_id(u64BE) — must match base
2224 let _node_id = u64::from_be_bytes(delta_bytes[0..8].try_into().ok()?);
2225 let num_dirty =
2226 u32::from_be_bytes(delta_bytes[8..12].try_into().ok()?) as usize;
2227 let mut pos = 12usize;
2228 for _ in 0..num_dirty {
2229 // slot_idx(u32BE) | key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len | data] | known_deleted(u8)
2230 if pos + 4 > delta_bytes.len() {
2231 return None;
2232 }
2233 let slot_idx =
2234 u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2235 as usize;
2236 pos += 4;
2237 if pos + 4 > delta_bytes.len() {
2238 return None;
2239 }
2240 let key_len =
2241 u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2242 as usize;
2243 pos += 4;
2244 if pos + key_len > delta_bytes.len() {
2245 return None;
2246 }
2247 let key = delta_bytes[pos..pos + key_len].to_vec();
2248 pos += key_len;
2249 if pos + 8 > delta_bytes.len() {
2250 return None;
2251 }
2252 let lsn = Lsn::from_u64(u64::from_be_bytes(
2253 delta_bytes[pos..pos + 8].try_into().ok()?,
2254 ));
2255 pos += 8;
2256 if pos + 1 > delta_bytes.len() {
2257 return None;
2258 }
2259 let has_data = delta_bytes[pos] != 0;
2260 pos += 1;
2261 let data = if has_data {
2262 if pos + 4 > delta_bytes.len() {
2263 return None;
2264 }
2265 let data_len = u32::from_be_bytes(
2266 delta_bytes[pos..pos + 4].try_into().ok()?,
2267 ) as usize;
2268 pos += 4;
2269 if pos + data_len > delta_bytes.len() {
2270 return None;
2271 }
2272 let d = delta_bytes[pos..pos + data_len].to_vec();
2273 pos += data_len;
2274 Some(d)
2275 } else {
2276 None
2277 };
2278 if pos + 1 > delta_bytes.len() {
2279 return None;
2280 }
2281 let known_deleted = delta_bytes[pos] != 0;
2282 pos += 1;
2283
2284 // Apply to base: update existing slot or insert new one.
2285 if slot_idx < base.entries.len() {
2286 base.keys.set(slot_idx, key); // T-2
2287 base.set_lsn(slot_idx, lsn); // T-3
2288 base.entries[slot_idx].data = data;
2289 base.entries[slot_idx].known_deleted = known_deleted;
2290 base.entries[slot_idx].dirty = false;
2291 } else {
2292 // Slot index beyond current length — append.
2293 base.entries.push(BinEntry {
2294 data,
2295 known_deleted,
2296 dirty: false,
2297 expiration_time: 0,
2298 });
2299 let n = base.entries.len();
2300 base.keys.insert(n - 1, key); // T-2
2301 base.lsn_rep.set(n - 1, lsn, n); // T-3
2302 }
2303 }
2304 Some(())
2305 }
2306
2307 /// Clear per-slot dirty flags and record `logged_at` as the LSN at which
2308 /// this BIN was last fully logged.
2309 ///
2310 /// Called by the checkpoint path after a successful full-BIN log write.
2311 /// `BIN.afterLog()` / `BIN.setLastFullLsn()`.
2312 pub fn clear_dirty_after_full_log(&mut self, logged_at: Lsn) {
2313 for e in &mut self.entries {
2314 e.dirty = false;
2315 }
2316 self.last_full_lsn = logged_at;
2317 self.dirty = false;
2318 // A full BIN captures all current state, so the delta-chain bound is
2319 // cleared: the next log may once again be a delta.
2320 // JE `IN.afterLog` clears the prohibit flag after a full log
2321 // (IN.java:5557 `bin.setProhibitNextDelta(false)`).
2322 self.prohibit_next_delta = false;
2323 }
2324
2325 /// Clear per-slot dirty flags after a successful delta log write.
2326 ///
2327 /// `last_full_lsn` is NOT updated — the full LSN only changes after a
2328 /// full BIN write.
2329 /// `BIN.afterLog()` (delta path).
2330 pub fn clear_dirty_after_delta_log(&mut self) {
2331 for e in &mut self.entries {
2332 e.dirty = false;
2333 }
2334 self.dirty = false;
2335 }
2336}
2337
2338impl TreeNode {
2339 /// Returns true if this is a BIN (bottom internal node).
2340 pub fn is_bin(&self) -> bool {
2341 matches!(self, TreeNode::Bottom(_))
2342 }
2343
2344 /// Returns the level of this node.
2345 pub fn level(&self) -> i32 {
2346 match self {
2347 TreeNode::Internal(n) => n.level,
2348 TreeNode::Bottom(b) => b.level,
2349 }
2350 }
2351
2352 /// Returns the node id of this node.
2353 pub fn node_id(&self) -> u64 {
2354 match self {
2355 TreeNode::Internal(n) => n.node_id,
2356 TreeNode::Bottom(b) => b.node_id,
2357 }
2358 }
2359
2360 /// Faithful in-memory heap footprint of this node, in bytes.
2361 ///
2362 /// JE `IN.getBudgetedMemorySize()` (IN.java) returns the running
2363 /// `inMemorySize` that `MemoryBudget` tracks for the node: the fixed
2364 /// IN/BIN struct overhead plus, per slot, the fixed entry overhead and the
2365 /// variable key (and embedded-LN data for BINs) bytes. This is the single
2366 /// source of truth for both the live tree accounting and the evictor's
2367 /// detach credit (EV-13) — keeping it on `TreeNode` avoids the formula
2368 /// drifting between `noxu-tree` and `noxu-evictor`.
2369 ///
2370 /// Rust has a fixed struct layout (unlike JE's `Sizeof`-measured JVM
2371 /// constants) so `size_of` is exact for the fixed overheads; the variable
2372 /// part mirrors JE's per-slot `entryKeys`/embedded-data accounting.
2373 pub fn budgeted_memory_size(&self) -> u64 {
2374 use std::mem::size_of;
2375 match self {
2376 TreeNode::Bottom(b) => {
2377 (size_of::<BinStub>()
2378 + b.entries.len() * size_of::<BinEntry>()
2379 + b.key_prefix.len()
2380 + b.keys.memory_size() // T-2: node-level key rep bytes
2381 + b.lsn_rep.memory_size() // T-3: node-level LSN rep bytes
2382 + b.entries
2383 .iter()
2384 .map(|e| {
2385 e.data.as_ref().map(|d| d.len()).unwrap_or(0)
2386 })
2387 .sum::<usize>()) as u64
2388 }
2389 TreeNode::Internal(n) => {
2390 (size_of::<InNodeStub>()
2391 + n.entries.len() * size_of::<InEntry>()
2392 + n.targets.memory_size()
2393 + n.entries.iter().map(|e| e.key.len()).sum::<usize>())
2394 as u64
2395 }
2396 }
2397 }
2398
2399 /// Binary search for a key in this node.
2400 ///
2401 /// For BIN nodes the search is prefix-aware: if the BIN has a key prefix,
2402 /// `key` (a full, uncompressed key) is compared against stored suffixes
2403 /// after stripping the prefix.
2404 /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
2405 ///
2406 /// Returns index with EXACT_MATCH flag set if exact match found.
2407 /// If exact is false, returns insertion point.
2408 pub fn find_entry(&self, key: &[u8], _indicator: bool, exact: bool) -> i32 {
2409 match self {
2410 TreeNode::Internal(n) => {
2411 let result = n
2412 .entries
2413 .binary_search_by(|entry| entry.key.as_slice().cmp(key));
2414 match result {
2415 Ok(idx) => (idx as i32) | EXACT_MATCH,
2416 Err(idx) => {
2417 if exact {
2418 -1
2419 } else {
2420 // Floor (not insertion point): the child slot to
2421 // descend into is the largest entry ≤ key. Slot 0
2422 // is the leftmost child, so a key below every
2423 // separator floors to 0. (St-H5: previously
2424 // returned the insertion point `idx`, which routes
2425 // one child too far right.)
2426 (idx as i32 - 1).max(0)
2427 }
2428 }
2429 }
2430 }
2431 TreeNode::Bottom(b) => {
2432 // Use prefix-aware search: the stored key is a suffix when
2433 // key_prefix is non-empty.
2434 let (idx, found) = b.find_entry_compressed(key);
2435 if found {
2436 (idx as i32) | EXACT_MATCH
2437 } else if exact {
2438 -1
2439 } else {
2440 idx as i32
2441 }
2442 }
2443 }
2444 }
2445
2446 /// Gets the number of entries in this node.
2447 pub fn get_n_entries(&self) -> usize {
2448 match self {
2449 TreeNode::Internal(n) => n.entries.len(),
2450 TreeNode::Bottom(b) => b.entries.len(),
2451 }
2452 }
2453
2454 // ========================================================================
2455 // Dirty flag
2456 // ========================================================================
2457
2458 /// Returns true if this node has been modified since last checkpoint.
2459 ///
2460 /// `IN.getDirty()`.
2461 pub fn is_dirty(&self) -> bool {
2462 match self {
2463 TreeNode::Internal(n) => n.dirty,
2464 TreeNode::Bottom(b) => b.dirty,
2465 }
2466 }
2467
2468 /// Sets or clears the dirty flag on this node.
2469 ///
2470 /// `IN.setDirty(boolean dirty)`.
2471 pub fn set_dirty(&mut self, dirty: bool) {
2472 match self {
2473 TreeNode::Internal(n) => n.dirty = dirty,
2474 TreeNode::Bottom(b) => b.dirty = dirty,
2475 }
2476 }
2477
2478 // ========================================================================
2479 // LRU generation
2480 // ========================================================================
2481
2482 /// Returns the LRU generation counter.
2483 ///
2484 /// `IN.getGeneration()`.
2485 pub fn get_generation(&self) -> u64 {
2486 match self {
2487 TreeNode::Internal(n) => n.generation,
2488 TreeNode::Bottom(b) => b.generation,
2489 }
2490 }
2491
2492 /// Sets the LRU generation counter.
2493 ///
2494 /// `IN.setGeneration(long gen)`.
2495 pub fn set_generation(&mut self, r#gen: u64) {
2496 match self {
2497 TreeNode::Internal(n) => n.generation = r#gen,
2498 TreeNode::Bottom(b) => b.generation = r#gen,
2499 }
2500 }
2501
2502 // ========================================================================
2503 // Parent pointer
2504 // ========================================================================
2505
2506 /// Returns a clone of the weak parent pointer, if any.
2507 pub fn get_parent(&self) -> Option<Weak<RwLock<TreeNode>>> {
2508 match self {
2509 TreeNode::Internal(n) => n.parent.clone(),
2510 TreeNode::Bottom(b) => b.parent.clone(),
2511 }
2512 }
2513
2514 /// Sets the weak parent pointer on this node.
2515 pub fn set_parent(&mut self, parent: Option<Weak<RwLock<TreeNode>>>) {
2516 match self {
2517 TreeNode::Internal(n) => n.parent = parent,
2518 TreeNode::Bottom(b) => b.parent = parent,
2519 }
2520 }
2521
2522 // ========================================================================
2523 // Log serialization
2524 // ========================================================================
2525
2526 /// Estimates the serialized byte size of this node for log/checkpoint use.
2527 ///
2528 /// `IN.getLogSize()` — Noxu-native serialization format.
2529 ///
2530 /// Format (big-endian):
2531 /// - node_id : 8 bytes
2532 /// - level : 4 bytes
2533 /// - n_entries : 4 bytes
2534 /// - dirty : 1 byte
2535 /// - For each entry:
2536 /// - key_len : 2 bytes
2537 /// - key : key_len bytes
2538 /// - lsn : 8 bytes
2539 pub fn log_size(&self) -> usize {
2540 // Fixed header: node_id(8) + level(4) + n_entries(4) + dirty(1)
2541 let mut size: usize = 8 + 4 + 4 + 1;
2542 match self {
2543 TreeNode::Internal(n) => {
2544 for entry in &n.entries {
2545 size += 2 + entry.key.len() + 8; // key_len + key + lsn
2546 }
2547 }
2548 TreeNode::Bottom(b) => {
2549 for i in 0..b.entries.len() {
2550 size += 2 + b.get_key(i).len() + 8; // key_len + key + lsn
2551 }
2552 }
2553 }
2554 size
2555 }
2556
2557 /// Serializes this node to bytes for log writing.
2558 ///
2559 /// `IN.writeToLog(ByteBuffer logBuffer)` — Noxu-native
2560 /// format matching `log_size()`.
2561 pub fn write_to_bytes(&self) -> Vec<u8> {
2562 let mut buf = Vec::with_capacity(self.log_size());
2563 match self {
2564 TreeNode::Internal(n) => {
2565 buf.extend_from_slice(&n.node_id.to_be_bytes());
2566 buf.extend_from_slice(&n.level.to_be_bytes());
2567 buf.extend_from_slice(&(n.entries.len() as u32).to_be_bytes());
2568 buf.push(n.dirty as u8);
2569 for (i, entry) in n.entries.iter().enumerate() {
2570 buf.extend_from_slice(
2571 &(entry.key.len() as u16).to_be_bytes(),
2572 );
2573 buf.extend_from_slice(&entry.key);
2574 buf.extend_from_slice(&n.get_lsn(i).as_u64().to_be_bytes());
2575 }
2576 }
2577 TreeNode::Bottom(b) => {
2578 buf.extend_from_slice(&b.node_id.to_be_bytes());
2579 buf.extend_from_slice(&b.level.to_be_bytes());
2580 buf.extend_from_slice(&(b.entries.len() as u32).to_be_bytes());
2581 buf.push(b.dirty as u8);
2582 for i in 0..b.entries.len() {
2583 let key = b.get_key(i);
2584 buf.extend_from_slice(&(key.len() as u16).to_be_bytes());
2585 buf.extend_from_slice(key);
2586 buf.extend_from_slice(&b.get_lsn(i).as_u64().to_be_bytes());
2587 }
2588 }
2589 }
2590 buf
2591 }
2592}
2593
/// Internal helper used during splits to carry the entries of either node
/// kind.
///
/// `BinStub` and `InNodeStub` store different entry types, so a common
/// wrapper is needed to pass split slices around without duplicating code.
/// In both variants the vectors are parallel: element `i` of each vector
/// describes slot `i`.
enum SplitEntries {
    /// Upper-IN entries, plus the parallel resident-child pointers (one per
    /// entry; `None` when the child is not cached) and the parallel per-slot
    /// LSNs (T-3: LSNs travel with their slots on a split, just like JE
    /// `IN.split` copies `entryLsnByteArray`/`entryLsnLongArray`).
    Internal(Vec<InEntry>, Vec<Option<ChildArc>>, Vec<Lsn>),
    /// BIN entries (metadata only), plus the parallel per-slot LSNs and the
    /// parallel FULL keys (T-2: keys live in the node-level `KeyRep`, not in
    /// `BinEntry`, so they travel as a separate `Vec<Vec<u8>>` of full keys
    /// through the split — the new BINs recompute their prefix from these).
    Bottom(Vec<BinEntry>, Vec<Lsn>, Vec<Vec<u8>>),
}
2610
2611impl SplitEntries {
2612 /// Returns the number of entries.
2613 fn len(&self) -> usize {
2614 match self {
2615 SplitEntries::Internal(v, _, _) => v.len(),
2616 SplitEntries::Bottom(v, _, _) => v.len(),
2617 }
2618 }
2619
2620 /// Returns the key at `index` as a slice.
2621 fn get_key(&self, index: usize) -> &[u8] {
2622 match self {
2623 SplitEntries::Internal(v, _, _) => v[index].key.as_slice(),
2624 SplitEntries::Bottom(_, _, k) => k[index].as_slice(),
2625 }
2626 }
2627
2628 /// Returns a sub-range `[lo, hi)` as a new `SplitEntries`.
2629 fn slice(&self, lo: usize, hi: usize) -> Self {
2630 match self {
2631 SplitEntries::Internal(v, c, l) => SplitEntries::Internal(
2632 v[lo..hi].to_vec(),
2633 c[lo..hi].to_vec(),
2634 l[lo..hi].to_vec(),
2635 ),
2636 SplitEntries::Bottom(v, l, k) => SplitEntries::Bottom(
2637 v[lo..hi].to_vec(),
2638 l[lo..hi].to_vec(),
2639 k[lo..hi].to_vec(),
2640 ),
2641 }
2642 }
2643}
2644
/// Tri-state outcome from one attempt at
/// `Tree::get_adjacent_bin_attempt`.
///
/// Distinguishes "the tree genuinely has no BIN in the requested
/// direction" (→ propagate as end-of-iteration) from "the path we
/// captured was invalidated by a concurrent split" (→ caller
/// retries from root). The distinction is necessary because the cursor
/// translates a `None` from `get_adjacent_bin` into
/// `OperationStatus::NotFound`, which is indistinguishable from a
/// real end-of-tree.
#[derive(Debug)]
enum AdjacentBinOutcome {
    /// A BIN was found in the requested direction. T-3: each slot carries its
    /// `Lsn` alongside the `BinEntry` (the LSN lives in the node's packed
    /// `LsnRep`, not in `BinEntry`, so the scan snapshot pairs them).
    /// The trailing `Vec<u8>` is presumably the slot's full key — TODO
    /// confirm against `get_adjacent_bin_attempt`.
    Found(Vec<(BinEntry, Lsn, Vec<u8>)>),
    /// The tree genuinely has no BIN in the requested direction.
    NoAdjacent,
    /// A concurrent split invalidated our captured path; the
    /// caller should retry from root.
    SplitRaceRetry,
}
2667
/// Where to place the split point, following JE's `splitSpecial` heuristic.
///
/// JE `Tree.forceSplit` records `allLeftSideDescent` / `allRightSideDescent`:
/// each is true only if **every** routing decision of the top-down descent
/// took the leftmost / rightmost child. When one of them is set at split
/// time, `IN.splitSpecial` overrides the usual `nEntries / 2` split index
/// with 1 (left side) or `nEntries - 1` (right side).
///
/// For sequential-append workloads this leaves the left BIN near-full after
/// each split (a single entry migrates to the new sibling), roughly halving
/// the number of splits and reducing write amplification.
///
/// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SplitHint {
    /// Ordinary midpoint split (`n_entries / 2`).
    Normal,
    /// The key routed through position 0 at every level of the descent.
    /// → `split_index = 1`, so a single entry is separated from the rest.
    AllLeft,
    /// The key routed through the rightmost position at every level of the
    /// descent.
    /// → `split_index = n_entries - 1`, so the left node keeps almost
    /// everything.
    AllRight,
}
2692
2693impl Tree {
2694 /// Creates a new empty tree.
2695 ///
2696 /// Constructor.
2697 pub fn new(database_id: u64, max_entries_per_node: usize) -> Self {
2698 Tree {
2699 database_id,
2700 max_entries_per_node,
2701 root: RwLock::new(None),
2702 root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2703 root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2704 root_splits: AtomicU64::new(0),
2705 relatches_required: AtomicU64::new(0),
2706 key_comparator: None,
2707 memory_counter: None,
2708 in_list_listener: None,
2709 log_manager: None,
2710 redo_capacity_hint: 0,
2711 key_prefixing: false, // JE default: KEY_PREFIXING_DEFAULT = false
2712 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2713 }
2714 }
2715
2716 /// Installs a shared memory counter for evictor / MemoryBudget feedback.
2717 ///
2718 /// → `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
2719 ///. The counter is updated on every BIN entry insert/delete.
2720 pub fn set_memory_counter(&mut self, counter: Arc<AtomicI64>) {
2721 self.memory_counter = Some(counter);
2722 }
2723
2724 /// Installs the [`InListListener`] (the evictor) so node add/access/remove
2725 /// feed the LRU lists. JE: `INList` registration that feeds
2726 /// `Evictor.addBack`/`moveBack`/`remove`.
2727 pub fn set_in_list_listener(&mut self, listener: Arc<dyn InListListener>) {
2728 self.in_list_listener = Some(listener);
2729 }
2730
2731 /// Installs the [`noxu_log::LogManager`] so an evicted root IN can be
2732 /// re-materialized from its persisted LSN on the next access (EV-14).
2733 ///
2734 /// JE: the tree reaches the log through `database.getEnv().getLogManager()`
2735 /// for `ChildReference.fetchTarget`. Noxu installs it directly.
2736 pub fn set_log_manager(&mut self, lm: Arc<noxu_log::LogManager>) {
2737 self.log_manager = Some(lm);
2738 }
2739
2740 /// Drops this tree's `Arc<LogManager>` reference (EV-14 teardown).
2741 ///
2742 /// The env's `Drop` calls this on every tree it owns so the
2743 /// `Tree -> Arc<LogManager> -> Arc<FileManager>` chain cannot keep the
2744 /// FileManager (and its on-disk exclusive lock) alive past environment
2745 /// close. After this the tree can no longer re-fetch an evicted root
2746 /// from the log — which is correct, because the environment is shutting
2747 /// down and the tree is about to be dropped.
2748 pub fn clear_log_manager(&mut self) {
2749 self.log_manager = None;
2750 }
2751
2752 /// T-5: set the compact-key threshold (`TREE_COMPACT_MAX_KEY_LENGTH` /
2753 /// `IN.getCompactMaxKeyLength`). New BINs created by this tree inherit it;
2754 /// `<= 0` disables the compact key rep. Default 16.
2755 pub fn set_compact_max_key_length(&mut self, len: i32) {
2756 self.compact_max_key_length = len;
2757 }
2758
2759 /// Notify the listener that a node became resident (JE `Evictor.addBack`).
2760 #[inline]
2761 fn note_added(&self, node_id: u64) {
2762 if let Some(l) = &self.in_list_listener {
2763 l.note_ins_added(node_id);
2764 }
2765 }
2766
2767 /// Notify the listener that a resident node was accessed
2768 /// (JE `Evictor.moveBack` — LRU touch).
2769 #[inline]
2770 fn note_accessed(&self, node_id: u64) {
2771 if let Some(l) = &self.in_list_listener {
2772 l.note_ins_accessed(node_id);
2773 }
2774 }
2775
2776 /// Notify the listener that a node was removed (JE `Evictor.remove`).
2777 #[inline]
2778 fn note_removed(&self, node_id: u64) {
2779 if let Some(l) = &self.in_list_listener {
2780 l.note_ins_removed(node_id);
2781 }
2782 }
2783
2784 /// Creates a new empty tree with a custom key comparator.
2785 ///
2786 /// Used for sorted-duplicate databases where keys are two-part
2787 /// composite keys that require a custom ordering function.
2788 ///
2789 /// Constructor with `btreeComparator` parameter.
2790 pub fn new_with_comparator(
2791 database_id: u64,
2792 max_entries_per_node: usize,
2793 comparator: KeyComparatorFn,
2794 ) -> Self {
2795 Tree {
2796 database_id,
2797 max_entries_per_node,
2798 root: RwLock::new(None),
2799 root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2800 root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2801 root_splits: AtomicU64::new(0),
2802 relatches_required: AtomicU64::new(0),
2803 key_comparator: Some(comparator),
2804 memory_counter: None,
2805 in_list_listener: None,
2806 log_manager: None,
2807 redo_capacity_hint: 0,
2808 key_prefixing: false,
2809 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2810 }
2811 }
2812
2813 /// Sets the key-prefixing flag.
2814 ///
2815 /// When `true`, BIN key-prefix compression is enabled: shared leading
2816 /// bytes are factored out of each slot's key. When `false` (the
2817 /// default), keys are stored verbatim — matching JE
2818 /// `DatabaseConfig.setKeyPrefixing(false)` / `IN.computeKeyPrefix`
2819 /// returning `null`.
2820 ///
2821 /// Ref: `IN.java computeKeyPrefix` ~line 2456.
2822 pub fn set_key_prefixing(&mut self, enabled: bool) {
2823 self.key_prefixing = enabled;
2824 }
2825
2826 /// Sets the key comparator, replacing any existing one.
2827 pub fn set_comparator(&mut self, comparator: KeyComparatorFn) {
2828 self.key_comparator = Some(comparator);
2829 }
2830
2831 /// Store a capacity hint used by `redo_insert` when it creates the first
2832 /// BIN for this tree (the first-key path).
2833 ///
2834 /// The first BIN's `entries` Vec is pre-allocated with
2835 /// `capacity.min(max_entries_per_node)` slots, eliminating the
2836 /// Vec-resize doubling cycle (1 → 2 → 4 → … → cap) that would
2837 /// otherwise occur during the redo loop.
2838 ///
2839 /// Call once before the redo loop. Has no effect on `insert` (the
2840 /// normal, non-recovery path).
2841 ///
2842 /// Wave 11-K optimisation (Fix 3).
2843 pub fn hint_redo_capacity(&mut self, capacity: usize) {
2844 self.redo_capacity_hint = capacity;
2845 }
2846
2847 /// Returns the current redo capacity hint (0 = no hint set).
2848 pub fn get_redo_capacity_hint(&self) -> usize {
2849 self.redo_capacity_hint
2850 }
2851
2852 /// Takes the key comparator out of this tree (leaving None).
2853 pub fn take_comparator(&mut self) -> Option<KeyComparatorFn> {
2854 self.key_comparator.take()
2855 }
2856
2857 /// Returns a reference to the key comparator, if configured.
2858 ///
2859 /// Used by `CursorImpl::find_bin_for_key` (R4 fix) so the cursor's own
2860 /// IN-level descent uses the same comparator-aware floor slot as the
2861 /// tree's own search paths. Mirrors JE `DatabaseImpl.getKeyComparator()`.
2862 pub fn get_comparator(&self) -> Option<&KeyComparatorFn> {
2863 self.key_comparator.as_ref()
2864 }
2865
2866 /// Returns the key comparator if set, or performs lexicographic comparison.
2867 #[inline]
2868 fn key_cmp(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering {
2869 match &self.key_comparator {
2870 Some(cmp) => cmp(a, b),
2871 None => a.cmp(b),
2872 }
2873 }
2874
2875 /// Floor child slot index for descending an internal node: the largest
2876 /// slot whose key is ≤ `key`. Slot 0 carries a virtual −∞ key (always
2877 /// qualifies); `entries[1..]` are sorted ascending, so this binary-searches
2878 /// the partition point instead of an O(n) linear walk (St-H4). Uses
2879 /// `key_cmp` so a configured custom comparator is honoured on every descent
2880 /// path. Returns 0 for an empty/single-slot node.
2881 fn upper_in_floor_index(&self, entries: &[InEntry], key: &[u8]) -> usize {
2882 if entries.len() <= 1 {
2883 return 0;
2884 }
2885 entries[1..].partition_point(|e| {
2886 self.key_cmp(e.key.as_slice(), key) != std::cmp::Ordering::Greater
2887 })
2888 }
2889
2890 /// Returns true if the tree has no root (is empty).
2891 pub fn is_empty(&self) -> bool {
2892 self.root.read().is_none()
2893 }
2894
2895 /// Sets the root of the tree.
2896 ///
2897 /// Must hold root_latch exclusively before calling.
2898 pub fn set_root(&self, node: TreeNode) {
2899 *self.root.write() = Some(Arc::new(RwLock::new(node)));
2900 }
2901
2902 /// Returns the root Arc, if any.
2903 ///
2904 /// Returns a cloned `Arc` rather than a reference so the caller does not
2905 /// hold the inner `RwLock` guard.
2906 ///
2907 /// EV-14: when the in-memory root has been evicted (`evict_root`) but a
2908 /// persisted version exists (`root_log_lsn` set), this re-materializes it
2909 /// from the log before returning — the faithful equivalent of JE
2910 /// `Tree.getRootIN` always calling `root.fetchTarget(...)`. Returns
2911 /// `None` only for a genuinely empty tree (no resident root and no
2912 /// persisted root LSN).
2913 pub fn get_root(&self) -> Option<Arc<RwLock<TreeNode>>> {
2914 if let Some(r) = self.root.read().clone() {
2915 return Some(r);
2916 }
2917 // Root not resident: re-fetch it from `root_log_lsn` if one exists
2918 // (a no-op returning None when the tree was never populated).
2919 self.fetch_root_from_log()
2920 }
2921
2922 /// Returns the database ID.
2923 pub fn get_database_id(&self) -> u64 {
2924 self.database_id
2925 }
2926
2927 /// Count the total number of live (non-deleted) entries across all BINs.
2928 ///
2929 /// Used by `DatabaseImpl::set_recovered_tree()` to initialise the
2930 /// per-database `entry_count` AtomicU64 after recovery replays the log.
2931 pub fn count_entries(&self) -> u64 {
2932 let mut total = 0u64;
2933 if let Some(root) = self.get_root() {
2934 Self::count_entries_recursive(&root, &mut total);
2935 }
2936 total
2937 }
2938
2939 /// DBI-14: collect every live `(full_key, data, lsn)` triple in physical
2940 /// (left-to-right) order. Used by `resort_under_comparator` to rebuild a
2941 /// tree whose slots were laid out in byte order (e.g. by recovery redo,
2942 /// which has no access to the application comparator) under the real
2943 /// configured comparator.
2944 fn collect_all_entries(&self) -> Vec<(Vec<u8>, Vec<u8>, Lsn)> {
2945 let mut out = Vec::new();
2946 if let Some(root) = self.get_root() {
2947 Self::collect_all_entries_recursive(&root, &mut out);
2948 }
2949 out
2950 }
2951
2952 fn collect_all_entries_recursive(
2953 node_arc: &Arc<RwLock<TreeNode>>,
2954 out: &mut Vec<(Vec<u8>, Vec<u8>, Lsn)>,
2955 ) {
2956 let guard = node_arc.read();
2957 match &*guard {
2958 TreeNode::Bottom(b) => {
2959 for i in 0..b.entries.len() {
2960 if b.entries[i].known_deleted {
2961 continue;
2962 }
2963 if let Some(fk) = b.get_full_key(i) {
2964 let data =
2965 b.entries[i].data.clone().unwrap_or_default();
2966 out.push((fk, data, b.get_lsn(i)));
2967 }
2968 }
2969 }
2970 TreeNode::Internal(n) => {
2971 let children: Vec<Arc<RwLock<TreeNode>>> =
2972 n.resident_children();
2973 drop(guard);
2974 for child in &children {
2975 Self::collect_all_entries_recursive(child, out);
2976 }
2977 }
2978 }
2979 }
2980
2981 /// DBI-14: rebuild this tree so that its on-disk byte-ordered slot layout
2982 /// is re-sorted under the currently-configured key comparator.
2983 ///
2984 /// Recovery redo (`redo_insert`) has no access to the application's
2985 /// comparator function — only the persisted identity — so it lays keys
2986 /// out in unsigned-byte order. After `set_recovered_tree` attaches the
2987 /// real comparator, the slots must be re-sorted, or comparator-driven
2988 /// searches would binary-search a tree ordered by the wrong relation.
2989 ///
2990 /// No-op when no comparator is configured (byte order already matches the
2991 /// recovered layout) or when the tree is empty. Mirrors the effect of
2992 /// JE reconstructing the comparator at open and the tree always having
2993 /// been built under it.
2994 pub fn resort_under_comparator(&self) {
2995 if self.key_comparator.is_none() {
2996 return;
2997 }
2998 let entries = self.collect_all_entries();
2999 if entries.is_empty() {
3000 return;
3001 }
3002 // Drop the current root; re-insert every entry through the normal
3003 // comparator-aware insert path so the new layout obeys the comparator.
3004 *self.root.write() = None;
3005 *self.root_log_lsn.write() = noxu_util::NULL_LSN;
3006 for (key, data, lsn) in entries {
3007 // Best-effort: a failed re-insert would be a tree-structure bug;
3008 // surface it loudly in debug builds.
3009 let r = self.insert(key, data, lsn);
3010 debug_assert!(
3011 r.is_ok(),
3012 "resort_under_comparator: re-insert failed: {r:?}"
3013 );
3014 }
3015 }
3016
3017 fn count_entries_recursive(
3018 node_arc: &Arc<RwLock<TreeNode>>,
3019 total: &mut u64,
3020 ) {
3021 let guard = node_arc.read();
3022 match &*guard {
3023 TreeNode::Bottom(b) => {
3024 // Count only live (non-known_deleted) entries.
3025 *total += b.entries.iter().filter(|e| !e.known_deleted).count()
3026 as u64;
3027 }
3028 TreeNode::Internal(n) => {
3029 let children: Vec<Arc<RwLock<TreeNode>>> =
3030 n.resident_children();
3031 drop(guard);
3032 for child in children {
3033 Self::count_entries_recursive(&child, total);
3034 }
3035 }
3036 }
3037 }
3038
3039 /// Sum the real in-memory heap footprint of every resident node in the
3040 /// tree (DBI-23 oracle / reconciliation), in bytes.
3041 ///
3042 /// Walks all resident IN/BIN nodes and adds each node's
3043 /// `budgeted_memory_size` (JE `IN.getBudgetedMemorySize`). This is the
3044 /// authoritative "real heap" figure the incrementally-maintained
3045 /// `memory_counter` is meant to approximate; an engine can call it to
3046 /// reconcile counter drift, and the DBI-23 test uses it as the oracle the
3047 /// live counter must stay within tolerance of.
3048 pub fn total_budgeted_memory(&self) -> u64 {
3049 let mut total = 0u64;
3050 if let Some(root) = self.get_root() {
3051 Self::total_budgeted_memory_recursive(&root, &mut total);
3052 }
3053 total
3054 }
3055
3056 fn total_budgeted_memory_recursive(
3057 node_arc: &Arc<RwLock<TreeNode>>,
3058 total: &mut u64,
3059 ) {
3060 let guard = node_arc.read();
3061 *total += guard.budgeted_memory_size();
3062 if let TreeNode::Internal(n) = &*guard {
3063 let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
3064 drop(guard);
3065 for child in children {
3066 Self::total_budgeted_memory_recursive(&child, total);
3067 }
3068 }
3069 }
3070
3071 /// Search for a BIN that should contain the given key.
3072 ///
3073 /// This is the core tree traversal operation. It walks from root to BIN
3074 /// using latch-coupling (acquire child latch, then release parent latch).
3075 ///
3076 /// . Descends the tree until a BIN is
3077 /// reached, following the child pointer at the slot whose key is the
3078 /// largest key <= the search key (the "LTE" rule). Slot 0 in every upper
3079 /// IN carries a virtual key (-infinity) so any search key routes through
3080 /// it when all real keys are larger.
3081 ///
3082 /// Returns a SearchResult indicating where the key is or should be.
3083 /// Returns None if tree is empty.
3084 pub fn search(&self, key: &[u8]) -> Option<SearchResult> {
3085 let root = self.get_root()?;
3086
3087 // Hand-over-hand latch coupling for the descent. At each level we
3088 // hold a `parking_lot::ArcRwLockReadGuard` on the current node;
3089 // before dropping it, we acquire the child's read guard via
3090 // `Arc::read_arc`. This keeps a continuous chain of read locks
3091 // along the descent path so that no concurrent `split_child(parent,
3092 // …)` can run on a node we are about to enter — `split_child` takes
3093 // `parent.write()` to install the new sibling, and that write
3094 // blocks while we hold `parent.read()`. Without this, the prior
3095 // pattern (capture child Arc, drop parent guard, then take child
3096 // read lock) left a window in which a split could relocate the
3097 // child entries: a search for a key that should have ended up in
3098 // the new sibling would instead reach the (now left-half) child
3099 // and return a false `NotFound`.
3100 //
3101 // `read_arc()` returns `ArcRwLockReadGuard<RawRwLock, TreeNode>`
3102 // — a guard that owns its own Arc reference, so it has no
3103 // borrow lifetime and can be held across loop iterations and
3104 // assignment.
3105 let mut guard: parking_lot::ArcRwLockReadGuard<
3106 parking_lot::RawRwLock,
3107 TreeNode,
3108 > = root.read_arc();
3109
3110 loop {
3111 if guard.is_bin() {
3112 // JE: IN.fetchTarget / CursorImpl access moves the reached
3113 // BIN toward the hot end of the evictor's LRU list
3114 // (Evictor.moveBack). A freshly split BIN that has not yet
3115 // been registered is added here (moveBack is add-if-absent).
3116 if let TreeNode::Bottom(bin) = &*guard {
3117 self.note_accessed(bin.node_id);
3118 }
3119 // Reached a BIN: final key lookup within the same guard.
3120 // Use indicate_if_duplicate=true so an exact match sets
3121 // EXACT_MATCH in the return value. Guard against -1 (not
3122 // found): -1i32 has all bits set, so the naive
3123 // `index & EXACT_MATCH != 0` check would incorrectly report
3124 // an exact match for a missing key.
3125 let (found, raw_idx) = match &*guard {
3126 TreeNode::Bottom(bin) => match &self.key_comparator {
3127 Some(cmp) => {
3128 let (idx, exact) =
3129 bin.find_entry_cmp(key, cmp.as_ref());
3130 (exact, idx as i32)
3131 }
3132 None => {
3133 let index = guard.find_entry(key, true, true);
3134 let exact =
3135 index >= 0 && (index & EXACT_MATCH != 0);
3136 (exact, index & 0xFFFF)
3137 }
3138 },
3139 _ => {
3140 let index = guard.find_entry(key, true, true);
3141 let exact = index >= 0 && (index & EXACT_MATCH != 0);
3142 (exact, index & 0xFFFF)
3143 }
3144 };
3145 // CursorImpl.isProbablyExpired(): if an exact match
3146 // was found, check whether the entry's TTL has already elapsed.
3147 // If it has, treat the slot as not found so callers skip it.
3148 //
3149 // TREE-F1: also treat a known_deleted slot as ABSENT on an
3150 // exact lookup, mirroring the tail of IN.findEntry
3151 // (IN.java:3197): `if (ret >= 0 && exact &&
3152 // isEntryKnownDeleted(ret & 0xffff)) return -1;`. KD slots
3153 // legitimately exist in live BINs during BIN-delta
3154 // reconstitution until the compressor reclaims them.
3155 let found = if found {
3156 if let TreeNode::Bottom(bin) = &*guard {
3157 let idx = (raw_idx & 0x7FFF) as usize;
3158 bin.slot_is_live(idx)
3159 } else {
3160 found
3161 }
3162 } else {
3163 found
3164 };
3165 return Some(SearchResult::with_values(found, raw_idx, false));
3166 }
3167
3168 // Upper IN: find the child slot with the largest key <= search
3169 // key, and capture the child Arc WHILE HOLDING the guard.
3170 // Slot 0 has a virtual key that compares as -infinity.
3171 let parent_arc =
3172 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3173 let next_arc = match &*guard {
3174 TreeNode::Internal(n) => {
3175 if n.entries.is_empty() {
3176 return None;
3177 }
3178 // Walk forward as long as entry.key <= key, starting
3179 // from slot 0 (which always qualifies because its key
3180 // is the virtual -infinity key).
3181 let idx = self.upper_in_floor_index(&n.entries, key);
3182 match n.get_child(idx) {
3183 // Resident child: keep the hand-over-hand fast path.
3184 Some(c) => {
3185 let next_guard = c.read_arc();
3186 drop(guard);
3187 guard = next_guard;
3188 continue;
3189 }
3190 // EV-14/EV-13: child evicted — re-fetch it from its
3191 // slot LSN (JE ChildReference.fetchTarget). Must
3192 // drop the parent read guard to upgrade to a write
3193 // latch inside child_at_or_fetch.
3194 None => idx,
3195 }
3196 }
3197 TreeNode::Bottom(_) => {
3198 unreachable!("is_bin() returned false above")
3199 }
3200 };
3201 drop(guard);
3202 let child = self.child_at_or_fetch(&parent_arc, next_arc)?;
3203 guard = child.read_arc();
3204 }
3205 }
3206
3207 /// Combined search-and-fetch: descend once to the BIN and return the
3208 /// slot's data together with a reference to the BIN arc.
3209 ///
3210 /// Replaces the previous three-descent sequence on the `Database::get`
3211 /// hot path:
3212 /// 1. `Tree::search` — existence check only.
3213 /// 2. `CursorImpl::get_data_from_tree` — re-descended to fetch data.
3214 /// 3. `CursorImpl::find_bin_for_key` — re-descended for BIN pinning.
3215 ///
3216 /// One descent now does all three jobs. At the BIN level it uses the
3217 /// existing binary-search helper `find_entry_compressed` instead of the
3218 /// O(n) `iter().find()` used by `get_data_from_tree`.
3219 ///
3220 /// Returns `None` only when the tree is empty. Otherwise returns
3221 /// `Some(SlotFetch)` — callers must inspect `SlotFetch::found` to
3222 /// determine whether the key was present. The BIN read-guard is released
3223 /// before this method returns so callers may safely call `lock_ln`
3224 /// (which may block) without holding any tree latch.
3225 ///
3226 /// Wave-11-I — see the 2026 review.
3227 pub fn search_with_data(&self, key: &[u8]) -> Option<SlotFetch> {
3228 let root = self.get_root()?;
3229 let mut guard: parking_lot::ArcRwLockReadGuard<
3230 parking_lot::RawRwLock,
3231 TreeNode,
3232 > = root.read_arc();
3233
3234 loop {
3235 if guard.is_bin() {
3236 // Capture the BIN Arc before inspecting entries.
3237 let bin_arc =
3238 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3239
3240 let (found, data, lsn, slot_index) = match &*guard {
3241 TreeNode::Bottom(bin) => {
3242 let (idx, exact) = match &self.key_comparator {
3243 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3244 None => bin.find_entry_compressed(key),
3245 };
3246 if exact {
3247 // TREE-F1: a slot is reported as found only when
3248 // live (not known_deleted, not TTL-expired) — the
3249 // same predicate used by Tree::search and the
3250 // cursor scan. Mirrors IN.findEntry (IN.java:3197)
3251 // and CursorImpl.isProbablyExpired.
3252 if bin.slot_is_live(idx) {
3253 let lsn = bin.get_lsn(idx); // T-3
3254 let e = &bin.entries[idx];
3255 (true, e.data.clone(), lsn.as_u64(), idx)
3256 } else {
3257 (false, None, 0u64, 0)
3258 }
3259 } else {
3260 (false, None, 0u64, 0)
3261 }
3262 }
3263 _ => (false, None, 0u64, 0),
3264 };
3265 // Release the BIN read guard before returning so the caller
3266 // can call lock_ln (which may block) without holding a latch.
3267 drop(guard);
3268 return Some(SlotFetch {
3269 found,
3270 data,
3271 lsn,
3272 slot_index,
3273 bin_arc,
3274 });
3275 }
3276
3277 // Upper IN: same hand-over-hand descent as `Tree::search`.
3278 let parent_arc =
3279 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3280 let next_idx = match &*guard {
3281 TreeNode::Internal(n) => {
3282 if n.entries.is_empty() {
3283 return None;
3284 }
3285 // Slot 0 = virtual −∞; walk forward while entry.key ≤ key.
3286 let idx = self.upper_in_floor_index(&n.entries, key);
3287 match n.get_child(idx) {
3288 Some(c) => {
3289 let next_guard = c.read_arc();
3290 drop(guard);
3291 guard = next_guard;
3292 continue;
3293 }
3294 // EV-14/EV-13: re-fetch an evicted child from its LSN.
3295 None => idx,
3296 }
3297 }
3298 TreeNode::Bottom(_) => {
3299 unreachable!("is_bin() returned false above")
3300 }
3301 };
3302 drop(guard);
3303 let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3304 guard = child.read_arc();
3305 }
3306 }
3307
3308 /// Sets the expiration time (in absolute hours since Unix epoch) for an
3309 /// existing key's BIN slot.
3310 ///
3311 /// Returns `true` if the key was found and updated, `false` otherwise.
3312 ///
3313 /// Used by `Database::put_with_options()` to apply per-record TTL.
3314 /// `IN.entryExpiration` / `BIN.expirationInHours` path.
3315 pub fn update_key_expiration(
3316 &self,
3317 key: &[u8],
3318 expiration_hours: u32,
3319 ) -> bool {
3320 let root = match self.get_root() {
3321 Some(r) => r,
3322 None => return false,
3323 };
3324 // Hand-over-hand latch coupling for the descent. At the BIN we
3325 // need a write lock; we drop our read lock first and take the
3326 // write lock under the protection of the *outer* parent's read
3327 // lock (held by the previous loop iteration's guard). For the
3328 // first iteration there is no outer parent, but no `split_child`
3329 // can run on the root itself in that single-level case because
3330 // root splits go through `split_root_if_needed` which holds
3331 // `self.root.write()`. So the worst case is that the root is
3332 // promoted from a single BIN to a level-2 IN between our read
3333 // detect and our write — handled by the `is_bin` re-check
3334 // inside the write lock.
3335 //
3336 // We retry the descent up to a small bound to absorb the rare
3337 // case where a concurrent split moved this key into the new
3338 // sibling between the read-chain release and the write-lock
3339 // acquisition. Without the retry, the sole caller
3340 // (`Database::put_with_options`) would silently lose the TTL
3341 // for the affected key. Three attempts is generous: each
3342 // retry only races a single split and splits are infrequent.
3343 for _ in 0..3 {
3344 let mut guard: parking_lot::ArcRwLockReadGuard<
3345 parking_lot::RawRwLock,
3346 TreeNode,
3347 > = root.read_arc();
3348 let bin_arc;
3349 loop {
3350 if guard.is_bin() {
3351 bin_arc =
3352 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3353 drop(guard);
3354 break;
3355 }
3356 let next_arc = match &*guard {
3357 TreeNode::Internal(n) => {
3358 if n.entries.is_empty() {
3359 return false;
3360 }
3361 let idx = self.upper_in_floor_index(&n.entries, key);
3362 match n.get_child(idx) {
3363 Some(c) => c,
3364 None => return false,
3365 }
3366 }
3367 TreeNode::Bottom(_) => unreachable!(),
3368 };
3369 let next_guard = next_arc.read_arc();
3370 drop(guard);
3371 guard = next_guard;
3372 }
3373
3374 // Now take the write lock on the BIN we descended to.
3375 let mut wguard = bin_arc.write();
3376 if let TreeNode::Bottom(bin) = &mut *wguard {
3377 let slot = if let Some(cmp) = &self.key_comparator {
3378 let (idx, exact) = bin.find_entry_cmp(key, cmp.as_ref());
3379 if exact { Some(idx) } else { None }
3380 } else {
3381 let (idx, exact) = bin.find_entry_compressed(key);
3382 if exact { Some(idx) } else { None }
3383 };
3384 if let Some(slot_idx) = slot
3385 && let Some(entry) = bin.entries.get_mut(slot_idx)
3386 {
3387 entry.expiration_time = expiration_hours;
3388 bin.expiration_in_hours = true;
3389 bin.dirty = true;
3390 return true;
3391 }
3392 }
3393 // Key not in this BIN — either it was never present or a
3394 // concurrent split moved it. Retry the descent; at most a
3395 // few iterations are needed to follow the key into its new
3396 // BIN.
3397 }
3398 false
3399 }
3400
3401 /// Returns the key and data of the first BIN entry at or after `key`.
3402 ///
3403 /// Descends with the tree's key comparator (same path as `search()`), then
3404 /// within the BIN finds the first slot whose stored key >= `key` using the
3405 /// comparator. Returns `None` if every entry in the tree is < `key`.
3406 ///
3407 /// Used by sorted-duplicate cursor `search(Set)` to position at the first
3408 /// (key, data) pair whose two-part key >= `lower_bound(primary_key)`.
3409 ///
3410 /// → BIN scan path.
3411 pub fn first_entry_at_or_after(
3412 &self,
3413 key: &[u8],
3414 ) -> Option<(Vec<u8>, Vec<u8>, u64)> {
3415 // Hand-over-hand latch coupling — see Tree::search for the
3416 // detailed rationale on why this closes a reader-vs-splitter
3417 // race window.
3418 let mut guard: parking_lot::ArcRwLockReadGuard<
3419 parking_lot::RawRwLock,
3420 TreeNode,
3421 > = self.get_root()?.read_arc();
3422
3423 loop {
3424 if guard.is_bin() {
3425 let result = match &*guard {
3426 TreeNode::Bottom(bin) => {
3427 let (mut idx, _exact) = match &self.key_comparator {
3428 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3429 None => bin.find_entry_compressed(key),
3430 };
3431 // TREE-F1: skip non-live slots (known_deleted /
3432 // TTL-expired) at/after the floor index, mirroring the
3433 // cursor getNext skip (CursorImpl.java:2062-2064).
3434 while idx < bin.entries.len() && !bin.slot_is_live(idx)
3435 {
3436 idx += 1;
3437 }
3438 if idx < bin.entries.len() {
3439 let full_key =
3440 bin.get_full_key(idx).unwrap_or_default();
3441 let data = bin.entries[idx]
3442 .data
3443 .clone()
3444 .unwrap_or_default();
3445 let lsn = bin.get_lsn(idx).as_u64(); // T-3
3446 Some((full_key, data, lsn))
3447 } else {
3448 None
3449 }
3450 }
3451 _ => None,
3452 };
3453 return result;
3454 }
3455
3456 // Upper IN: same descent as search().
3457 let parent_arc =
3458 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3459 let next_idx = match &*guard {
3460 TreeNode::Internal(n) => {
3461 if n.entries.is_empty() {
3462 return None;
3463 }
3464 let idx = self.upper_in_floor_index(&n.entries, key);
3465 match n.get_child(idx) {
3466 Some(c) => {
3467 let next_guard = c.read_arc();
3468 drop(guard);
3469 guard = next_guard;
3470 continue;
3471 }
3472 None => idx, // EV-14/EV-13: re-fetch below.
3473 }
3474 }
3475 TreeNode::Bottom(_) => unreachable!(),
3476 };
3477 drop(guard);
3478 let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3479 guard = child.read_arc();
3480 }
3481 }
3482
3483 /// Like [`Tree::first_entry_at_or_after`] but also returns the BIN node
3484 /// (so callers may pin it) and the entry's slot index inside that
3485 /// BIN.
3486 ///
3487 /// Wave 11-N (Bug 2): `CursorImpl::search_dup` previously stored
3488 /// `current_index = 0` after a sorted-dup `Search`, which broke the
3489 /// fast-path of `retrieve_next` (and the slow path's
3490 /// `next_index = current_index + 1` arithmetic) for any primary
3491 /// that was not the first slot of its BIN. This helper hands back
3492 /// the real index so the cursor can be positioned correctly.
3493 ///
3494 /// CC-2 fix: uses the same `read_arc()` hand-over-hand latch coupling
3495 /// as every other descent method (`search`, `first_entry_at_or_after`,
3496 /// `get_first_node`, `get_adjacent_bin_attempt`). The original
3497 /// implementation did `arc.read().is_bin()` (lock acquired and released)
3498 /// then a SECOND `arc.read()` on the next line — a gap in which a
3499 /// concurrent split can promote the node (BIN→upper IN) or move the
3500 /// sought key to a new sibling, yielding a false "not found" for an
3501 /// existing key. Mirrors JE `Tree.searchSubTree` / `Tree.search`
3502 /// which hold the latch across the `is_bin()` test and the subsequent
3503 /// entry lookup.
3504 pub fn first_entry_at_or_after_with_index(
3505 &self,
3506 key: &[u8],
3507 ) -> Option<(
3508 Vec<u8>,
3509 Vec<u8>,
3510 usize,
3511 u64,
3512 std::sync::Arc<crate::NodeRwLock<TreeNode>>,
3513 )> {
3514 // Hand-over-hand latch coupling — identical strategy to
3515 // first_entry_at_or_after; the guard is held continuously across
3516 // is_bin() and the subsequent entry lookup so no split can
3517 // restructure the path between the two observations.
3518 let mut guard: parking_lot::ArcRwLockReadGuard<
3519 parking_lot::RawRwLock,
3520 TreeNode,
3521 > = self.get_root()?.read_arc();
3522 loop {
3523 if guard.is_bin() {
3524 if let TreeNode::Bottom(bin) = &*guard {
3525 let (idx, _exact) = match &self.key_comparator {
3526 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3527 None => bin.find_entry_compressed(key),
3528 };
3529 // TREE-F1: skip non-live slots (known_deleted /
3530 // TTL-expired) at/after the floor index
3531 // (CursorImpl.java:2062-2064).
3532 let mut idx = idx;
3533 while idx < bin.entries.len() && !bin.slot_is_live(idx) {
3534 idx += 1;
3535 }
3536 if idx < bin.entries.len() {
3537 let full_key =
3538 bin.get_full_key(idx).unwrap_or_default();
3539 let data =
3540 bin.entries[idx].data.clone().unwrap_or_default();
3541 let lsn = bin.get_lsn(idx).as_u64(); // T-3
3542 // Obtain the Arc for the BIN node the guard came from.
3543 // `ArcRwLockReadGuard::rwlock()` returns the backing Arc.
3544 let bin_arc =
3545 parking_lot::ArcRwLockReadGuard::rwlock(&guard)
3546 .clone();
3547 return Some((full_key, data, idx, lsn, bin_arc));
3548 } else {
3549 return None;
3550 }
3551 }
3552 return None;
3553 }
3554
3555 // Upper IN: descend as in first_entry_at_or_after / search.
3556 let parent_arc =
3557 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3558 let next_idx = match &*guard {
3559 TreeNode::Internal(n) => {
3560 if n.entries.is_empty() {
3561 return None;
3562 }
3563 let idx = self.upper_in_floor_index(&n.entries, key);
3564 match n.get_child(idx) {
3565 Some(c) => {
3566 let next_guard = c.read_arc();
3567 drop(guard);
3568 guard = next_guard;
3569 continue;
3570 }
3571 None => idx, // EV-14/EV-13: re-fetch below.
3572 }
3573 }
3574 TreeNode::Bottom(_) => unreachable!(),
3575 };
3576 drop(guard);
3577 let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3578 guard = child.read_arc();
3579 }
3580 }
3581
3582 /// Insert a key/data pair into the tree.
3583 ///
3584 /// . Handles the root-is-null case by
3585 /// creating a two-level tree (upper IN + BIN) per initialisation path,
3586 /// then delegates to `insert_recursive` which performs preemptive splitting
3587 /// as it descends.
3588 ///
3589 /// Returns Ok(true) if this was a new insert, Ok(false) if it was an update.
3590 pub fn insert(
3591 &self,
3592 key: Vec<u8>,
3593 data: Vec<u8>,
3594 lsn: Lsn,
3595 ) -> Result<bool, TreeError> {
3596 // Save sizes before potentially moving key/data — needed for memory tracking.
3597 let key_len = key.len();
3598 let data_len = data.len();
3599
3600 // First-key path. We MUST hold the write lock while testing
3601 // root.is_none() and replacing the root, otherwise N threads can all
3602 // observe an empty tree, each build a fresh single-entry root, and
3603 // the last writer's `*self.root.write() = Some(...)` silently
3604 // discards the others' inserts. (Reproducer:
3605 // xa_protocol_test::test_concurrent_independent_xids — 8 threads
3606 // each inserting their own key into an empty tree lost ~30% of
3607 // inserts before this lock change.)
3608 {
3609 let mut root_guard = self.root.write();
3610 if root_guard.is_none() {
3611 let bin_node_id = generate_node_id();
3612 let root_node_id = generate_node_id();
3613 let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3614 node_id: bin_node_id,
3615 level: BIN_LEVEL,
3616 entries: vec![BinEntry {
3617 data: Some(data),
3618 known_deleted: false,
3619 dirty: false,
3620 expiration_time: 0,
3621 }],
3622 key_prefix: Vec::new(), // single entry — no common prefix yet
3623 dirty: true,
3624 is_delta: false,
3625 last_full_lsn: NULL_LSN,
3626 last_delta_lsn: NULL_LSN,
3627 generation: 0,
3628 parent: None, // set below after root_in is created
3629 // St-H6: use true to match the engine-wide invariant that
3630 // every BIN which may hold TTL entries uses hours granularity
3631 // (JE BIN.java default; matches tree.rs:980 and read_from_log).
3632 expiration_in_hours: true,
3633 cursor_count: 0,
3634 prohibit_next_delta: false,
3635 lsn_rep: LsnRep::from_lsns(&[lsn]),
3636 keys: KeyRep::from_keys(vec![key]), // T-2
3637 compact_max_key_length: self.compact_max_key_length,
3638 })));
3639
3640 // Upper IN at level 2; slot 0 uses an empty key (virtual root key).
3641 let root_arc =
3642 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3643 node_id: root_node_id,
3644 level: MAIN_LEVEL | 2,
3645 entries: vec![InEntry {
3646 key: vec![], // virtual key for slot 0 in upper IN
3647 }],
3648 // T-4: the single resident child at slot 0.
3649 targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3650 dirty: true,
3651 generation: 0,
3652 parent: None,
3653 lsn_rep: LsnRep::from_lsns(&[lsn]),
3654 })));
3655
3656 // Wire the BIN's parent pointer back to the root IN.
3657 {
3658 let mut g = bin.write();
3659 g.set_parent(Some(Arc::downgrade(&root_arc)));
3660 }
3661
3662 *root_guard = Some(root_arc);
3663
3664 // JE: IN.fetchTarget / initial tree build registers the new
3665 // resident nodes with the evictor (Evictor.addBack).
3666 self.note_added(root_node_id);
3667 self.note_added(bin_node_id);
3668
3669 // Count the first entry.
3670 if let Some(counter) = &self.memory_counter {
3671 let delta =
3672 (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3673 counter.fetch_add(delta, Ordering::Relaxed);
3674 }
3675 return Ok(true);
3676 }
3677 // Another thread initialized the root while we were waiting for
3678 // the write lock; fall through and insert into the existing tree.
3679 }
3680
3681 // Check whether the root itself needs to be split before descending.
3682 // Tree.searchSplitsAllowed(): if rootIN.needsSplitting()
3683 // call splitRoot first.
3684 self.split_root_if_needed(lsn)?;
3685
3686 // Recursively insert, splitting children proactively as we descend
3687 // (forceSplit / searchSplitsAllowed pattern).
3688 let root_arc = self.get_root().unwrap();
3689 let result = Self::insert_recursive(
3690 &root_arc,
3691 key,
3692 data,
3693 lsn,
3694 self.max_entries_per_node,
3695 self.key_comparator.as_ref(),
3696 self.key_prefixing,
3697 self.in_list_listener.as_ref(),
3698 )?;
3699
3700 // Update the memory counter for new inserts.
3701 // IN.updateMemorySize(delta) → MemoryBudget.updateTreeMemoryUsage(delta).
3702 // LN_OVERHEAD = 48 bytes (approximate fixed overhead per entry).
3703 if result && let Some(counter) = &self.memory_counter {
3704 let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3705 counter.fetch_add(delta, Ordering::Relaxed);
3706 }
3707
3708 Ok(result)
3709 }
3710
3711 /// Recovery-redo variant of [`Tree::insert`] that accepts `&[u8]` slices.
3712 ///
3713 /// Eliminates the two intermediate `Vec<u8>` allocations that the normal
3714 /// insert path requires at the `redo_ln` call site (one for the key, one
3715 /// for the data). The compressed key suffix and the data bytes are each
3716 /// materialised into their `BinEntry` slots exactly once.
3717 ///
3718 /// Semantics are identical to `insert`:
3719 /// - Updates the existing slot when the key is already present.
3720 /// - Inserts a new sorted entry when the key is absent.
3721 /// - Triggers the same root-split and proactive-split logic.
3722 ///
3723 /// `data` should be the raw value bytes, or an empty slice for a
3724 /// deletion (which should not normally arrive here during redo, but is
3725 /// handled gracefully).
3726 ///
3727 /// Wave 11-K optimisation (Fix 1).
3728 pub fn redo_insert(
3729 &self,
3730 key: &[u8],
3731 data: &[u8],
3732 lsn: Lsn,
3733 ) -> Result<bool, TreeError> {
3734 let key_len = key.len();
3735 let data_len = data.len();
3736 let data_opt: Option<&[u8]> =
3737 if data.is_empty() { None } else { Some(data) };
3738
3739 // First-key path: initialise a two-level tree from scratch.
3740 {
3741 let mut root_guard = self.root.write();
3742 if root_guard.is_none() {
3743 // Pre-allocate the BIN's entries Vec using the redo capacity
3744 // hint (Fix 3). Without the hint the first BIN starts at
3745 // capacity 1 and doubles on each insert; with the hint it
3746 // starts at min(hint, max_entries) entries, eliminating
3747 // ~log2(max_entries) Vec-resize doublings.
3748 let initial_cap = if self.redo_capacity_hint > 0 {
3749 self.redo_capacity_hint.min(self.max_entries_per_node)
3750 } else {
3751 1
3752 };
3753 let mut initial_entries = Vec::with_capacity(initial_cap);
3754 initial_entries.push(BinEntry {
3755 data: data_opt.map(|d| d.to_vec()),
3756 known_deleted: false,
3757 dirty: false,
3758 expiration_time: 0,
3759 });
3760 let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3761 node_id: generate_node_id(),
3762 level: BIN_LEVEL,
3763 entries: initial_entries,
3764 key_prefix: Vec::new(),
3765 dirty: true,
3766 is_delta: false,
3767 last_full_lsn: NULL_LSN,
3768 last_delta_lsn: NULL_LSN,
3769 generation: 0,
3770 parent: None,
3771 // St-H6: use true to match the engine-wide hours-only
3772 // invariant (JE BIN.java default; matches tree.rs:980).
3773 expiration_in_hours: true,
3774 cursor_count: 0,
3775 prohibit_next_delta: false,
3776 lsn_rep: LsnRep::from_lsns(&[lsn]),
3777 keys: KeyRep::from_keys(vec![key.to_vec()]), // T-2
3778 compact_max_key_length: self.compact_max_key_length,
3779 })));
3780
3781 let root_arc =
3782 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3783 node_id: generate_node_id(),
3784 level: MAIN_LEVEL | 2,
3785 entries: vec![InEntry { key: vec![] }],
3786 // T-4: the single resident child at slot 0.
3787 targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3788 dirty: true,
3789 generation: 0,
3790 parent: None,
3791 lsn_rep: LsnRep::from_lsns(&[lsn]),
3792 })));
3793
3794 {
3795 let mut g = bin.write();
3796 g.set_parent(Some(Arc::downgrade(&root_arc)));
3797 }
3798
3799 *root_guard = Some(root_arc);
3800
3801 if let Some(counter) = &self.memory_counter {
3802 let delta =
3803 (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3804 counter.fetch_add(delta, Ordering::Relaxed);
3805 }
3806 return Ok(true);
3807 }
3808 }
3809
3810 self.split_root_if_needed(lsn)?;
3811
3812 let root_arc = self.get_root().unwrap();
3813 let result = Self::redo_insert_recursive(
3814 &root_arc,
3815 key,
3816 data_opt,
3817 lsn,
3818 self.max_entries_per_node,
3819 self.key_comparator.as_ref(),
3820 self.key_prefixing,
3821 )?;
3822
3823 if result && let Some(counter) = &self.memory_counter {
3824 let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3825 counter.fetch_add(delta, Ordering::Relaxed);
3826 }
3827
3828 Ok(result)
3829 }
3830
3831 /// Splits the root node if it is full (needsSplitting).
3832 ///
3833 ///
3834 /// ```text
3835 /// 1. Save oldRoot (the current root IN or BIN).
3836 /// 2. Create newRoot at oldRoot.level + 1.
3837 /// 3. Insert oldRoot into newRoot at slot 0 with a virtual (empty) key.
3838 /// 4. Call split_node on oldRoot, passing newRoot as parent.
3839 /// 5. Replace tree root with newRoot.
3840 /// ```
3841 fn split_root_if_needed(&self, lsn: Lsn) -> Result<(), TreeError> {
3842 // Hold `self.root.write()` across the needs_split check and the
3843 // root promotion, mirroring the first-key path fix and matching
3844 // the broader insert/split serialisation discipline.
3845 //
3846 // With the previous read-then-write pattern, two concurrent
3847 // splitters could each observe needs_split == true, then take()
3848 // and install in turn, with the second wrapping the first's
3849 // already-promoted root in its own new IN. Each level wraps the
3850 // previous, producing a chain of one-child internal nodes. No
3851 // data is lost (every entry is still reachable) but the tree
3852 // becomes unnecessarily deep, and the imbalance can compound
3853 // under heavy concurrent insertion.
3854 let mut root_guard = self.root.write();
3855 let needs_split = match root_guard.as_ref() {
3856 Some(arc) => {
3857 let g = arc.read();
3858 g.get_n_entries() >= self.max_entries_per_node
3859 }
3860 None => false,
3861 };
3862 if !needs_split {
3863 return Ok(());
3864 }
3865
3866 // Create a fresh new root one level above the current root.
3867 let old_root_arc = root_guard.take().expect("checked Some above");
3868 let old_root_level = {
3869 let g = old_root_arc.read();
3870 g.level()
3871 };
3872
3873 // newRoot = new IN(level = oldRoot.level + 1) with slot 0 = oldRoot.
3874 // The key at slot 0 is the virtual key (empty slice) following the
3875 // convention that entry-zero in an upper IN compares as -infinity.
3876 let new_root_arc =
3877 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3878 node_id: generate_node_id(),
3879 level: old_root_level + 1,
3880 entries: vec![InEntry { key: vec![] }],
3881 // T-4: slot 0's resident child is the old root.
3882 targets: TargetRep::Sparse(vec![(0, old_root_arc.clone())]),
3883 dirty: true,
3884 generation: 0,
3885 parent: None,
3886 lsn_rep: LsnRep::from_lsns(&[lsn]),
3887 })));
3888
3889 // Update the old root's parent pointer to the new root.
3890 {
3891 let mut g = old_root_arc.write();
3892 g.set_parent(Some(Arc::downgrade(&new_root_arc)));
3893 }
3894
3895 // Install the new root before calling split_child so split_child
3896 // (which itself takes parent.write()) can run unencumbered.
3897 *root_guard = Some(new_root_arc.clone());
3898 drop(root_guard);
3899
3900 // Now split the old root (which is now child at slot 0 in new_root).
3901 Self::split_child(
3902 &new_root_arc,
3903 0, // child is at slot 0
3904 self.max_entries_per_node,
3905 lsn,
3906 SplitHint::Normal,
3907 &[], // no insertion key at root-init time
3908 self.key_comparator.as_ref(),
3909 self.key_prefixing,
3910 self.in_list_listener.as_ref(),
3911 )?;
3912
3913 // EVICTOR-RECLAIM-1: register the freshly-promoted root IN with the
3914 // evictor's LRU (JE Tree.splitRoot adds the new root to the INList).
3915 // split_child above already registers the new sibling.
3916 let new_root_id = match &*new_root_arc.read() {
3917 TreeNode::Internal(n) => n.node_id,
3918 TreeNode::Bottom(b) => b.node_id,
3919 };
3920 self.note_added(new_root_id);
3921
3922 self.root_splits.fetch_add(1, Ordering::Relaxed);
3923 Ok(())
3924 }
3925
3926 /// Splits the child at `child_index` in `parent`.
3927 ///
3928 /// . This implementation always keeps the **left** half in the
3929 /// existing child node (`child_arc`) and puts the right half in the new
3930 /// sibling, regardless of where the `identifierKey` falls. JE's
3931 /// `IN.splitInternal` (`idKeyIndex` logic ~line 4172) can place either
3932 /// half in the existing node; Noxu's preemptive-split discipline ensures
3933 /// the parent always has a free slot at split time (the split is done on
3934 /// the way *down*, before the parent fills up), so the safe simplification
3935 /// of always using the left half is correct here — no routing information
3936 /// is lost. This comment replaces the previous incorrect claim that
3937 /// `idKeyIndex` drove the choice.
3938 ///
3939 /// Note: does not emit a split log entry; split nodes are marked dirty
3940 /// and flushed at the next checkpoint (flush_dirty_bins/upper_ins).
3941 ///
3942 /// ```text
3943 /// 1. splitIndex = child.nEntries / 2 (or 1 / n-1 for splitSpecial)
3944 /// 2. Create newSibling at the same level.
3945 /// 3. Move entries [splitIndex..nEntries) to newSibling.
3946 /// 4. Update parent slot childIndex -> child (left half),
3947 /// insert newSibling with newIdKey after childIndex.
3948 /// ```
3949 fn split_child(
3950 parent: &Arc<RwLock<TreeNode>>,
3951 child_index: usize,
3952 max_entries: usize,
3953 lsn: Lsn,
3954 hint: SplitHint,
3955 insert_key: &[u8],
3956 key_comparator: Option<&KeyComparatorFn>,
3957 key_prefixing: bool,
3958 listener: Option<&Arc<dyn InListListener>>,
3959 ) -> Result<(), TreeError> {
3960 // The split is performed under `parent.write()` for the entire
3961 // duration. This is a deliberate choice for correctness:
3962 //
3963 // - Without it, between dropping `child.write()` (after installing
3964 // the left half) and acquiring `parent.write()` (to install the
3965 // sibling), a concurrent descender can pick `child_arc` from the
3966 // parent (still pointing at it), descend, take `child.write()`
3967 // and insert a key. Whether the descender's key belongs in the
3968 // left half (now in `child`) or the right half (which will be
3969 // in the new sibling) is determined by the parent's split key —
3970 // but the parent doesn't know about the split key yet, so the
3971 // descender's routing decision is based on stale data. If the
3972 // descender's key falls in the right half, it lands in `child`
3973 // (left half) where a future search will not find it: the
3974 // future search descends from the root, the parent now has the
3975 // sibling installed, the search routes the key to the sibling,
3976 // the sibling does not contain the key — silently lost.
3977 //
3978 // - Holding `parent.write()` throughout serialises split_child
3979 // against every descender that wants `parent.read()`. A
3980 // descender already holding `parent.read()` (latch coupling
3981 // from above) keeps split_child waiting at this lock until it
3982 // has finished its own work. Combined, the split + sibling
3983 // install is atomic with respect to descents.
3984 //
3985 // - Splits are infrequent compared to inserts (~ once per
3986 // max_entries new keys) so the extra serialisation here does
3987 // not dominate.
3988 //
3989 // Reproducer that exercises this race:
3990 // crates/noxu-db/tests/concurrent_commits_stress.rs.
3991 let mut parent_write_guard = parent.write();
3992
3993 // Extract the child Arc from the parent slot.
3994 let child_arc = match &*parent_write_guard {
3995 TreeNode::Internal(p) => {
3996 p.get_child(child_index).ok_or(TreeError::SplitRequired)?
3997 }
3998 TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
3999 };
4000
4001 // Gather all entries from the child plus split metadata, AND
4002 // perform the in-place left-half install, all under a single
4003 // write lock on the child. See the earlier comment on the race
4004 // this avoids inside split_child.
4005 let mut child_guard = child_arc.write();
4006
4007 // Re-validate that the child still needs splitting, now that we hold
4008 // its write lock. This closes a check-then-act race: the caller
4009 // (`insert_recursive_inner`) tested `child.get_n_entries() >=
4010 // max_entries` under a PARENT READ lock, then dropped that read lock
4011 // (required — the split needs `parent.write()`) before calling
4012 // `split_child`. Read locks do not exclude each other, so two
4013 // descenders can both pass the fullness check on the same child, both
4014 // drop the parent read lock, and both call `split_child`. They
4015 // serialise here on `parent.write()`: the first splits the child
4016 // (leaving it with only its left half), and by the time the second
4017 // acquires this child write lock the child is no longer full — or is
4018 // empty, if a concurrent INCompressor merge cleared it
4019 // (`compress_node`'s `lb.entries.clear()`). Without this re-check the
4020 // second caller would build a `SplitEntries` from that stale child and
4021 // panic in `SplitEntries::get_key(split_index)` on an empty entries
4022 // vec (tree.rs SplitEntries::get_key `v[index]`, observed as
4023 // "index out of bounds: len is 0" under the 96-thread saturation
4024 // benchmark; see .agent/archived-audits/bench/
4025 // bug-bin-split-concurrency.md).
4026 //
4027 // JE performs the identical re-validation: `IN.split` re-checks
4028 // `needsSplitting()` *after* latching the node it will split, so the
4029 // fullness test and the split are atomic w.r.t. the node latch (see
4030 // `IN.split` / `IN.needsSplitting` in IN.java; `Tree.forceSplit`
4031 // latch-couples down and `IN.split` re-tests before mutating). Here
4032 // the child write guard plays the role of that node latch.
4033 //
4034 // A no-op split returns `Ok(())` — the SAME success variant a real
4035 // split returns — because the caller re-descends unconditionally
4036 // after `split_child` (`return Self::insert_recursive_inner(...)`),
4037 // where it re-reads the (now-current) topology and re-checks
4038 // `child_full`. So a benign "already split" outcome simply leads to a
4039 // correct re-descent and the insert proceeds. This does NOT widen any
4040 // lock or hold `parent.write()` across the caller's read-check, so it
4041 // does not re-introduce the descent over-serialisation fixed in 7.2.1.
4042 if child_guard.get_n_entries() < max_entries {
4043 return Ok(());
4044 }
4045
4046 let child_level = child_guard.level();
4047 // St-H6: capture the splitting BIN's expiration_in_hours flag BEFORE
4048 // drop(child_guard) so the right-half sibling inherits it.
4049 // JE: BIN.java::setExpiration calls setExpirationInHours(hours) to
4050 // propagate the flag on split/clone; the Rust split was hardcoding
4051 // false instead of inheriting — this caused hours-granularity TTL
4052 // entries in the right sibling to be read with in_hours=false, making
4053 // the hours-since-epoch value compare as seconds-since-epoch (far in
4054 // the past) and every right-sibling TTL record appear expired.
4055 let bin_expiration_in_hours: bool = match &*child_guard {
4056 TreeNode::Bottom(b) => b.expiration_in_hours,
4057 // Internal nodes do not carry per-entry TTL; default to true
4058 // (the engine-wide invariant for any BIN that may hold TTL data).
4059 TreeNode::Internal(_) => true,
4060 };
4061 // T-2/T-5: the compact-key threshold the new sibling BIN inherits.
4062 // (Only consumed when the child is a BIN; an upper-IN split produces
4063 // upper-IN siblings, which have no compact key rep.)
4064 let bin_compact_max_key_length: i32 = match &*child_guard {
4065 TreeNode::Bottom(b) => b.compact_max_key_length,
4066 TreeNode::Internal(_) => INKeyRep_DEFAULT_MAX_KEY_LENGTH,
4067 };
4068 let (all_entries, bin_old_prefix) = match &*child_guard {
4069 TreeNode::Internal(n) => {
4070 // T-4: capture the parallel resident-child array alongside the
4071 // entries so children travel with their slots through the
4072 // split (JE `IN.split` copies `entryTargets`).
4073 let children: Vec<Option<ChildArc>> =
4074 (0..n.entries.len()).map(|i| n.get_child(i)).collect();
4075 // T-3: capture the parallel per-slot LSNs so they travel with
4076 // their slots (JE `IN.split` copies `entryLsnByteArray`).
4077 let lsns: Vec<Lsn> =
4078 (0..n.entries.len()).map(|i| n.get_lsn(i)).collect();
4079 (
4080 SplitEntries::Internal(n.entries.clone(), children, lsns),
4081 Vec::new(),
4082 )
4083 }
4084 TreeNode::Bottom(b) => {
4085 // Decompress to full keys.
4086 let full: Vec<BinEntry> = (0..b.entries.len())
4087 .map(|i| BinEntry {
4088 data: b.entries[i].data.clone(),
4089 known_deleted: b.entries[i].known_deleted,
4090 dirty: b.entries[i].dirty,
4091 expiration_time: b.entries[i].expiration_time,
4092 })
4093 .collect();
4094 let lsns: Vec<Lsn> =
4095 (0..b.entries.len()).map(|i| b.get_lsn(i)).collect();
4096 // T-2: carry FULL keys through the split; the new BINs
4097 // recompute their own prefix from them.
4098 let full_keys: Vec<Vec<u8>> = (0..b.entries.len())
4099 .map(|i| b.get_full_key(i).unwrap_or_default())
4100 .collect();
4101 (
4102 SplitEntries::Bottom(full, lsns, full_keys),
4103 b.key_prefix.clone(),
4104 )
4105 }
4106 };
4107
4108 // Determine split point — JE `IN.splitSpecial` / `IN.splitInternal`.
4109 //
4110 // Normal midpoint: `n_entries / 2`.
4111 // AllLeft: insertion key is at position 0 on every descend level.
4112 // → split_index = 1 (left half keeps n-1 entries; new right sibling
4113 // gets only the former-first slot, then the insertion fills it).
4114 // This matches JE: `if (leftSide && index == 0) splitInternal(…, 1)`.
4115 // AllRight: insertion key is at the last position on every level.
4116 // → split_index = n_entries - 1 (left half keeps all but one entry).
4117 // JE: `else if (!leftSide && index == nEntries-1) splitInternal(…, nEntries-1)`.
4118 //
4119 // Ref: `IN.java` splitSpecial ~line 4129, splitInternal ~line 4159.
4120 let n_entries = all_entries.len();
4121 let split_index = if n_entries >= 2 {
4122 // Find where insert_key falls in the child.
4123 let insert_idx = {
4124 let mut idx = 0usize;
4125 for i in 1..n_entries {
4126 let ord = match key_comparator {
4127 Some(cmp) => cmp(all_entries.get_key(i), insert_key),
4128 None => all_entries.get_key(i).cmp(insert_key),
4129 };
4130 if ord != std::cmp::Ordering::Greater {
4131 idx = i;
4132 } else {
4133 break;
4134 }
4135 }
4136 idx
4137 };
4138 match hint {
4139 SplitHint::AllLeft if insert_idx == 0 => 1,
4140 SplitHint::AllRight if insert_idx == n_entries - 1 => {
4141 n_entries - 1
4142 }
4143 _ => n_entries / 2,
4144 }
4145 } else {
4146 n_entries / 2
4147 };
4148
4149 // newIdKey — the full key of the first entry of the right half.
4150 // For BIN: entries are already full keys after decompression above.
4151 // For IN: entries carry full keys directly.
4152 let new_id_key = all_entries.get_key(split_index).to_vec();
4153 // Suppress unused-variable warning when no BIN is involved.
4154 let _ = &bin_old_prefix;
4155
4156 // Divide into left and right halves.
4157 let left_entries = all_entries.slice(0, split_index);
4158 let right_entries = all_entries.slice(split_index, n_entries);
4159
4160 // Install the left half into `child_arc` (still under the same
4161 // write lock) and mark the node dirty.
4162 match (&mut *child_guard, &left_entries) {
4163 (TreeNode::Internal(n), SplitEntries::Internal(le, lc, ll)) => {
4164 n.entries = le.clone();
4165 // T-4: reinstall the (now-shorter) left child array.
4166 n.targets = TargetRep::None;
4167 for (i, c) in lc.iter().enumerate() {
4168 if let Some(child) = c {
4169 n.set_child(i, Some(child.clone()));
4170 }
4171 }
4172 // T-3: reinstall the (now-shorter) left LSN array.
4173 n.lsn_rep = LsnRep::from_lsns(ll);
4174 }
4175 (TreeNode::Bottom(b), SplitEntries::Bottom(le, ll, lk)) => {
4176 // Reset prefix; keys arrive as FULL keys (no prefix yet).
4177 b.key_prefix = Vec::new();
4178 // Pre-allocate at max_entries capacity so the left half
4179 // does not need to reallocate on the next insert (Fix 3).
4180 let mut left = Vec::with_capacity(max_entries);
4181 left.extend_from_slice(le);
4182 b.entries = left;
4183 // T-3: reinstall the left LSN array.
4184 b.lsn_rep = LsnRep::from_lsns(ll);
4185 // T-2: reinstall the left key rep from the full keys (Default;
4186 // recompute_key_prefix below compresses + compacts).
4187 b.keys = KeyRep::from_keys(lk.clone());
4188 // Recompute prefix on each half after split (only when
4189 // key_prefixing is enabled for this database).
4190 // JE: IN.computeKeyPrefix returns null when
4191 // databaseImpl.getKeyPrefixing() is false.
4192 // Ref: IN.java computeKeyPrefix ~line 2456.
4193 if key_prefixing && b.entries.len() >= 2 {
4194 b.recompute_key_prefix();
4195 } else {
4196 b.keys.compact(b.compact_max_key_length); // T-2
4197 }
4198 }
4199 _ => return Err(TreeError::SplitRequired),
4200 }
4201 child_guard.set_dirty(true);
4202 drop(child_guard);
4203
4204 // Create the new right-half sibling.
4205 // Parent pointer will be wired in when it is inserted into the parent.
4206 let new_sibling = match right_entries {
4207 SplitEntries::Internal(re, rc, rl) => {
4208 let mut rin = InNodeStub {
4209 node_id: generate_node_id(),
4210 level: child_level,
4211 entries: re,
4212 targets: TargetRep::None,
4213 dirty: true,
4214 generation: 0,
4215 parent: None, // set below
4216 // T-3: the right half's per-slot LSNs.
4217 lsn_rep: LsnRep::from_lsns(&rl),
4218 };
4219 // T-4: install the right half's resident children.
4220 for (i, c) in rc.into_iter().enumerate() {
4221 if c.is_some() {
4222 rin.set_child(i, c);
4223 }
4224 }
4225 Arc::new(RwLock::new(TreeNode::Internal(rin)))
4226 }
4227 SplitEntries::Bottom(re, rl, rk) => {
4228 // Entries arrive as FULL keys; build BinStub with no prefix
4229 // then recompute key prefix for the new sibling.
4230 // Pre-allocate at max_entries capacity so the right half
4231 // does not need to reallocate on the next insert (Fix 3).
4232 let mut right = Vec::with_capacity(max_entries);
4233 right.extend(re);
4234 let mut sibling_bin = BinStub {
4235 node_id: generate_node_id(),
4236 level: child_level,
4237 entries: right,
4238 key_prefix: Vec::new(),
4239 dirty: true,
4240 is_delta: false,
4241 last_full_lsn: NULL_LSN,
4242 last_delta_lsn: NULL_LSN,
4243 generation: 0,
4244 parent: None, // set below
4245 // St-H6 fix: inherit the splitting BIN's flag so that
4246 // is_expired() uses the correct granularity for entries
4247 // that were already in the BIN before the split.
4248 // JE reference: BIN.java::split() propagates
4249 // expirationInHours via setExpirationInHours(hours).
4250 expiration_in_hours: bin_expiration_in_hours,
4251 cursor_count: 0,
4252 prohibit_next_delta: false,
4253 // T-3: the right half's per-slot LSNs.
4254 lsn_rep: LsnRep::from_lsns(&rl),
4255 // T-2: full keys (Default); recompute/compact below.
4256 keys: KeyRep::from_keys(rk),
4257 compact_max_key_length: bin_compact_max_key_length,
4258 };
4259 // St-H6 debug guard: the sibling must carry the same flag as
4260 // the splitting BIN so that in_hours-resolution entries are
4261 // never silently expired by a mismatched false flag.
4262 debug_assert_eq!(
4263 sibling_bin.expiration_in_hours, bin_expiration_in_hours,
4264 "St-H6 invariant: sibling BIN expiration_in_hours must \
4265 match the splitting BIN (got {}, expected {})",
4266 sibling_bin.expiration_in_hours, bin_expiration_in_hours
4267 );
4268
4269 if key_prefixing && sibling_bin.entries.len() >= 2 {
4270 sibling_bin.recompute_key_prefix();
4271 } else {
4272 sibling_bin.keys.compact(bin_compact_max_key_length); // T-2
4273 }
4274 Arc::new(RwLock::new(TreeNode::Bottom(sibling_bin)))
4275 }
4276 };
4277
4278 // Note: the child (left half) was marked dirty earlier under the
4279 // same write lock that installed left_entries; no need to re-take
4280 // the write lock here.
4281
4282 // Insert the new sibling into the parent after child_index.
4283 // We already hold `parent.write()` (taken at the top of the
4284 // function); operate on it directly rather than re-acquiring.
4285 match &mut *parent_write_guard {
4286 TreeNode::Internal(p) => {
4287 let insert_pos = child_index + 1;
4288 // T-4: insert the parent slot and set its cached child via the
4289 // node-level INTargetRep (shifting existing children).
4290 p.insert_entry(
4291 insert_pos,
4292 new_id_key,
4293 lsn,
4294 Some(new_sibling.clone()),
4295 );
4296 // Parent is dirty because it gained a new entry.
4297 p.dirty = true;
4298 }
4299 TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
4300 }
4301
4302 // Wire the new sibling's parent pointer to the parent node
4303 // before releasing parent_write_guard, so a future descent that
4304 // takes parent.read() and finds the sibling immediately sees a
4305 // fully-wired parent pointer.
4306 {
4307 let mut g = new_sibling.write();
4308 g.set_parent(Some(Arc::downgrade(parent)));
4309 }
4310 // T-4: when an upper IN split, the children that moved into the new
4311 // sibling must have their parent back-pointers re-wired to the
4312 // sibling (JE re-parents moved targets in IN.split).
4313 {
4314 let sg = new_sibling.read();
4315 if let TreeNode::Internal(sn) = &*sg {
4316 let moved = sn.resident_children();
4317 drop(sg);
4318 for child in moved {
4319 let mut cg = child.write();
4320 cg.set_parent(Some(Arc::downgrade(&new_sibling)));
4321 }
4322 }
4323 }
4324 drop(parent_write_guard);
4325
4326 // EVICTOR-RECLAIM-1: register the freshly-split sibling with the
4327 // evictor's LRU (JE IN.splitInternal calls inList.add(newSibling)).
4328 // Without this, split-created BINs/INs are invisible to the evictor:
4329 // the policy lists never receive them, every evict_batch phase quota
4330 // is 0, and eviction reclaims nothing under pressure even though the
4331 // nodes are fully resident. Only the very first root+BIN (the
4332 // first-key path) and re-fetched nodes were ever registered.
4333 if let Some(l) = listener {
4334 let sibling_id = match &*new_sibling.read() {
4335 TreeNode::Internal(n) => n.node_id,
4336 TreeNode::Bottom(b) => b.node_id,
4337 };
4338 l.note_ins_added(sibling_id);
4339 }
4340
4341 Ok(())
4342 }
4343
4344 /// Recursive insert with preemptive splitting.
4345 ///
4346 /// Top-down traversal in `Tree.forceSplit` +
4347 /// `Tree.searchSplitsAllowed`:
4348 ///
4349 /// 1. At an upper IN: find which child slot covers `key`, split the child
4350 /// proactively if it is full (so we always have room to insert the split
4351 /// key into the parent), then recurse into the appropriate child.
4352 /// 2. At a BIN: insert the key/data directly.
4353 ///
4354 /// This implements the "preemptive splitting" strategy from the: we split
4355 /// children on the way down so we never need to walk back up.
4356 fn insert_recursive(
4357 node_arc: &Arc<RwLock<TreeNode>>,
4358 key: Vec<u8>,
4359 data: Vec<u8>,
4360 lsn: Lsn,
4361 max_entries: usize,
4362 key_comparator: Option<&KeyComparatorFn>,
4363 key_prefixing: bool,
4364 listener: Option<&Arc<dyn InListListener>>,
4365 ) -> Result<bool, TreeError> {
4366 Self::insert_recursive_inner(
4367 node_arc,
4368 key,
4369 data,
4370 lsn,
4371 max_entries,
4372 key_comparator,
4373 key_prefixing,
4374 true, // all_left_so_far
4375 true, // all_right_so_far
4376 listener,
4377 )
4378 }
4379
4380 /// Inner recursive helper that threads `allLeftSideDescent` /
4381 /// `allRightSideDescent` from `Tree.forceSplit` (JE ~line 1912).
4382 ///
4383 /// Both flags start `true` at the root and are cleared as soon as the
4384 /// descent takes a non-leftmost / non-rightmost child slot. At split
4385 /// time they are forwarded to `split_child` which uses them to pick the
4386 /// `splitSpecial` split index (JE `IN.splitSpecial` ~line 4129).
4387 #[allow(clippy::too_many_arguments)]
4388 fn insert_recursive_inner(
4389 node_arc: &Arc<RwLock<TreeNode>>,
4390 key: Vec<u8>,
4391 data: Vec<u8>,
4392 lsn: Lsn,
4393 max_entries: usize,
4394 key_comparator: Option<&KeyComparatorFn>,
4395 key_prefixing: bool,
4396 all_left_so_far: bool,
4397 all_right_so_far: bool,
4398 listener: Option<&Arc<dyn InListListener>>,
4399 ) -> Result<bool, TreeError> {
4400 // Determine if this is a BIN (leaf level).
4401 //
4402 // We hold a read lock on `node_arc` (the parent of any descent we
4403 // do below) for the duration of this call, releasing it just
4404 // before returning. That achieves *latch coupling*: a concurrent
4405 // `split_child(parent, …)` that wants to reorganise our subtree
4406 // ultimately needs `parent.write()` to install the new sibling,
4407 // and that write blocks until our read lock is dropped. Without
4408 // this, the descender-vs-splitter race goes:
4409 //
4410 // T_X: at root, picks child_arc (BIN), drops root read lock.
4411 // T_Y: at root, runs split_child(root, …): takes child_arc.write(),
4412 // installs left half [E1..E5], creates sibling [E6..E10],
4413 // takes root.write() and inserts the sibling.
4414 // T_X: now takes child_arc.write() and inserts a key whose
4415 // sort order falls in the right half. The key lands in
4416 // child_arc (left half) but a future search descending
4417 // from the root routes that key to the new sibling and
4418 // does not find it — silently lost.
4419 //
4420 // Reproducer: noxu-db/tests/concurrent_commits_stress.rs
4421 // (32 threads × 100 keys, ~1–6 lost writes per run before this fix;
4422 // occasionally hundreds when an entire BIN is orphaned).
4423 let parent_guard = node_arc.read();
4424 let is_bin = parent_guard.is_bin();
4425
4426 if is_bin {
4427 // BIN: drop the read lock and take the write lock; this is
4428 // safe because the *outer* call frame still holds a read
4429 // lock on this BIN's parent (or this is the root, in which
4430 // case the first-key path has already initialised it). A
4431 // concurrent split_child(parent, …) cannot run while the
4432 // outer parent.read() is held, so the BIN cannot be
4433 // restructured between dropping our read lock and acquiring
4434 // our write lock.
4435 drop(parent_guard);
4436 let mut guard = node_arc.write();
4437 match &mut *guard {
4438 TreeNode::Bottom(bin) => {
4439 let is_new = if let Some(cmp) = key_comparator {
4440 // Comparator-based insert: no prefix compression.
4441 let (_idx, new) =
4442 bin.insert_cmp(key, lsn, Some(data), cmp.as_ref());
4443 new
4444 } else if key_prefixing {
4445 // insert_with_prefix handles prefix recomputation when
4446 // the new key shrinks the existing prefix, and also
4447 // initialises the prefix when 2 entries are present for
4448 // the first time.
4449 let (_idx, new) =
4450 bin.insert_with_prefix(key, lsn, Some(data));
4451 new
4452 } else {
4453 // key_prefixing disabled: store full key, no prefix.
4454 // JE: IN.computeKeyPrefix returns null when
4455 // databaseImpl.getKeyPrefixing() is false.
4456 // Ref: IN.java computeKeyPrefix ~line 2456.
4457 let (_idx, new) = bin.insert_raw(key, lsn, Some(data));
4458 new
4459 };
4460 // Mark dirty after any modification.
4461 bin.dirty = true;
4462 Ok(is_new)
4463 }
4464 TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4465 }
4466 } else {
4467 // Upper IN: find the child slot that covers key.
4468 // Index = parent.findEntry(key, false, false)
4469 // Entry zero in an upper IN has a virtual key (-infinity), so
4470 // any real key is routed to at least slot 0.
4471 let (child_index, n_entries_at_level, child_arc) =
4472 match &*parent_guard {
4473 TreeNode::Internal(n) => {
4474 // Binary search for the largest key <= search key.
4475 // Slot 0 always matches (virtual key = -infinity).
4476 let mut idx = 0usize;
4477 for (i, entry) in n.entries.iter().enumerate() {
4478 if i == 0 {
4479 idx = 0;
4480 } else {
4481 let ord = match key_comparator {
4482 Some(cmp) => cmp(
4483 entry.key.as_slice(),
4484 key.as_slice(),
4485 ),
4486 None => {
4487 entry.key.as_slice().cmp(key.as_slice())
4488 }
4489 };
4490 if ord != std::cmp::Ordering::Greater {
4491 idx = i;
4492 } else {
4493 break;
4494 }
4495 }
4496 }
4497 let child =
4498 n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4499 (idx, n.entries.len(), child)
4500 }
4501 TreeNode::Bottom(_) => {
4502 return Err(TreeError::SplitRequired);
4503 }
4504 };
4505
4506 // Update the descent-side flags (JE `Tree.forceSplit` ~1959).
4507 // `allLeftSideDescent` ← still true only if we chose slot 0.
4508 // `allRightSideDescent` ← still true only if we chose the last slot.
4509 let all_left = all_left_so_far && child_index == 0;
4510 let all_right = all_right_so_far
4511 && child_index == n_entries_at_level.saturating_sub(1);
4512
4513 // Proactively split the child if it is full.
4514 // If (child.needsSplitting()) child.split(parent, ...)
4515 let child_full = {
4516 let g = child_arc.read();
4517 g.get_n_entries() >= max_entries
4518 };
4519
4520 if child_full {
4521 // Build the splitSpecial hint from the accumulated flags.
4522 // JE `Tree.forceSplit` ~line 2010:
4523 // if (allLeftSideDescent || allRightSideDescent)
4524 // child.splitSpecial(parent, index, grandParent,
4525 // maxTreeEntriesPerNode, key, allLeftSideDescent)
4526 let hint = match (all_left, all_right) {
4527 (true, _) => SplitHint::AllLeft,
4528 (_, true) => SplitHint::AllRight,
4529 _ => SplitHint::Normal,
4530 };
4531 // split_child(parent, …) needs parent.write(); we must
4532 // drop our parent read lock before calling it.
4533 drop(parent_guard);
4534 Self::split_child(
4535 node_arc,
4536 child_index,
4537 max_entries,
4538 lsn,
4539 hint,
4540 &key,
4541 key_comparator,
4542 key_prefixing,
4543 listener,
4544 )?;
4545
4546 // After the split, re-find which child now covers key.
4547 // Re-enter at the top of the inner function; carry the
4548 // flags (the new topology doesn't invalidate them — we
4549 // still know the overall descent direction).
4550 return Self::insert_recursive_inner(
4551 node_arc,
4552 key,
4553 data,
4554 lsn,
4555 max_entries,
4556 key_comparator,
4557 key_prefixing,
4558 all_left_so_far,
4559 all_right_so_far,
4560 listener,
4561 );
4562 }
4563
4564 // Descend into the child while still holding parent_guard.
4565 // The recursive call will hold child.read() before this
4566 // returns, then drop it; combined with our parent_guard,
4567 // the latch coupling chain is preserved on the way down and
4568 // unwound on the way back up.
4569 let r = Self::insert_recursive_inner(
4570 &child_arc,
4571 key,
4572 data,
4573 lsn,
4574 max_entries,
4575 key_comparator,
4576 key_prefixing,
4577 all_left,
4578 all_right,
4579 listener,
4580 );
4581 drop(parent_guard);
4582 r
4583 }
4584 }
4585
4586 /// Slice-based variant of [`Tree::insert_recursive`] for the recovery redo path.
4587 ///
4588 /// Accepts `key: &[u8]` and `data: Option<&[u8]>` instead of owned
4589 /// `Vec<u8>` values. At the BIN leaf, calls
4590 /// [`BinStub::insert_with_prefix_slice`] which copies bytes into the
4591 /// `BinEntry` exactly once.
4592 ///
4593 /// For the comparator path (custom key comparator), falls back to
4594 /// `insert_cmp` with a one-time `to_vec()` conversion — that path is
4595 /// rare in practice (sorted-dup databases only) and is not on the
4596 /// W11 hot path.
4597 ///
4598 /// Wave 11-K optimisation (Fix 1).
4599 fn redo_insert_recursive(
4600 node_arc: &Arc<RwLock<TreeNode>>,
4601 key: &[u8],
4602 data: Option<&[u8]>,
4603 lsn: Lsn,
4604 max_entries: usize,
4605 key_comparator: Option<&KeyComparatorFn>,
4606 key_prefixing: bool,
4607 ) -> Result<bool, TreeError> {
4608 Self::redo_insert_recursive_inner(
4609 node_arc,
4610 key,
4611 data,
4612 lsn,
4613 max_entries,
4614 key_comparator,
4615 key_prefixing,
4616 true,
4617 true,
4618 )
4619 }
4620
4621 #[allow(clippy::too_many_arguments)]
4622 fn redo_insert_recursive_inner(
4623 node_arc: &Arc<RwLock<TreeNode>>,
4624 key: &[u8],
4625 data: Option<&[u8]>,
4626 lsn: Lsn,
4627 max_entries: usize,
4628 key_comparator: Option<&KeyComparatorFn>,
4629 key_prefixing: bool,
4630 all_left_so_far: bool,
4631 all_right_so_far: bool,
4632 ) -> Result<bool, TreeError> {
4633 let parent_guard = node_arc.read();
4634 let is_bin = parent_guard.is_bin();
4635
4636 if is_bin {
4637 drop(parent_guard);
4638 let mut guard = node_arc.write();
4639 match &mut *guard {
4640 TreeNode::Bottom(bin) => {
4641 // REC-F2: JE redo currency check
4642 // (RecoveryManager.redo() line ~2512/2544). A logged LN
4643 // is applied only when logrecLsn > treeLsn. If the slot
4644 // already holds an equal-or-newer LSN, skip the overwrite
4645 // so an out-of-order (older-LSN) redo cannot revert
4646 // committed data or reset the slot LSN backward. This
4647 // makes redo genuinely idempotent regardless of
4648 // redo/undo phase order. Deletes never reach this path
4649 // (redo_ln routes Delete through tree.delete), so the JE
4650 // "lsnCmp == 0 && isDeletion -> set KD" sub-case does not
4651 // apply here.
4652 let cmp_ref = key_comparator.map(|c| {
4653 c.as_ref()
4654 as &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering
4655 });
4656 if let Some(slot_lsn) =
4657 bin.redo_slot_lsn(key, cmp_ref, key_prefixing)
4658 && lsn <= slot_lsn
4659 {
4660 // Tree already holds an equal-or-newer version.
4661 return Ok(false);
4662 }
4663 let is_new = if let Some(cmp) = key_comparator {
4664 // Comparator path: fall back to owned-Vec variant.
4665 let (_idx, new) = bin.insert_cmp(
4666 key.to_vec(),
4667 lsn,
4668 data.map(|d| d.to_vec()),
4669 cmp.as_ref(),
4670 );
4671 new
4672 } else if key_prefixing {
4673 let (_idx, new) =
4674 bin.insert_with_prefix_slice(key, lsn, data);
4675 new
4676 } else {
4677 // key_prefixing disabled: store full key verbatim.
4678 // Ref: IN.java computeKeyPrefix ~line 2456.
4679 let (_idx, new) = bin.insert_raw(
4680 key.to_vec(),
4681 lsn,
4682 data.map(|d| d.to_vec()),
4683 );
4684 new
4685 };
4686 bin.dirty = true;
4687 Ok(is_new)
4688 }
4689 TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4690 }
4691 } else {
4692 let (child_index, n_entries_at_level, child_arc) =
4693 match &*parent_guard {
4694 TreeNode::Internal(n) => {
4695 let mut idx = 0usize;
4696 for (i, entry) in n.entries.iter().enumerate() {
4697 if i == 0 {
4698 idx = 0;
4699 } else {
4700 let ord = match key_comparator {
4701 Some(cmp) => cmp(entry.key.as_slice(), key),
4702 None => entry.key.as_slice().cmp(key),
4703 };
4704 if ord != std::cmp::Ordering::Greater {
4705 idx = i;
4706 } else {
4707 break;
4708 }
4709 }
4710 }
4711 let child =
4712 n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4713 (idx, n.entries.len(), child)
4714 }
4715 TreeNode::Bottom(_) => {
4716 return Err(TreeError::SplitRequired);
4717 }
4718 };
4719
4720 let all_left = all_left_so_far && child_index == 0;
4721 let all_right = all_right_so_far
4722 && child_index == n_entries_at_level.saturating_sub(1);
4723
4724 let child_full = {
4725 let g = child_arc.read();
4726 g.get_n_entries() >= max_entries
4727 };
4728
4729 if child_full {
4730 let hint = match (all_left, all_right) {
4731 (true, _) => SplitHint::AllLeft,
4732 (_, true) => SplitHint::AllRight,
4733 _ => SplitHint::Normal,
4734 };
4735 drop(parent_guard);
4736 Self::split_child(
4737 node_arc,
4738 child_index,
4739 max_entries,
4740 lsn,
4741 hint,
4742 key,
4743 key_comparator,
4744 key_prefixing,
4745 // Recovery redo path: the listener is not active during
4746 // log replay (the evictor is wired AFTER recovery, and
4747 // the INList is rebuilt separately). EVICTOR-RECLAIM-1
4748 // registration happens on the live insert path.
4749 None,
4750 )?;
4751 return Self::redo_insert_recursive_inner(
4752 node_arc,
4753 key,
4754 data,
4755 lsn,
4756 max_entries,
4757 key_comparator,
4758 key_prefixing,
4759 all_left_so_far,
4760 all_right_so_far,
4761 );
4762 }
4763
4764 let r = Self::redo_insert_recursive_inner(
4765 &child_arc,
4766 key,
4767 data,
4768 lsn,
4769 max_entries,
4770 key_comparator,
4771 key_prefixing,
4772 all_left,
4773 all_right,
4774 );
4775 drop(parent_guard);
4776 r
4777 }
4778 }
4779
4780 /// Pre-warm the tree's internal `Vec<BinEntry>` capacity before a redo
4781 /// pass that will insert approximately `n` records.
4782 ///
4783 /// If the tree is empty, this is a no-op (there is no BIN yet to reserve
4784 /// capacity on). If the tree already has a root BIN (from a previous
4785 /// checkpoint), reserves `n.min(max_entries_per_node)` additional slots
4786 /// in that BIN's entries vector, eliminating the resize-double cycle
4787 /// during the redo loop.
4788 ///
4789 /// Wave 11-K optimisation (Fix 3).
4790 pub fn reserve_redo_capacity(&self, n: usize) {
4791 if n == 0 {
4792 return;
4793 }
4794 let root = match self.get_root() {
4795 Some(r) => r,
4796 None => return,
4797 };
4798 // Descend to the leftmost BIN and reserve there.
4799 let mut arc = root;
4800 loop {
4801 let guard = arc.read();
4802 match &*guard {
4803 TreeNode::Bottom(bin_guard) => {
4804 let additional = n
4805 .min(self.max_entries_per_node)
4806 .saturating_sub(bin_guard.entries.len());
4807 drop(guard);
4808 let mut wguard = arc.write();
4809 if let TreeNode::Bottom(bin) = &mut *wguard {
4810 bin.entries.reserve(additional);
4811 }
4812 return;
4813 }
4814 TreeNode::Internal(inner) => {
4815 let child = inner.get_child(0);
4816 drop(guard);
4817 match child {
4818 Some(c) => arc = c,
4819 None => return,
4820 }
4821 }
4822 }
4823 }
4824 }
4825
4826 /// Get the first (leftmost) BIN in the tree.
4827 ///
4828 /// Descends to the leftmost BIN by
4829 /// always following the first child slot at each upper IN level.
4830 pub fn get_first_node(&self) -> Option<SearchResult> {
4831 let mut guard: parking_lot::ArcRwLockReadGuard<
4832 parking_lot::RawRwLock,
4833 TreeNode,
4834 > = self.get_root()?.read_arc();
4835
4836 loop {
4837 if guard.is_bin() {
4838 let n = guard.get_n_entries();
4839 if n == 0 {
4840 return None;
4841 }
4842 // TREE-F1: return the first LIVE slot, skipping known_deleted
4843 // slots (CursorImpl.java:2062-2064). If the leftmost BIN is
4844 // entirely KD during the reconstitution window the cursor's
4845 // get_first falls through to its cross-BIN advance.
4846 if let TreeNode::Bottom(b) = &*guard {
4847 match (0..b.entries.len()).find(|&i| b.slot_is_live(i)) {
4848 Some(i) => {
4849 return Some(SearchResult::with_values(
4850 true, i as i32, false,
4851 ));
4852 }
4853 None => return None,
4854 }
4855 }
4856 return Some(SearchResult::with_values(true, 0, false));
4857 }
4858
4859 // Capture the leftmost child Arc while holding `guard`, then
4860 // hand-over-hand: take the child read lock before releasing
4861 // the parent's. Same race fix as `Tree::search`.
4862 let next_arc = match &*guard {
4863 TreeNode::Internal(n_node) => n_node.get_child(0)?,
4864 _ => return None,
4865 };
4866 let next_guard = next_arc.read_arc();
4867 drop(guard);
4868 guard = next_guard;
4869 }
4870 }
4871
4872 /// Get the last (rightmost) BIN in the tree.
4873 ///
4874 /// Descends to the rightmost BIN by
4875 /// always following the last child slot at each upper IN level.
4876 pub fn get_last_node(&self) -> Option<SearchResult> {
4877 let mut guard: parking_lot::ArcRwLockReadGuard<
4878 parking_lot::RawRwLock,
4879 TreeNode,
4880 > = self.get_root()?.read_arc();
4881
4882 loop {
4883 if guard.is_bin() {
4884 let n = guard.get_n_entries();
4885 if n == 0 {
4886 return None;
4887 }
4888 // TREE-F1: return the last LIVE slot, skipping known_deleted
4889 // slots (CursorImpl.java:2062-2064).
4890 if let TreeNode::Bottom(b) = &*guard {
4891 match (0..b.entries.len())
4892 .rev()
4893 .find(|&i| b.slot_is_live(i))
4894 {
4895 Some(i) => {
4896 return Some(SearchResult::with_values(
4897 true, i as i32, false,
4898 ));
4899 }
4900 None => return None,
4901 }
4902 }
4903 return Some(SearchResult::with_values(
4904 true,
4905 (n - 1) as i32,
4906 false,
4907 ));
4908 }
4909
4910 // Capture the rightmost child Arc while holding `guard`, then
4911 // hand-over-hand: take the child read lock before releasing
4912 // the parent's. Same race fix as `Tree::search`.
4913 let next_arc = match &*guard {
4914 TreeNode::Internal(n_node) => {
4915 n_node.get_child(n_node.entries.len().saturating_sub(1))?
4916 }
4917 _ => return None,
4918 };
4919 let next_guard = next_arc.read_arc();
4920 drop(guard);
4921 guard = next_guard;
4922 }
4923 }
4924
4925 /// Returns the number of root splits that have occurred.
4926 pub fn get_root_splits(&self) -> u64 {
4927 self.root_splits.load(Ordering::Relaxed)
4928 }
4929
4930 /// Returns the number of relatches required.
4931 pub fn get_relatches_required(&self) -> u64 {
4932 self.relatches_required.load(Ordering::Relaxed)
4933 }
4934
4935 /// Delete a key from the tree.
4936 ///
4937 /// Traverses the tree to find the BIN that should contain the key, then
4938 /// removes the entry. Returns true if the key was found and removed.
4939 ///
4940 /// Delete path in `Tree` from the.
4941 ///
4942 /// In-memory removal only — WAL logging for deletes is handled by the
4943 /// cursor layer (`cursor_impl.rs::log_ln_write`) before this is called,
4944 /// matching separation between LN logging and tree mutation.
4945 pub fn delete(&self, key: &[u8]) -> bool {
4946 let root = match self.get_root() {
4947 Some(r) => r,
4948 None => return false,
4949 };
4950
4951 // F8 consistency: insert accounts key + data + BIN_ENTRY_OVERHEAD; delete must
4952 // subtract the SAME (data_len was previously omitted, leaking
4953 // data_len from the cache counter on every delete and biasing the
4954 // evictor's over-budget view). Peek the data length before deleting.
4955 let data_len = if self.memory_counter.is_some() {
4956 self.search_with_data(key)
4957 .filter(|sf| sf.found)
4958 .and_then(|sf| sf.data.as_ref().map(|d| d.len()))
4959 .unwrap_or(0)
4960 } else {
4961 0
4962 };
4963
4964 let deleted =
4965 Self::delete_recursive(&root, key, self.key_comparator.as_ref());
4966
4967 // Update the memory counter when an entry is removed.
4968 // IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage(-delta).
4969 if deleted && let Some(counter) = &self.memory_counter {
4970 let delta = (key.len() + data_len + BIN_ENTRY_OVERHEAD) as i64;
4971 counter.fetch_sub(delta, Ordering::Relaxed);
4972 }
4973
4974 deleted
4975 }
4976
4977 /// Recursive helper for `delete`: descend to the BIN that holds `key`
4978 /// and remove it.
4979 fn delete_recursive(
4980 node_arc: &Arc<RwLock<TreeNode>>,
4981 key: &[u8],
4982 key_comparator: Option<&KeyComparatorFn>,
4983 ) -> bool {
4984 // Latch coupling, mirroring `insert_recursive`. Without this,
4985 // delete has the same "BIN split out from under us" race: thread
4986 // A finds child_arc as the target BIN under parent.read(), drops
4987 // the lock, and another thread runs split_child(parent, …) that
4988 // moves the target key into the new sibling. A then takes
4989 // child_arc.write(), looks for the key in the (now left-half)
4990 // BIN, doesn't find it, and returns `false`. The caller treats
4991 // the `false` as "key was not present", but the key is actually
4992 // still in the tree (in the sibling). Subsequent operations
4993 // observe a stale record that should have been deleted —
4994 // semantically a lost delete.
4995 let parent_guard = node_arc.read();
4996 let is_bin = parent_guard.is_bin();
4997 let child_arc = if !is_bin {
4998 match &*parent_guard {
4999 TreeNode::Internal(n) => {
5000 // Find child slot with largest key <= search key
5001 let mut idx = 0usize;
5002 for (i, entry) in n.entries.iter().enumerate() {
5003 if i == 0 {
5004 idx = 0;
5005 } else {
5006 let ord = match key_comparator {
5007 Some(cmp) => cmp(entry.key.as_slice(), key),
5008 None => entry.key.as_slice().cmp(key),
5009 };
5010 if ord != std::cmp::Ordering::Greater {
5011 idx = i;
5012 } else {
5013 break;
5014 }
5015 }
5016 }
5017 n.get_child(idx)
5018 }
5019 _ => None,
5020 }
5021 } else {
5022 None
5023 };
5024
5025 if is_bin {
5026 // Drop the read lock before taking the write lock; the outer
5027 // call frame still holds the parent read lock so a concurrent
5028 // split_child cannot run on this BIN's parent until we unwind.
5029 drop(parent_guard);
5030 let mut g = node_arc.write();
5031 match &mut *g {
5032 TreeNode::Bottom(bin) => {
5033 if let Some(cmp) = key_comparator {
5034 bin.delete_cmp(key, cmp.as_ref())
5035 } else {
5036 // Entries store compressed (suffix) keys when key_prefix
5037 // is non-empty. Compress the search key before comparing.
5038 //
5039 // The caller is not required to ensure that `key`
5040 // shares this BIN's learned `key_prefix` — a stray
5041 // delete of a key that was never present (or that
5042 // sits under a different prefix) is legal and must
5043 // simply return `false`. Calling `compress_key`
5044 // unconditionally would `debug_assert!`-panic on
5045 // such inputs, so guard it the same way the cursor
5046 // path does.
5047 if !bin.key_prefix.is_empty()
5048 && !key.starts_with(bin.key_prefix.as_slice())
5049 {
5050 return false;
5051 }
5052 let suffix = bin.compress_key(key);
5053 match bin.key_binary_search(suffix.as_slice()) {
5054 Ok(idx) => {
5055 bin.entries.remove(idx);
5056 bin.keys.remove(idx); // T-2
5057 bin.lsn_rep.remove_shift(idx); // T-3
5058 // Mark dirty after any modification.
5059 bin.dirty = true;
5060 true
5061 }
5062 Err(_) => false,
5063 }
5064 }
5065 }
5066 _ => false,
5067 }
5068 } else {
5069 // Descend with parent_guard still held; the recursion will
5070 // hold its own read lock and drop ours after it returns.
5071 let r = match child_arc {
5072 Some(child) => {
5073 Self::delete_recursive(&child, key, key_comparator)
5074 }
5075 None => false,
5076 };
5077 drop(parent_guard);
5078 r
5079 }
5080 }
5081
5082 // ========================================================================
5083 // B-tree Merge / Compress
5084 // ========================================================================
5085
5086 /// Merge under-full sibling BIN pairs and remove empty subtrees.
5087 ///
5088 /// `INCompressor` / `Tree.compressInternal()` logic.
5089 ///
5090 /// merges two adjacent siblings when their combined entry count is
5091 /// ≤ `max_entries_per_node` (the merge threshold equal to the node
5092 /// capacity). The left sibling's entries are prepended into the right
5093 /// sibling; the parent key slot pointing at the left sibling is then
5094 /// removed from the parent IN with `deleteEntry`. If the parent IN
5095 /// becomes empty after the removal the process repeats recursively up
5096 /// the tree.
5097 ///
5098 /// This implementation performs a single post-order walk so that each
5099 /// level is compressed after all its children have been compressed.
5100 pub fn compress(&self) {
5101 let root = match self.get_root() {
5102 Some(r) => r,
5103 None => return,
5104 };
5105 Self::compress_node(&root, self.max_entries_per_node);
5106 }
5107
5108 /// Recursive post-order compress helper.
5109 ///
5110 /// Visits children first (post-order), then scans adjacent child
5111 /// pairs in the current IN and merges them when the merge condition
5112 /// holds: `left.n_entries + right.n_entries <= max_entries`.
5113 ///
5114 /// After merging, the parent entry for the left sibling is deleted.
5115 /// The loop restarts after each merge so that newly under-full pairs
5116 /// created by previous merges are also considered.
5117 fn compress_node(node_arc: &Arc<RwLock<TreeNode>>, max_entries: usize) {
5118 // Collect child arcs to recurse without holding the node lock.
5119 let children: Vec<Arc<RwLock<TreeNode>>> = {
5120 let g = node_arc.read();
5121 match &*g {
5122 TreeNode::Internal(n) => n.resident_children(),
5123 // BINs are leaves; nothing to compress at this level.
5124 TreeNode::Bottom(_) => return,
5125 }
5126 };
5127
5128 // Post-order: recurse into every child before working on this level.
5129 for child in &children {
5130 Self::compress_node(child, max_entries);
5131 }
5132
5133 // Compress the current IN level: merge adjacent under-full children.
5134 // Repeat until a full pass produces no merges.
5135 loop {
5136 let n_entries = {
5137 let g = node_arc.read();
5138 g.get_n_entries()
5139 };
5140
5141 let mut merged_any = false;
5142
5143 // `i` is the index of the *left* candidate; right is at `i+1`.
5144 let mut i = 0usize;
5145 while i + 1 < n_entries {
5146 // Fetch left and right child arcs.
5147 let (left_arc, right_arc) = {
5148 let g = node_arc.read();
5149 match &*g {
5150 TreeNode::Internal(p) => {
5151 let l = p.get_child(i);
5152 let r = p.get_child(i + 1);
5153 match (l, r) {
5154 (Some(l), Some(r)) => (l, r),
5155 _ => {
5156 i += 1;
5157 continue;
5158 }
5159 }
5160 }
5161 TreeNode::Bottom(_) => return,
5162 }
5163 };
5164
5165 let left_n = { left_arc.read().get_n_entries() };
5166 let right_n = { right_arc.read().get_n_entries() };
5167
5168 // merge condition: combined count fits within one node.
5169 if left_n + right_n > max_entries {
5170 i += 1;
5171 continue;
5172 }
5173
5174 // Determine node kind from left child.
5175 let left_is_bin = { left_arc.read().is_bin() };
5176
5177 if left_is_bin {
5178 // BIN merge: decompress left entries to full keys, then
5179 // prepend into right BIN (also decompressed), and finally
5180 // recompute the merged BIN's prefix.
5181 // merge left into right, then
5182 // recalcKeyPrefix on the merged node.
5183 let left_full_entries: Vec<BinEntry> = {
5184 {
5185 let g = left_arc.read();
5186 match &*g {
5187 TreeNode::Bottom(b) => (0..b.entries.len())
5188 .map(|j| BinEntry {
5189 data: b.entries[j].data.clone(),
5190 known_deleted: b.entries[j]
5191 .known_deleted,
5192 dirty: b.entries[j].dirty,
5193 expiration_time: b.entries[j]
5194 .expiration_time,
5195 })
5196 .collect(),
5197 _ => {
5198 i += 1;
5199 continue;
5200 }
5201 }
5202 }
5203 };
5204 // T-3 / T-2: capture left's per-slot LSNs and FULL keys.
5205 let (left_full_lsns, left_full_keys): (
5206 Vec<Lsn>,
5207 Vec<Vec<u8>>,
5208 ) = {
5209 let g = left_arc.read();
5210 match &*g {
5211 TreeNode::Bottom(b) => (
5212 (0..b.entries.len())
5213 .map(|j| b.get_lsn(j))
5214 .collect(),
5215 (0..b.entries.len())
5216 .map(|j| {
5217 b.get_full_key(j).unwrap_or_default()
5218 })
5219 .collect(),
5220 ),
5221 _ => (Vec::new(), Vec::new()),
5222 }
5223 };
5224 {
5225 {
5226 let mut g = right_arc.write();
5227 match &mut *g {
5228 TreeNode::Bottom(rb) => {
5229 // Decompress right entries to full keys.
5230 let right_full: Vec<BinEntry> = (0..rb
5231 .entries
5232 .len())
5233 .map(|j| BinEntry {
5234 data: rb.entries[j].data.clone(),
5235 known_deleted: rb.entries[j]
5236 .known_deleted,
5237 dirty: rb.entries[j].dirty,
5238 expiration_time: rb.entries[j]
5239 .expiration_time,
5240 })
5241 .collect();
5242 // T-3 / T-2: right's per-slot LSNs + keys.
5243 let right_full_lsns: Vec<Lsn> =
5244 (0..rb.entries.len())
5245 .map(|j| rb.get_lsn(j))
5246 .collect();
5247 let right_full_keys: Vec<Vec<u8>> =
5248 (0..rb.entries.len())
5249 .map(|j| {
5250 rb.get_full_key(j)
5251 .unwrap_or_default()
5252 })
5253 .collect();
5254 // Left entries are all smaller; prepend.
5255 let mut combined = left_full_entries;
5256 combined.extend(right_full);
5257 let mut combined_lsns = left_full_lsns;
5258 combined_lsns.extend(right_full_lsns);
5259 let mut combined_keys = left_full_keys;
5260 combined_keys.extend(right_full_keys);
5261 // Reset prefix and assign full keys.
5262 rb.key_prefix = Vec::new();
5263 rb.entries = combined;
5264 // T-3: rebuild the merged LSN array.
5265 rb.lsn_rep =
5266 LsnRep::from_lsns(&combined_lsns);
5267 // T-2: rebuild the merged key rep (Default;
5268 // recompute below compresses + compacts).
5269 rb.keys = KeyRep::from_keys(combined_keys);
5270 // Recompute prefix on merged BIN.
5271 if rb.entries.len() >= 2 {
5272 rb.recompute_key_prefix();
5273 } else {
5274 rb.keys
5275 .compact(rb.compact_max_key_length);
5276 }
5277 rb.dirty = true;
5278 }
5279 _ => {
5280 i += 1;
5281 continue;
5282 }
5283 }
5284 }
5285 }
5286 // Clear the now-merged left BIN.
5287 {
5288 let mut g = left_arc.write();
5289 if let TreeNode::Bottom(lb) = &mut *g {
5290 lb.entries.clear();
5291 lb.lsn_rep = LsnRep::Empty; // T-3
5292 lb.keys = KeyRep::new(); // T-2
5293 lb.key_prefix = Vec::new();
5294 lb.dirty = true;
5295 }
5296 }
5297 } else {
5298 // Upper-IN merge: prepend left's InEntries into right.
5299 // T-4: capture left's resident children alongside its
5300 // entries so they travel into the merged right IN.
5301 let (left_in_entries, left_children): (
5302 Vec<InEntry>,
5303 Vec<Option<ChildArc>>,
5304 ) = {
5305 let g = left_arc.read();
5306 match &*g {
5307 TreeNode::Internal(n) => {
5308 let children = (0..n.entries.len())
5309 .map(|j| n.get_child(j))
5310 .collect();
5311 (n.entries.clone(), children)
5312 }
5313 _ => {
5314 i += 1;
5315 continue;
5316 }
5317 }
5318 };
5319 // T-3: capture left's per-slot LSNs.
5320 let left_in_lsns: Vec<Lsn> = {
5321 let g = left_arc.read();
5322 match &*g {
5323 TreeNode::Internal(n) => (0..n.entries.len())
5324 .map(|j| n.get_lsn(j))
5325 .collect(),
5326 _ => Vec::new(),
5327 }
5328 };
5329 let n_left = left_in_entries.len();
5330 {
5331 {
5332 let mut g = right_arc.write();
5333 match &mut *g {
5334 TreeNode::Internal(rn) => {
5335 // Snapshot right's existing children, then
5336 // rebuild the merged entry + target arrays
5337 // (left half first, then right half).
5338 let right_children: Vec<Option<ChildArc>> =
5339 (0..rn.entries.len())
5340 .map(|j| rn.get_child(j))
5341 .collect();
5342 // T-3: snapshot right's LSNs too.
5343 let right_in_lsns: Vec<Lsn> =
5344 (0..rn.entries.len())
5345 .map(|j| rn.get_lsn(j))
5346 .collect();
5347 let mut combined = left_in_entries.clone();
5348 combined.append(&mut rn.entries);
5349 rn.entries = combined;
5350 // T-3: rebuild the merged LSN array.
5351 let mut combined_lsns =
5352 left_in_lsns.clone();
5353 combined_lsns.extend(right_in_lsns);
5354 rn.lsn_rep =
5355 LsnRep::from_lsns(&combined_lsns);
5356 rn.targets = TargetRep::None;
5357 for (j, c) in
5358 left_children.iter().enumerate()
5359 {
5360 if let Some(child) = c {
5361 rn.set_child(
5362 j,
5363 Some(child.clone()),
5364 );
5365 }
5366 }
5367 for (j, c) in
5368 right_children.into_iter().enumerate()
5369 {
5370 if c.is_some() {
5371 rn.set_child(n_left + j, c);
5372 }
5373 }
5374 rn.dirty = true;
5375 }
5376 _ => {
5377 i += 1;
5378 continue;
5379 }
5380 }
5381 }
5382 }
5383 // Update parent pointers for moved children.
5384 for child in left_children.into_iter().flatten() {
5385 let mut cg = child.write();
5386 cg.set_parent(Some(Arc::downgrade(&right_arc)));
5387 }
5388 // Clear the now-merged left IN.
5389 {
5390 let mut g = left_arc.write();
5391 if let TreeNode::Internal(ln) = &mut *g {
5392 ln.entries.clear();
5393 ln.lsn_rep = LsnRep::Empty; // T-3
5394 ln.targets = TargetRep::None;
5395 ln.dirty = true;
5396 }
5397 }
5398 }
5399
5400 // Remove the right sibling's parent slot and update
5401 // the left slot to point at the merged right child.
5402 //
5403 // We keep the LEFT slot's key (which is the correct minimum for
5404 // the merged BIN's range) and remove the RIGHT slot (i+1).
5405 // This avoids having to update the parent key when i == 0.
5406 {
5407 {
5408 let mut g = node_arc.write();
5409 match &mut *g {
5410 TreeNode::Internal(p) => {
5411 // Update left slot (i) to point at right_arc
5412 // (which now contains the merged entries).
5413 if i < p.entries.len() {
5414 p.set_child(i, Some(right_arc.clone()));
5415 }
5416 // Remove right slot (i+1) — it is now redundant.
5417 // T-4: remove_entry shifts the child array too.
5418 if i + 1 < p.entries.len() {
5419 p.remove_entry(i + 1);
5420 }
5421 p.dirty = true;
5422 }
5423 TreeNode::Bottom(_) => return,
5424 }
5425 }
5426 }
5427
5428 merged_any = true;
5429 // Advance i to check the merged BIN against its new right
5430 // sibling (the old slot i+2 is now at i+1).
5431 i += 1;
5432 let updated_n = { node_arc.read().get_n_entries() };
5433 if i + 1 >= updated_n {
5434 break;
5435 }
5436 }
5437
5438 if !merged_any {
5439 break;
5440 }
5441 }
5442 }
5443
5444 // ========================================================================
5445 // BIN slot compression
5446 // ========================================================================
5447
5448 /// Compress deleted slots from a BIN node, then prune it from its parent
5449 /// IN when it becomes empty.
5450 ///
5451 /// (the in-place slot-removal
5452 /// path, NOT the sibling-merge path handled by `compress()`).
5453 ///
5454 /// # Algorithm
5455 ///
5456 /// 1. If the BIN is a delta, skip — deltas cannot be compressed.
5457 /// 2. Remove all slots where `entry.known_deleted` is true. This mirrors
5458 /// `bin.compress(!bin.shouldLogDelta(), localTracker)`.
5459 /// 3. If the BIN is now empty, remove it from its parent IN. This mirrors
5460 /// `pruneBIN(db, binRef, idKey)` → `tree.delete(idKey)`.
5461 ///
5462 /// # Arguments
5463 ///
5464 /// * `bin_arc` — the BIN to compress (must be a `TreeNode::Bottom`).
5465 ///
5466 /// # Returns
5467 ///
5468 /// `true` if compression made progress (slots were removed or the BIN was
5469 /// pruned), `false` if the BIN was skipped (delta, no cursors issue, etc.).
5470 pub fn compress_bin(&self, bin_arc: &Arc<RwLock<TreeNode>>) -> bool {
5471 self.compress_bin_with_lock_check(bin_arc, None)
5472 }
5473
5474 /// Like [`compress_bin`](Self::compress_bin), but consults a caller-supplied
5475 /// `is_locked` predicate before physically removing each `known_deleted`
5476 /// slot. If `is_locked(slot_lsn)` returns `true`, the slot is SKIPPED
5477 /// (left for a later compression pass after the locking txn resolves).
5478 ///
5479 /// This is the faithful port of JE `BIN.compress` (BIN.java:1141-1172):
5480 ///
5481 /// > We have to be able to lock the LN before we can compress the entry.
5482 /// > If we can't, then skip over it. ... it is more efficient to call
5483 /// > `isLockUncontended` than to actually lock the LN, since we would
5484 /// > release the lock immediately.
5485 ///
5486 /// ```text
5487 /// if (lsn != DbLsn.NULL_LSN &&
5488 /// !lockManager.isLockUncontended(lsn)) {
5489 /// anyLocked = true;
5490 /// continue;
5491 /// }
5492 /// ```
5493 ///
5494 /// JE's `isLockUncontended(lsn)` (LockManager.java:692) returns
5495 /// `nWaiters() == 0 && nOwners() == 0`. Our `is_locked(lsn)` is its
5496 /// inverse: the dbi layer supplies a closure over the `LockManager` that
5497 /// returns `true` iff the slot's LSN has any owner or waiter
5498 /// (`LockManager::get_lock_info(lsn) != (0, 0)`). A `NULL_LSN` slot is
5499 /// always discardable without locking (JE: "Can discard a NULL_LSN entry
5500 /// without locking"), so we never invoke the predicate for it.
5501 ///
5502 /// # Layering (noxu-tree -/-> noxu-txn)
5503 ///
5504 /// The predicate is a `&dyn Fn(u64) -> bool`, NOT a `LockManager`
5505 /// reference, so noxu-tree never depends on noxu-txn. The lock knowledge
5506 /// lives entirely in the dbi-supplied closure.
5507 ///
5508 /// # Lock ordering (no deadlock)
5509 ///
5510 /// `is_locked` is invoked while this method holds the **BIN write latch**.
5511 /// The dbi closure calls `LockManager::get_lock_info`, which takes a
5512 /// lock-table *shard* mutex for a single, non-blocking critical section
5513 /// and releases it before returning — it never waits and never re-enters
5514 /// the tree. The LockManager has no edge back into a BIN latch (lock
5515 /// acquisition descends the tree from the dbi/cursor layer, never the
5516 /// reverse). The only ordering is therefore BIN-latch -> shard-mutex,
5517 /// which is acyclic; no lock cycle exists, so the predicate cannot
5518 /// deadlock against the latch.
5519 ///
5520 /// When `is_locked` is `None` (recovery, BIN-delta replay, unit tests with
5521 /// no lock manager) behavior is identical to the historical
5522 /// `compress_bin`: every `known_deleted` slot is removed.
5523 pub fn compress_bin_with_lock_check(
5524 &self,
5525 bin_arc: &Arc<RwLock<TreeNode>>,
5526 is_locked: Option<&dyn Fn(u64) -> bool>,
5527 ) -> bool {
5528 // ---- Step 1: collect metadata without holding the write lock ----
5529 let (is_delta, n_entries, id_key) = {
5530 {
5531 let g = bin_arc.read();
5532 match &*g {
5533 TreeNode::Bottom(b) => {
5534 // Identifier key = first full key in the BIN
5535 // (the: bin.getIdentifierKey()).
5536 let id_key = b.get_full_key(0);
5537 (b.is_delta, b.entries.len(), id_key)
5538 }
5539 _ => return false, // not a BIN
5540 }
5541 }
5542 };
5543
5544 // If (bin.isBINDelta()) return; — deltas cannot be compressed.
5545 if is_delta {
5546 return false;
5547 }
5548
5549 // ---- Step 2: remove known-deleted slots) ----
5550 // We compress dirty slots too (compress_dirty_slots = true) because
5551 // we are not writing a BIN-delta here.
5552 let removed_any = {
5553 {
5554 let mut g = bin_arc.write();
5555 match &mut *g {
5556 TreeNode::Bottom(b) => {
5557 let before = b.entries.len();
5558 // BIN.compress(): walk backwards to remove
5559 // deleted slots without index confusion.
5560 //
5561 // IC-3 — JE `BIN.compress` (BIN.java:1141-1172) does
5562 // NOT compress a slot it cannot lock: "We have to be
5563 // able to lock the LN before we can compress the
5564 // entry. If we can't, then skip over it." JE calls
5565 // `lockManager.isLockUncontended(lsn)` and, on a
5566 // contended slot, does `anyLocked = true; continue;`.
5567 // We mirror that here via the optional `is_locked`
5568 // predicate (supplied by the dbi layer, closing over
5569 // the LockManager — see
5570 // `compress_bin_with_lock_check`). This removes the
5571 // previously fragile implicit invariant ("no code path
5572 // ever tombstones a slot before its txn commits"):
5573 // even if a future write path leaves an uncommitted,
5574 // write-locked `known_deleted` tombstone in a BinStub,
5575 // the predicate keeps the compressor from physically
5576 // removing a slot a live txn still references.
5577 //
5578 // When `is_locked` is `None` (recovery / BIN-delta
5579 // replay / lock-manager-less tests) behavior is
5580 // unchanged: every `known_deleted` slot is removed,
5581 // matching the historical safe-by-invariant path.
5582 let mut j = b.entries.len();
5583 while j > 0 {
5584 j -= 1;
5585 if b.entries[j].known_deleted {
5586 // IC-3 lock check (JE BIN.compress). A
5587 // NULL_LSN slot is always discardable without
5588 // locking (JE: "Can discard a NULL_LSN entry
5589 // without locking"), so we only consult the
5590 // predicate for a non-null LSN.
5591 if let Some(is_locked) = is_locked {
5592 let slot_lsn = b.get_lsn(j);
5593 if !slot_lsn.is_null()
5594 && is_locked(slot_lsn.as_u64())
5595 {
5596 // Slot still write-locked by an
5597 // in-flight txn — leave it for a later
5598 // pass (JE: anyLocked = true; continue).
5599 continue;
5600 }
5601 }
5602 // JE `IN.deleteEntry` (IN.java:3466): removing a
5603 // DIRTY slot must prohibit the next delta — a
5604 // delta only carries dirty slots, so the removal
5605 // would otherwise be silently lost. Force a
5606 // full BIN on the next log.
5607 if b.entries[j].dirty {
5608 b.prohibit_next_delta = true;
5609 }
5610 b.entries.remove(j);
5611 b.keys.remove(j); // T-2
5612 b.lsn_rep.remove_shift(j); // T-3
5613 b.dirty = true;
5614 }
5615 }
5616 // Recompute prefix after slot removal, since the
5617 // remaining keys may share a longer common prefix.
5618 // After compress(), call recalcKeyPrefix().
5619 if b.entries.len() >= 2 {
5620 b.recompute_key_prefix();
5621 } else if b.entries.len() < 2 {
5622 b.key_prefix = Vec::new();
5623 }
5624 b.entries.len() < before
5625 }
5626 _ => false,
5627 }
5628 }
5629 };
5630
5631 // ---- Step 3: prune empty BIN from parent ----
5632 // If (empty) pruneBIN(db, binRef, idKey) → tree.delete(idKey).
5633 // We only prune when the BIN is actually empty after compression.
5634 let now_empty = { bin_arc.read().get_n_entries() == 0 };
5635
5636 if now_empty {
5637 // pruneBIN re-descends to the SPECIFIC empty BIN and removes its
5638 // parent-IN slot ONLY IF the BIN is still empty (and has no
5639 // cursors and is not a delta) UNDER THE PARENT LATCH.
5640 //
5641 // We must NOT use `self.delete(&id_key)` here (IC-1): that
5642 // re-descends by key and removes whatever live entry now matches
5643 // `id_key`. Between reading `now_empty` (a fresh read lock taken
5644 // after the compression write lock was dropped) and acting on it,
5645 // a concurrent insert can repopulate this BIN; `self.delete` would
5646 // then drop a LIVE entry — tree corruption / lost write.
5647 //
5648 // JE `INCompressor.pruneBIN` (INCompressor.java ~line 502-510)
5649 // calls `tree.delete(idKey)`, and JE `Tree.delete` /
5650 // `searchDeletableSubTree` (Tree.java ~line 755-800) re-validates
5651 // `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort) and
5652 // `bin.nCursors() > 0` → CURSORS_EXIST (abort) while holding the
5653 // parent (branch) latch. `prune_empty_bin` reproduces exactly
5654 // that re-validation. See `prune_empty_bin` below.
5655 //
5656 // Note: we only attempt the prune if n_entries was > 0 before
5657 // compression (an already-empty BIN we never populated is left
5658 // alone, matching the pre-existing guard).
5659 if let Some(key) = id_key
5660 && n_entries > 0
5661 {
5662 self.prune_empty_bin(&key);
5663 }
5664 return true;
5665 }
5666
5667 removed_any
5668 }
5669
5670 /// Re-descend to the leaf BIN that should contain `id_key` and remove its
5671 /// parent-IN child slot ONLY IF the BIN is still safe to prune.
5672 ///
5673 /// This is the faithful port of JE `Tree.delete(idKey)` /
5674 /// `Tree.searchDeletableSubTree` (Tree.java ~line 755-800) as invoked by
5675 /// `INCompressor.pruneBIN` (INCompressor.java ~line 502-510). JE takes the
5676 /// branch-parent latch, re-descends to the specific empty BIN, and aborts
5677 /// the prune (removing NOTHING) if any of the following changed since the
5678 /// compressor observed the BIN as empty:
5679 ///
5680 /// * `bin.getNEntries() != 0` → `NodeNotEmptyException` (a concurrent
5681 /// insert repopulated the BIN — IC-1: we must NOT delete a live entry).
5682 /// * `bin.isBINDelta()` → `unexpectedState` (deltas are never empty).
5683 /// * `bin.nCursors() > 0` → `CursorsExistException` (a cursor is parked
5684 /// on the empty BIN; requeue rather than orphan the cursor).
5685 ///
5686 /// The re-check and the slot removal both happen while holding the
5687 /// **parent IN write latch**. Holding the parent write latch blocks every
5688 /// descender (insert / delete take `parent.read()` hand-over-hand), so a
5689 /// concurrent insert cannot reach the BIN between our re-check and the
5690 /// slot removal — the TOCTOU window IC-1 describes is closed.
5691 ///
5692 /// Returns `true` iff a parent-IN slot was removed, `false` otherwise
5693 /// (BIN repopulated, has a cursor, is a delta, vanished, or is the root —
5694 /// in every `false` case NOTHING is removed).
5695 pub fn prune_empty_bin(&self, id_key: &[u8]) -> bool {
5696 let root = match self.get_root() {
5697 Some(r) => r,
5698 None => return false,
5699 };
5700
5701 // If the root itself is the BIN (single-BIN tree) there is no parent
5702 // IN to remove a slot from. JE's searchDeletableSubTree returns null
5703 // ("the entire tree is empty") and keeps the root BIN; we do the same.
5704 if root.read().is_bin() {
5705 return false;
5706 }
5707
5708 // Descend by id_key tracking the IN that is the *parent of the leaf
5709 // BIN* and the child index within it. Hand-over-hand read coupling
5710 // keeps the descent consistent with concurrent splits, exactly like
5711 // `get_parent_bin_for_child_ln`.
5712 let (parent_arc, child_index) = {
5713 let mut parent_arc: Arc<RwLock<TreeNode>> = root.clone();
5714 let mut guard: parking_lot::ArcRwLockReadGuard<
5715 parking_lot::RawRwLock,
5716 TreeNode,
5717 > = root.read_arc();
5718 loop {
5719 let (next_arc, idx) = match &*guard {
5720 TreeNode::Internal(n) => {
5721 if n.entries.is_empty() {
5722 return false;
5723 }
5724 let idx = self.upper_in_floor_index(&n.entries, id_key);
5725 match n.get_child(idx) {
5726 Some(c) => (c, idx),
5727 None => return false,
5728 }
5729 }
5730 TreeNode::Bottom(_) => {
5731 unreachable!("is_bin checked before / below")
5732 }
5733 };
5734 // Is the next node the leaf BIN? If so, `guard`'s node is the
5735 // parent IN we want and `idx` is the child slot.
5736 if next_arc.read().is_bin() {
5737 drop(guard);
5738 break (parent_arc, idx);
5739 }
5740 let next_guard = next_arc.read_arc();
5741 drop(guard);
5742 parent_arc = next_arc;
5743 guard = next_guard;
5744 }
5745 };
5746
5747 // ---- Re-validate and remove the slot UNDER THE PARENT WRITE LATCH ----
5748 // Holding parent.write() excludes all descenders (they need
5749 // parent.read()), so the BIN cannot be repopulated between the
5750 // re-check and the slot removal.
5751 let mut parent_guard = parent_arc.write();
5752 let pruned_bin_id;
5753 let removed_key_len = match &mut *parent_guard {
5754 TreeNode::Internal(p) => {
5755 let child = match p.get_child(child_index) {
5756 Some(c) => c,
5757 None => return false, // slot already vacated / invalid
5758 };
5759 // Re-validate the child BIN under the parent latch.
5760 {
5761 let cg = child.read();
5762 match &*cg {
5763 TreeNode::Bottom(b) => {
5764 // JE: bin.getNEntries() != 0 → NODE_NOT_EMPTY (abort).
5765 if !b.entries.is_empty() {
5766 return false;
5767 }
5768 // JE: bin.isBINDelta() → unexpectedState (abort).
5769 if b.is_delta {
5770 return false;
5771 }
5772 // JE: bin.nCursors() > 0 → CURSORS_EXIST (abort).
5773 if b.cursor_count > 0 {
5774 return false;
5775 }
5776 pruned_bin_id = b.node_id;
5777 }
5778 // A concurrent split could in principle have replaced
5779 // the child with an IN; never prune in that case.
5780 TreeNode::Internal(_) => return false,
5781 }
5782 }
5783 // Safe to prune: remove the BIN's slot from the parent IN.
5784 // Mirrors the parent-slot removal `Tree.delete` performs for
5785 // an empty BIN (Tree.java deleteEntry under the branch latch).
5786 // T-4: remove_entry shifts the node-level child array too.
5787 let removed = p.remove_entry(child_index);
5788 p.dirty = true;
5789 removed.key.len()
5790 }
5791 TreeNode::Bottom(_) => return false,
5792 };
5793 drop(parent_guard);
5794
5795 // JE: removing the BIN slot detaches the BIN from the tree; the
5796 // evictor must drop it from its LRU lists (Evictor.remove).
5797 self.note_removed(pruned_bin_id);
5798
5799 // Preserve the memory-counter bookkeeping that `self.delete` performed
5800 // (IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage).
5801 // The pruned slot's key plus the fixed per-entry overhead matches the
5802 // `delete` accounting (key.len() + BIN_ENTRY_OVERHEAD).
5803 if let Some(counter) = &self.memory_counter {
5804 let delta = (removed_key_len + BIN_ENTRY_OVERHEAD) as i64;
5805 counter.fetch_sub(delta, Ordering::Relaxed);
5806 }
5807
5808 true
5809 }
5810
5811 /// Detach the resident child node `node_id` from its parent IN, dropping
5812 /// the strong `Arc` so the node is actually freed from memory, and return
5813 /// the heap bytes reclaimed (0 if not found / not detachable).
5814 ///
5815 /// This is the faithful port of JE `IN.detachNode(idx, updateLsn, newLsn)`
5816 /// (IN.java ~4019) as called from `Evictor.evict` (Evictor.java ~3035):
5817 /// `evict` measures `target.getBudgetedMemorySize()` and then
5818 /// `parent.detachNode(index, ...)` does `setTarget(idx, null)` to drop the
5819 /// child reference and `getInMemoryINs().remove(child)` to drop it from
5820 /// the INList.
5821 ///
5822 /// EV-13: before this method existed, the evictor credited
5823 /// `node_size_fn(node_id)` bytes back to the budget and removed the node
5824 /// from the LRU lists, but the parent's `InEntry.child` still held a
5825 /// strong `Arc` — so the node was never dropped from the heap. The budget
5826 /// over-credited (claimed bytes freed that were not), `cache_usage`
5827 /// drifted below reality, and the evictor under-fired. Detaching here
5828 /// drops the `Arc` for real and credits exactly the measured size.
5829 ///
5830 /// The detach happens **under the parent IN write latch** (JE detaches
5831 /// under the parent's latch), so no concurrent descender can re-cache the
5832 /// child between measurement and detach. The slot (key + LSN) is kept —
5833 /// only the in-memory `child` target is cleared — matching JE's
5834 /// `setTarget(idx, null)` which leaves the `ChildReference` LSN intact so
5835 /// the node can be re-fetched from the log later.
5836 ///
5837 /// Returns `0` if the node is not a resident child of any IN (e.g. it is
5838 /// the root, already detached, or was pinned and could not be latched).
5839 pub fn detach_node_by_id(&self, node_id: u64) -> u64 {
5840 let root = match self.get_root() {
5841 Some(r) => r,
5842 None => return 0,
5843 };
5844
5845 // The root has no parent IN to detach from (JE evicts the root via a
5846 // separate evictRoot path; we keep the root resident here).
5847 let root_id = {
5848 let g = root.read();
5849 match &*g {
5850 TreeNode::Internal(n) => n.node_id,
5851 TreeNode::Bottom(b) => b.node_id,
5852 }
5853 };
5854 if root_id == node_id {
5855 return 0;
5856 }
5857
5858 // Locate the parent IN and the child slot index.
5859 let (parent_arc, child_index) =
5860 match Self::find_parent_of_node_id(&root, node_id) {
5861 Some(p) => p,
5862 None => return 0,
5863 };
5864
5865 // ---- Measure + detach UNDER THE PARENT WRITE LATCH ----
5866 // Holding parent.write() excludes all descenders (they take
5867 // parent.read() hand-over-hand), so the child cannot be re-cached or
5868 // re-pinned between the measurement and the detach. Mirrors JE
5869 // detachNode running under the parent latch held by Evictor.evict.
5870 let mut parent_guard = parent_arc.write();
5871 let TreeNode::Internal(p) = &mut *parent_guard else {
5872 return 0; // parent is not an IN (concurrent restructure)
5873 };
5874 if child_index >= p.entries.len() {
5875 return 0;
5876 }
5877 // T-4: detach the cached child via the node-level INTargetRep, leaving
5878 // the slot's key/LSN intact for re-fetch (JE IN.setTarget(idx, null)).
5879 let child = match p.take_child(child_index) {
5880 Some(c) => c, // child Arc removed from the slot
5881 None => return 0, // already detached
5882 };
5883
5884 // Measure the child's real heap footprint while we still hold it.
5885 // JE: long evictedBytes = target.getBudgetedMemorySize().
5886 let freed = child.read().budgeted_memory_size();
5887
5888 // EV-14 re-fetch correctness: the parent slot LSN must point at the
5889 // child's CURRENT on-disk version so `child_at_or_fetch` re-reads the
5890 // right bytes (JE `IN.updateEntry(idx, newLsn)` is called whenever a
5891 // child is logged; the parent slot LSN tracks the child's LSN). The
5892 // evictor only fully evicts/detaches a CLEAN BIN (it logs+clears dirty
5893 // BINs via flush_dirty_node_to_log first, which sets `last_full_lsn`),
5894 // so the child's authoritative LSN is its `last_full_lsn`. Stamp it
5895 // into the parent slot before dropping the child; if it is null (the
5896 // child was never logged) leave the existing slot LSN intact rather
5897 // than writing a null — a never-logged clean child cannot occur on
5898 // the evict path, but be conservative.
5899 let child_full_lsn = match &*child.read() {
5900 TreeNode::Bottom(b) => b.last_full_lsn,
5901 TreeNode::Internal(_) => NULL_LSN,
5902 };
5903 if child_full_lsn != NULL_LSN {
5904 p.set_lsn(child_index, child_full_lsn);
5905 }
5906
5907 // Mark the parent dirty: the slot's in-memory target changed (JE
5908 // detachNode sets dirty when updateLsn; we conservatively mark dirty
5909 // so the parent is re-logged with the now-non-resident slot).
5910 p.dirty = true;
5911
5912 // Drop the strong Arc explicitly so the node is freed now (the slot's
5913 // `child` is already None). If any other resident path still held a
5914 // strong reference this would not free — but the tree is the sole
5915 // strong owner of a cached child, so this drops the last strong ref.
5916 drop(parent_guard);
5917 drop(child);
5918
5919 // JE: getInMemoryINs().remove(child) — drop it from the evictor LRU.
5920 self.note_removed(node_id);
5921
5922 // NOTE: the live tree-memory counter (`memory_counter`) is the SAME
5923 // `Arc<AtomicI64>` the evictor's Arbiter uses as `cache_usage`. The
5924 // evictor decrements it once via `Arbiter::release_memory(bytes)` for
5925 // the full eviction batch, so detach must NOT decrement here too —
5926 // that would double-credit and drive `cache_usage` below reality
5927 // (the very drift EV-13 fixes, in the other direction). We only
5928 // measure-and-free; the caller does the single counter update.
5929 freed
5930 }
5931
5932 /// Evict the root IN of this tree (EV-14).
5933 ///
5934 /// Faithful port of JE `Evictor.evictRoot` (Evictor.java:3050-3110) plus
5935 /// the `RootEvictor.doWork` + `Tree.withRootLatchedExclusive` framing
5936 /// (Evictor.java:2529-2576, Tree.java:508-517). Unlike a normal IN, the
5937 /// root has no parent slot to detach from; instead the *tree's* root
5938 /// reference is the equivalent of the `RootChildReference`, so eviction:
5939 ///
5940 /// 1. Latches the root reference exclusively (`rootLatch.acquireExclusive`
5941 /// via `withRootLatchedExclusive`).
5942 /// 2. Re-checks that the root is still resident and still evictable
5943 /// (no resident children, no pinned BIN — JE `RootEvictor.doWork`
5944 /// re-latches and re-checks `rootIN == target && rootIN.isRoot()`).
5945 /// 3. If the root is dirty, LOGS it first so the on-disk version is
5946 /// current and updates `root_log_lsn` to the new LSN (JE
5947 /// `evictRoot`: `long newLsn = target.log(...); rootRef.setLsn(newLsn)`).
5948 /// 4. Clears the in-memory root (`rootRef.clearTarget()` — JE leaves the
5949 /// `ChildReference` LSN intact; here `root_log_lsn` is that LSN) and
5950 /// `note_removed`s it from the evictor LRU (JE `inList.remove(target)`).
5951 ///
5952 /// On the next access `fetch_root_from_log` re-materializes the root from
5953 /// `root_log_lsn` (JE `Tree.getRootINRootAlreadyLatched` →
5954 /// `root.fetchTarget`).
5955 ///
5956 /// # Conditions (eviction is REFUSED, returning `None`, when)
5957 ///
5958 /// * there is no log manager wired (the root could never be re-fetched),
5959 /// * the tree has no resident root (already evicted),
5960 /// * the root has any resident child (JE only evicts a childless root —
5961 /// the `hasCachedChildren` skip in `processTarget`; a root with cached
5962 /// children would orphan them, the EV-6 invariant),
5963 /// * the root is a BIN pinned by a cursor (`cursor_count > 0`),
5964 /// * the root is dirty but we have no clean persisted version AND logging
5965 /// it fails, or
5966 /// * the root is clean but `root_log_lsn` is null (never logged — cannot
5967 /// be re-fetched; happens only for a brand-new unlogged tree).
5968 ///
5969 /// Returns `Some((freed_bytes, was_dirty))` on success, where `freed_bytes`
5970 /// is the root's measured heap footprint (JE
5971 /// `target.getBudgetedMemorySize()`) and `was_dirty` reports whether the
5972 /// root had to be logged (JE `rootEvictor.flushed`, which drives
5973 /// `nDirtyNodesEvicted` and `modifyDbRoot`).
5974 pub fn evict_root(&self, db_id: u64) -> Option<(u64, bool)> {
5975 // A root with no re-fetch path must never be made non-resident.
5976 self.log_manager.as_ref()?;
5977
5978 // JE `Tree.withRootLatchedExclusive(rootEvictor)`: hold the root latch
5979 // exclusively across the whole evict so no descender or splitter can
5980 // observe/install a half-evicted root. Acquiring `self.root.write()`
5981 // is the Noxu equivalent (it is the lock guarding the root pointer).
5982 let mut root_slot = self.root.write();
5983 let root_arc = root_slot.as_ref()?.clone();
5984
5985 // JE `RootEvictor.doWork`: re-latch the target and re-check the
5986 // conditions. We hold the node guard for the duration.
5987 let node_guard = root_arc.write();
5988
5989 // EV-6 / JE `processTarget` hasCachedChildren skip: a root with any
5990 // resident child must NOT be evicted (it would orphan the child).
5991 // EV-14 only evicts an *idle* root whose children are already
5992 // non-resident (or which is itself a leaf BIN).
5993 let (node_id, was_dirty, freed) = match &*node_guard {
5994 TreeNode::Internal(n) => {
5995 if !n.resident_children().is_empty() {
5996 return None; // has cached children — keep resident
5997 }
5998 (n.node_id, n.dirty, node_guard.budgeted_memory_size())
5999 }
6000 TreeNode::Bottom(b) => {
6001 if b.cursor_count > 0 {
6002 return None; // pinned by a cursor — keep resident
6003 }
6004 (
6005 b.node_id,
6006 b.dirty || b.dirty_count() > 0,
6007 node_guard.budgeted_memory_size(),
6008 )
6009 }
6010 };
6011
6012 // If dirty, log the root first so the on-disk version is current,
6013 // then record the new LSN as the root's re-fetch point (JE
6014 // `evictRoot`: target.log(...) + rootRef.setLsn(newLsn)).
6015 if was_dirty {
6016 let lm = self.log_manager.as_ref()?; // checked above; re-borrow
6017 let node_bytes = node_guard.write_to_bytes();
6018 let is_bin = node_guard.is_bin();
6019 let entry = noxu_log::entry::in_log_entry::InLogEntry::new(
6020 db_id, NULL_LSN, // prev_full_lsn
6021 NULL_LSN, // prev_delta_lsn
6022 node_bytes,
6023 );
6024 let mut buf = bytes::BytesMut::with_capacity(entry.log_size());
6025 entry.write_to_log(&mut buf);
6026 let entry_type = if is_bin {
6027 noxu_log::LogEntryType::BIN
6028 } else {
6029 noxu_log::LogEntryType::IN
6030 };
6031 // flush_required = true so the root's bytes are durable before we
6032 // drop the in-memory copy (JE logs synchronously in evictRoot).
6033 let new_lsn = match lm.log(
6034 entry_type,
6035 &buf,
6036 noxu_log::Provisional::No,
6037 true, // flush_required
6038 false, // fsync at next checkpoint
6039 ) {
6040 Ok(l) => l,
6041 Err(_) => return None, // could not log — keep the root resident
6042 };
6043 *self.root_log_lsn.write() = new_lsn;
6044 } else {
6045 // Clean root: it must already be re-fetchable. If it was never
6046 // logged (root_log_lsn null) we cannot evict it safely.
6047 if *self.root_log_lsn.read() == NULL_LSN {
6048 return None;
6049 }
6050 }
6051
6052 // JE `rootRef.clearTarget()` + `inList.remove(target)`: drop the
6053 // in-memory root and remove it from the evictor LRU. The root_log_lsn
6054 // is the surviving `ChildReference` LSN used to re-fetch it.
6055 drop(node_guard);
6056 *root_slot = None;
6057 drop(root_slot);
6058 self.note_removed(node_id);
6059
6060 Some((freed, was_dirty))
6061 }
6062
6063 /// Re-materialize an evicted root IN from its persisted `root_log_lsn`
6064 /// (EV-14, piece B).
6065 /// Faithful to JE `Tree.getRootINRootAlreadyLatched` (Tree.java:477-516)
6066 /// which calls `root.fetchTarget(database, null)` when the in-memory
6067 /// target is null. Idempotent and cheap when the root is already
6068 /// resident: returns the resident root without touching the log.
6069 ///
6070 /// Returns `None` only when the tree is genuinely empty (no resident root
6071 /// AND `root_log_lsn` is null) or when the re-fetch fails (no log manager,
6072 /// log read error, deserialize failure) — callers then see an empty tree,
6073 /// never wrong data.
6074 pub fn fetch_root_from_log(&self) -> Option<Arc<RwLock<TreeNode>>> {
6075 // Fast path: root already resident.
6076 if let Some(r) = self.root.read().clone() {
6077 return Some(r);
6078 }
6079 // Take the write lock and re-check (another thread may have re-fetched
6080 // it while we waited — JE upgrades the root latch the same way).
6081 let mut root_slot = self.root.write();
6082 if let Some(r) = root_slot.as_ref() {
6083 return Some(r.clone());
6084 }
6085 let log_lsn = *self.root_log_lsn.read();
6086 let node = self.fetch_node_from_log(log_lsn)?;
6087 let node_id = node.node_id();
6088 let arc = Arc::new(RwLock::new(node));
6089 *root_slot = Some(arc.clone());
6090 drop(root_slot);
6091 // JE: a fetched IN is added back to the INList (Evictor LRU).
6092 self.note_added(node_id);
6093 Some(arc)
6094 }
6095
6096 /// Return the resident child Arc for slot `idx` of `parent_arc`, fetching
6097 /// it from its slot LSN and installing it if it is not resident (EV-14 /
6098 /// EV-13 re-fetch on descent).
6099 ///
6100 /// Faithful to JE `ChildReference.fetchTarget` (and `IN.fetchTarget`):
6101 /// when a slot's in-memory target is null but its LSN is valid, the node
6102 /// is read back from the log and cached in the slot. Installing the
6103 /// fetched child requires the parent EX-latch, so this takes the parent
6104 /// write lock; the fast path (child already resident) takes only a read
6105 /// lock.
6106 ///
6107 /// Returns `None` only when the slot index is out of range, the slot has
6108 /// no valid LSN, or the log read/deserialize fails — callers then treat
6109 /// the descent as terminating in an empty subtree, never wrong data.
6110 fn child_at_or_fetch(
6111 &self,
6112 parent_arc: &Arc<RwLock<TreeNode>>,
6113 idx: usize,
6114 ) -> Option<ChildArc> {
6115 // Fast path: child already cached (read lock only).
6116 {
6117 let g = parent_arc.read();
6118 if let TreeNode::Internal(n) = &*g {
6119 if let Some(c) = n.get_child(idx) {
6120 return Some(c);
6121 }
6122 } else {
6123 return None; // BINs have no IN children
6124 }
6125 }
6126 // Slow path: fetch the child from its slot LSN under the parent
6127 // EX-latch (JE installs the fetched target under the IN latch).
6128 let mut g = parent_arc.write();
6129 let TreeNode::Internal(n) = &mut *g else {
6130 return None;
6131 };
6132 // Re-check: another thread may have fetched it while we upgraded.
6133 if let Some(c) = n.get_child(idx) {
6134 return Some(c);
6135 }
6136 if idx >= n.entries.len() {
6137 return None;
6138 }
6139 let child_lsn = n.get_lsn(idx);
6140 let node = self.fetch_node_from_log(child_lsn)?;
6141 let node_id = node.node_id();
6142 let arc: ChildArc = Arc::new(RwLock::new(node));
6143 n.set_child(idx, Some(arc.clone()));
6144 drop(g);
6145 // JE: a fetched IN is added back to the INList (Evictor LRU).
6146 self.note_added(node_id);
6147 Some(arc)
6148 }
6149
6150 /// Check whether a BIN node is a candidate for slot compression and,
6151 /// if so, trigger `compress_bin`.
6152 ///
6153 /// from (the opportunistic / lazy compression path).
6154 ///
6155 /// # Algorithm
6156 ///
6157 /// 1. Skip the BIN if it is a delta or has no defunct (known-deleted) slots.
6158 /// 2. If compression succeeds and the BIN becomes empty, it is pruned.
6159 ///
6160 /// # Returns
6161 ///
6162 /// `true` if compression was triggered (regardless of whether any slots
6163 /// were actually removed), `false` if the BIN does not need compression.
6164 pub fn maybe_compress_bin_and_parent(
6165 &self,
6166 bin_arc: &Arc<RwLock<TreeNode>>,
6167 ) -> bool {
6168 // Check whether the BIN has any deleted slots worth compressing.
6169 // lazyCompress: skip deltas and BINs with no defunct slots.
6170 let should_compress = {
6171 {
6172 let g = bin_arc.read();
6173 match &*g {
6174 TreeNode::Bottom(b) => {
6175 // Skip deltas (the: !in.isBIN() || in.isBINDelta()).
6176 if b.is_delta {
6177 false
6178 } else {
6179 // Check for any known-deleted slot
6180 // (the: for (int i=0; i < bin.getNEntries(); i++) {
6181 // if (bin.isDefunct(i)) { ... break; }
6182 // }).
6183 b.entries.iter().any(|e| e.known_deleted)
6184 }
6185 }
6186 _ => false,
6187 }
6188 }
6189 };
6190
6191 if !should_compress {
6192 return false;
6193 }
6194
6195 self.compress_bin(bin_arc)
6196 }
6197
6198 // ========================================================================
6199 // Latch-coupling validation
6200 // ========================================================================
6201
6202 /// Validate that `parent.entries[child_index].child` still points at
6203 /// `child_arc` after acquiring the child's latch.
6204 ///
6205 /// Re-latch validation step inside the
6206 /// `Tree.searchSplitsAllowed`: after a concurrent split the parent
6207 /// slot that previously held the child may have changed. Callers that
6208 /// plan to mutate the child must verify the parent-child link is still
6209 /// intact before proceeding.
6210 ///
6211 /// Returns `true` if the parent-child link is intact.
6212 pub fn validate_parent_child(
6213 parent: &Arc<RwLock<TreeNode>>,
6214 child_index: usize,
6215 child_arc: &Arc<RwLock<TreeNode>>,
6216 ) -> bool {
6217 let g = parent.read();
6218 match &*g {
6219 TreeNode::Internal(p) => match p.child_ref(child_index) {
6220 Some(stored) => Arc::ptr_eq(stored, child_arc),
6221 None => false,
6222 },
6223 TreeNode::Bottom(_) => false,
6224 }
6225 }
6226
6227 /// Search for the BIN that should contain `key`, with latch-coupling
6228 /// validation at every level of descent.
6229 ///
6230 /// .
6231 ///
6232 /// The difference from `search()` is that after obtaining the child
6233 /// arc we call `validate_parent_child` to confirm the parent still
6234 /// holds the expected Arc. If the link has been broken (e.g. by a
6235 /// concurrent split that relocated the child) the traversal restarts
6236 /// from the root.
6237 ///
6238 /// Returns a `SearchResult` if the key is (or should be) in the tree,
6239 /// `None` if the tree is empty.
6240 ///
6241 /// Same as [`Tree::search`] but exposes the hand-over-hand latch
6242 /// coupling explicitly. Kept as a public, equivalent API for
6243 /// callers (today only tests) that want to verify the
6244 /// latch-coupling behaviour against `search()` itself.
6245 ///
6246 /// Both `search()` and this method use the same `read_arc()`
6247 /// hand-over-hand: take the child read guard *before* dropping
6248 /// the parent guard, so a concurrent `split_child(parent, ..)`
6249 /// (which takes `parent.write()`) cannot run between when we
6250 /// captured the child Arc and when we entered the child. There
6251 /// is no validate-and-restart loop because the coupling makes
6252 /// the race unreachable.
6253 pub fn search_with_coupling(&self, key: &[u8]) -> Option<SearchResult> {
6254 let root = self.get_root()?;
6255 let mut guard: parking_lot::ArcRwLockReadGuard<
6256 parking_lot::RawRwLock,
6257 TreeNode,
6258 > = root.read_arc();
6259
6260 loop {
6261 if guard.is_bin() {
6262 let index = guard.find_entry(key, true, true);
6263 let found = index >= 0 && (index & EXACT_MATCH != 0);
6264 return Some(SearchResult::with_values(
6265 found,
6266 index & 0xFFFF,
6267 false,
6268 ));
6269 }
6270
6271 let parent_arc =
6272 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
6273 let next_idx = match &*guard {
6274 TreeNode::Internal(n) => {
6275 if n.entries.is_empty() {
6276 return None;
6277 }
6278 let idx = self.upper_in_floor_index(&n.entries, key);
6279 match n.get_child(idx) {
6280 Some(c) => {
6281 let next_guard = c.read_arc();
6282 drop(guard);
6283 guard = next_guard;
6284 continue;
6285 }
6286 None => idx, // EV-14/EV-13: re-fetch below.
6287 }
6288 }
6289 TreeNode::Bottom(_) => {
6290 unreachable!("is_bin() returned false above")
6291 }
6292 };
6293 // Hand-over-hand: take the child read guard before
6294 // releasing the parent guard. Closes the
6295 // descender-vs-splitter window: a concurrent
6296 // split_child(parent, ..) takes parent.write(), which
6297 // blocks while we still hold parent.read().
6298 drop(guard);
6299 let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
6300 guard = child.read_arc();
6301 }
6302 }
6303
6304 // ========================================================================
6305 // BIN-Delta reconstitution
6306 // ========================================================================
6307
6308 /// Increments the cursor-pin count on a BIN node.
6309 ///
6310 /// Called by `CursorImpl` when it positions on (or enters) a BIN.
6311 /// The evictor will not select a BIN with `cursor_count > 0` for eviction
6312 /// (`RealNodeInfo.pin_count`), matching `BIN.incrementCursorCount()`.
6313 pub fn pin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6314 let mut guard = bin_arc.write();
6315 if let TreeNode::Bottom(ref mut stub) = *guard {
6316 stub.cursor_count += 1;
6317 }
6318 }
6319
6320 /// Decrements the cursor-pin count on a BIN node.
6321 ///
6322 /// Called by `CursorImpl` when it moves away from or closes on a BIN.
6323 /// Uses `saturating_sub` to guard against an accidental double-unpin.
6324 /// Matching `BIN.decrementCursorCount()`.
6325 pub fn unpin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6326 let mut guard = bin_arc.write();
6327 if let TreeNode::Bottom(ref mut stub) = *guard {
6328 stub.cursor_count = stub.cursor_count.saturating_sub(1);
6329 }
6330 }
6331
6332 /// Returns `true` if the given `BinStub` is a BIN-delta (not a full BIN).
6333 ///
6334 /// `IN.isBINDelta()`.
6335 pub fn bin_is_delta(bin: &BinStub) -> bool {
6336 bin.is_delta
6337 }
6338
6339 /// Merge delta entries into a full BIN's entry list.
6340 ///
6341 /// - For each delta entry: if a matching key already exists in `bin`,
6342 /// replace it (delta is authoritative).
6343 /// - Otherwise insert the delta entry in sorted position.
6344 ///
6345 /// Delta entries carry **full** keys (prefix already prepended by the
6346 /// caller). After applying all delta entries the BIN's prefix is
6347 /// recomputed so the final state is consistent.
6348 ///
6349 /// All delta entries are considered to be the most-recently-dirtied
6350 /// state, exactly as in where delta slots supersede full-BIN slots.
6351 pub fn apply_delta_to_bin(
6352 bin: &mut BinStub,
6353 delta_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
6354 ) {
6355 for (full_key, lsn, data) in delta_entries {
6356 // `full_key` is a full (uncompressed) key here.
6357 bin.insert_with_prefix(full_key, lsn, data);
6358 }
6359 bin.dirty = true;
6360 }
6361
6362 /// Reconstitute a BIN-delta into a full BIN.
6363 ///
6364 /// from the:
6365 ///
6366 /// 1. Extract the delta entries from `self` (this BIN-delta), decompressing
6367 /// them to full keys.
6368 /// 2. Apply them onto `base` (the previously logged full BIN) via
6369 /// `apply_delta_to_bin`.
6370 /// 3. Copy `base`'s merged entries and prefix back into `self`.
6371 /// 4. Clear the `is_delta` flag so subsequent code treats `self` as
6372 /// a full BIN.
6373 ///
6374 /// After this call `self` is a full BIN; `base` should be discarded.
6375 pub fn mutate_to_full_bin(delta: &mut BinStub, mut base: BinStub) {
6376 // Decompress delta entries to full keys before applying.
6377 let delta_full_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)> = (0
6378 ..delta.entries.len())
6379 .map(|i| {
6380 (
6381 delta.get_full_key(i).unwrap_or_default(),
6382 delta.get_lsn(i),
6383 delta.entries[i].data.clone(),
6384 )
6385 })
6386 .collect();
6387 // reconstituteBIN + resetContent + setBINDelta(false).
6388 Self::apply_delta_to_bin(&mut base, delta_full_entries);
6389 delta.entries = base.entries;
6390 delta.lsn_rep = base.lsn_rep; // T-3
6391 delta.keys = base.keys; // T-2
6392 delta.key_prefix = base.key_prefix;
6393 delta.is_delta = false;
6394 delta.dirty = true;
6395 }
6396
6397 /// Read an IN/BIN log entry at `log_lsn` and deserialise it into a
6398 /// `TreeNode`, ready to be installed as a (re-fetched) resident node.
6399 ///
6400 /// JE `LogManager.getLogEntry(lsn)` + `IN.readFromLog` as used by
6401 /// `ChildReference.fetchTarget` (the path that re-materializes a
6402 /// non-resident node from its persisted LSN on descent) and by
6403 /// `Tree.getRootINRootAlreadyLatched` for the root. The freshly-fetched
6404 /// node has no resident children (`TargetRep::None`); its own children, if
6405 /// any, are re-fetched on demand the same way when the descent reaches
6406 /// them.
6407 ///
6408 /// Returns `None` if the LSN is null, the log read fails, the entry is not
6409 /// an IN/BIN, or deserialisation fails (the caller treats this as "node
6410 /// unavailable" rather than panicking, matching the graceful-degradation
6411 /// policy of `mutate_to_full_bin_from_log`).
6412 fn fetch_node_from_log(&self, log_lsn: Lsn) -> Option<TreeNode> {
6413 if log_lsn == NULL_LSN {
6414 return None;
6415 }
6416 let lm = self.log_manager.as_ref()?;
6417 let (entry_type, payload) = lm.read_entry(log_lsn).ok()?;
6418 // The on-disk payload is an `InLogEntry` body (db_id | prev_full_lsn
6419 // | prev_delta_lsn | len | node_data). The recovery scanner strips
6420 // this header before calling `recover_in_redo`; re-fetch must do the
6421 // same so `deserialize_*` sees the bare node bytes. JE
6422 // `INLogEntry.readEntry` parses the same wrapper.
6423 let in_entry =
6424 noxu_log::entry::in_log_entry::InLogEntry::read_from_log(&payload)
6425 .ok()?;
6426 let node_data = &in_entry.node_data;
6427 use noxu_log::LogEntryType;
6428 match entry_type {
6429 LogEntryType::BIN => {
6430 Self::deserialize_bin(node_data).map(TreeNode::Bottom)
6431 }
6432 LogEntryType::IN => {
6433 Self::deserialize_upper_in(node_data).map(TreeNode::Internal)
6434 }
6435 // BIN-deltas are never logged as the *root* version and are
6436 // reconstituted by the BIN-delta path, not here.
6437 _ => {
6438 log::warn!(
6439 "fetch_node_from_log: expected IN/BIN entry at LSN {:?}, \
6440 got {:?}",
6441 log_lsn,
6442 entry_type
6443 );
6444 None
6445 }
6446 }
6447 }
6448
6449 /// Reconstitute a BIN-delta into a full BIN by reading the base from log.
6450 ///
6451 /// — the
6452 /// single-argument overload that calls `fetchFullBIN(databaseImpl)` to
6453 /// read the last full BIN from the log manager automatically.
6454 ///
6455 /// Algorithm:
6456 /// 1. If `delta.last_full_lsn == NULL_LSN`, the BIN was never written as a
6457 /// full entry; there is no base to merge so the delta IS the full BIN.
6458 /// Clear `is_delta` and return.
6459 /// 2. Read the full-BIN log entry at `delta.last_full_lsn` using
6460 /// `log_manager.read_entry(lsn)`.
6461 /// 3. Deserialize the payload with `BinStub::deserialize_full()`.
6462 /// 4. Delegate to `Self::mutate_to_full_bin(delta, base)` to merge and
6463 /// replace `delta`'s contents.
6464 ///
6465 /// On any read / parse failure the function falls back to clearing the
6466 /// `is_delta` flag without merging, so the caller always gets a non-delta
6467 /// BIN (possibly missing some old slots). This mirrors the
6468 /// `EnvironmentFailureException` path but gracefully degrades instead of
6469 /// panicking.
6470 ///
6471 /// `BIN.fetchFullBIN(dbImpl)` + `BIN.mutateToFullBIN(boolean)`.
6472 pub fn mutate_to_full_bin_from_log(
6473 delta: &mut BinStub,
6474 log_manager: &noxu_log::LogManager,
6475 ) {
6476 if !delta.is_delta {
6477 // Already a full BIN; nothing to do.
6478 return;
6479 }
6480
6481 if delta.last_full_lsn == NULL_LSN {
6482 // BIN has never been logged as a full entry — the in-memory delta
6483 // is effectively the full state. During recovery this path is
6484 // harmless.
6485 delta.is_delta = false;
6486 return;
6487 }
6488
6489 // Read the full-BIN log entry at last_full_lsn.
6490 // `envImpl.getLogManager().getEntryHandleFileNotFound(lsn)`.
6491 match log_manager.read_entry(delta.last_full_lsn) {
6492 Ok((entry_type, payload)) => {
6493 use noxu_log::LogEntryType;
6494 if entry_type == LogEntryType::BIN {
6495 if let Some(mut base) = BinStub::deserialize_full(&payload)
6496 {
6497 // Set the base's last_full_lsn so it is preserved
6498 // into the merged result.
6499 base.last_full_lsn = delta.last_full_lsn;
6500 Self::mutate_to_full_bin(delta, base);
6501 return;
6502 }
6503 // Deserialization failed — fall through to graceful degradation.
6504 log::warn!(
6505 "mutate_to_full_bin_from_log: failed to deserialize \
6506 full BIN at LSN {:?}; keeping delta as-is",
6507 delta.last_full_lsn
6508 );
6509 } else {
6510 log::warn!(
6511 "mutate_to_full_bin_from_log: expected BIN entry at \
6512 LSN {:?}, got {:?}",
6513 delta.last_full_lsn,
6514 entry_type
6515 );
6516 }
6517 }
6518 Err(e) => {
6519 log::warn!(
6520 "mutate_to_full_bin_from_log: failed to read log at \
6521 LSN {:?}: {}",
6522 delta.last_full_lsn,
6523 e
6524 );
6525 }
6526 }
6527
6528 // Graceful degradation: promote the delta to a "full" BIN without
6529 // the base slots. The BIN will be re-logged as a full BIN at the
6530 // next checkpoint.
6531 delta.is_delta = false;
6532 delta.dirty = true;
6533 }
6534
6535 // ========================================================================
6536 // getNextBin / getPrevBin
6537 // ========================================================================
6538
6539 /// Return the entries of the BIN immediately to the right of the BIN
6540 /// that contains (or would contain) `current_key`.
6541 ///
6542 /// → `Tree.getNextIN(forward=true)`.
6543 ///
6544 /// # Algorithm
6545 /// 1. Build a root-to-BIN path for `current_key`.
6546 /// 2. Walk the path back up looking for a parent that has a slot to the
6547 /// right of the slot we descended through.
6548 /// 3. When found, descend to the leftmost BIN of that sibling subtree.
6549 /// 4. If no such parent exists, return `None` (no next BIN).
6550 pub fn get_next_bin(
6551 &self,
6552 current_key: &[u8],
6553 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6554 let root = self.get_root()?;
6555 self.get_adjacent_bin(&root, current_key, true)
6556 }
6557
6558 /// Return the entries of the BIN immediately to the left of the BIN
6559 /// that contains (or would contain) `current_key`.
6560 ///
6561 /// → `Tree.getNextIN(forward=false)`.
6562 pub fn get_prev_bin(
6563 &self,
6564 current_key: &[u8],
6565 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6566 let root = self.get_root()?;
6567 self.get_adjacent_bin(&root, current_key, false)
6568 }
6569
6570 /// Core implementation shared by `get_next_bin` and `get_prev_bin`.
6571 ///
6572 /// Builds the path from `root` down to the BIN for `current_key`
6573 /// (each element records the parent arc, the slot index taken,
6574 /// and the child Arc reached) using `read_arc()` hand-over-hand
6575 /// latch coupling.
6576 ///
6577 /// The ascent re-acquires the parent's read lock one level at a
6578 /// time. To handle a concurrent split that completes between
6579 /// path capture and ascent, we validate that the slot still
6580 /// holds the child Arc we descended through. If the slot
6581 /// mismatches we retry the whole operation from root with a
6582 /// short pause between attempts. The retry budget is generous
6583 /// (`MAX_ASCENT_ATTEMPTS`) so that the typical case of a few
6584 /// cascading splits between two BIN-level cursor steps is
6585 /// absorbed without surfacing as a false end-of-iteration.
6586 /// After exhausting the budget we conservatively return `None`,
6587 /// signalling "no adjacent BIN found"; the cursor will then
6588 /// either restart its scan or report end-of-iteration. The
6589 /// budget is finite so a pathological workload (a thread
6590 /// permanently splitting under us) cannot livelock the lookup.
6591 /// JE `Tree.getNextIN` / `Tree.getPrevIN`.
6592 ///
6593 /// R3 fix (2026-06-16): converted from `static fn` to `&self` so that the
6594 /// IN-level descent uses `self.upper_in_floor_index` (comparator-aware)
6595 /// instead of a raw byte `<=`. Without this, databases with a custom
6596 /// comparator (secondary indexes, sorted-dup) could descend to the wrong
6597 /// child → wrong adjacent BIN → incorrect cursor iteration across BIN
6598 /// boundaries. Mirrors `Tree.getNextIN`/`Tree.getPrevIN` using the
6599 /// comparator-aware `IN.findEntry`.
6600 fn get_adjacent_bin(
6601 &self,
6602 root: &Arc<RwLock<TreeNode>>,
6603 current_key: &[u8],
6604 forward: bool,
6605 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6606 const MAX_ASCENT_ATTEMPTS: u32 = 8;
6607 for attempt in 0..MAX_ASCENT_ATTEMPTS {
6608 match self.get_adjacent_bin_attempt(root, current_key, forward) {
6609 AdjacentBinOutcome::Found(v) => return Some(v),
6610 AdjacentBinOutcome::NoAdjacent => return None,
6611 AdjacentBinOutcome::SplitRaceRetry => {
6612 // Brief pause to let the splitter finish.
6613 if attempt + 1 < MAX_ASCENT_ATTEMPTS {
6614 std::thread::yield_now();
6615 }
6616 }
6617 }
6618 }
6619 // Exhausted retry budget. Signal "no adjacent" so the
6620 // cursor can fall back to its end-of-iteration path.
6621 None
6622 }
6623
6624 /// One attempt at `get_adjacent_bin`. The tri-state return
6625 /// value distinguishes "no adjacent BIN exists" (which the
6626 /// caller should propagate as end-of-iteration) from "a
6627 /// concurrent split invalidated our path" (which the caller
6628 /// should retry from root).
6629 fn get_adjacent_bin_attempt(
6630 &self,
6631 root: &Arc<RwLock<TreeNode>>,
6632 current_key: &[u8],
6633 forward: bool,
6634 ) -> AdjacentBinOutcome {
6635 // Path entry: (parent_arc, slot_idx_taken, child_arc_reached).
6636 // The child Arc lets the ascent validate that the slot still
6637 // points to the same node we descended through.
6638 let mut path: Vec<(
6639 Arc<RwLock<TreeNode>>,
6640 usize,
6641 Arc<RwLock<TreeNode>>,
6642 )> = Vec::new();
6643
6644 let mut guard: parking_lot::ArcRwLockReadGuard<
6645 parking_lot::RawRwLock,
6646 TreeNode,
6647 > = root.read_arc();
6648 loop {
6649 if guard.is_bin() {
6650 break;
6651 }
6652
6653 let (next_arc, slot_idx) = match &*guard {
6654 TreeNode::Internal(n) => {
6655 if n.entries.is_empty() {
6656 return AdjacentBinOutcome::NoAdjacent;
6657 }
6658 // R3 fix: use comparator-aware upper_in_floor_index so
6659 // that custom-comparator / sorted-dup databases descend
6660 // to the correct child. Mirrors JE Tree.getNextIN which
6661 // uses IN.findEntry (comparator-aware) not raw byte order.
6662 let idx =
6663 self.upper_in_floor_index(&n.entries, current_key);
6664 let child = match n.get_child(idx) {
6665 Some(c) => c,
6666 None => return AdjacentBinOutcome::NoAdjacent,
6667 };
6668 (child, idx)
6669 }
6670 TreeNode::Bottom(_) => unreachable!(),
6671 };
6672
6673 // Record the parent and the child we are about to enter
6674 // — the child Arc lets the ascent validate the slot.
6675 let parent_arc =
6676 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
6677 path.push((parent_arc, slot_idx, Arc::clone(&next_arc)));
6678
6679 // Hand-over-hand: take child read lock BEFORE releasing parent.
6680 let next_guard = next_arc.read_arc();
6681 drop(guard);
6682 guard = next_guard;
6683 }
6684 drop(guard);
6685
6686 // Ascend the path. At each level, validate that
6687 // `parent.entries[taken_idx].child == descended_child` before
6688 // trusting `taken_idx` as a coordinate. If not, return
6689 // `SplitRaceRetry` so the caller restarts from root.
6690 while let Some((parent_arc, taken_idx, descended_child)) = path.pop() {
6691 let parent_guard = parent_arc.read();
6692 let (n_entries, slot_still_valid) = match &*parent_guard {
6693 TreeNode::Internal(p) => {
6694 let n = p.entries.len();
6695 let valid = p
6696 .child_ref(taken_idx)
6697 .is_some_and(|c| Arc::ptr_eq(c, &descended_child));
6698 (n, valid)
6699 }
6700 _ => return AdjacentBinOutcome::NoAdjacent,
6701 };
6702 drop(parent_guard);
6703
6704 if !slot_still_valid {
6705 return AdjacentBinOutcome::SplitRaceRetry;
6706 }
6707
6708 let sibling_idx = if forward {
6709 taken_idx + 1
6710 } else if taken_idx == 0 {
6711 // No left sibling at this level — ascend further.
6712 continue;
6713 } else {
6714 taken_idx - 1
6715 };
6716
6717 if forward && sibling_idx >= n_entries {
6718 // No right sibling at this level — ascend further.
6719 continue;
6720 }
6721
6722 // Found a sibling slot — fetch the sibling child arc.
6723 let sibling_arc = {
6724 let g = parent_arc.read();
6725 match &*g {
6726 TreeNode::Internal(p) => match p.get_child(sibling_idx) {
6727 Some(c) => c,
6728 None => return AdjacentBinOutcome::NoAdjacent,
6729 },
6730 _ => return AdjacentBinOutcome::NoAdjacent,
6731 }
6732 };
6733
6734 // Descend to the leftmost (forward) or rightmost (!forward) BIN.
6735 return match Self::descend_to_edge_bin(&sibling_arc, forward) {
6736 Some(v) => AdjacentBinOutcome::Found(v),
6737 None => AdjacentBinOutcome::NoAdjacent,
6738 };
6739 }
6740
6741 // Exhausted path without finding a sibling → no adjacent BIN.
6742 AdjacentBinOutcome::NoAdjacent
6743 }
6744
6745 /// Descend to the leftmost BIN (`forward = true`) or rightmost BIN
6746 /// (`forward = false`) in the sub-tree rooted at `node_arc`.
6747 ///
6748 /// `Tree.searchSubTree(SearchType.LEFT / RIGHT, targetLevel)`.
6749 fn descend_to_edge_bin(
6750 node_arc: &Arc<RwLock<TreeNode>>,
6751 forward: bool,
6752 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6753 // Hand-over-hand latch coupling — see Tree::search.
6754 let mut guard: parking_lot::ArcRwLockReadGuard<
6755 parking_lot::RawRwLock,
6756 TreeNode,
6757 > = node_arc.read_arc();
6758
6759 loop {
6760 if guard.is_bin() {
6761 return match &*guard {
6762 TreeNode::Bottom(b) => {
6763 // Return entries with full (decompressed) keys so that
6764 // callers always work with complete keys.
6765 //
6766 // TREE-F1: KD slots are NOT filtered here — the BIN's
6767 // slot indices are returned verbatim so the cursor can
6768 // skip KD slots itself (CursorImpl getNext loop;
6769 // CursorImpl.java:2062-2064) and continue to the next
6770 // BIN when an edge BIN is entirely KD during the
6771 // BIN-delta reconstitution window.
6772 let full_entries: Vec<(BinEntry, Lsn, Vec<u8>)> = (0
6773 ..b.entries.len())
6774 .map(|i| {
6775 (
6776 BinEntry {
6777 data: b.entries[i].data.clone(),
6778 known_deleted: b.entries[i]
6779 .known_deleted,
6780 dirty: b.entries[i].dirty,
6781 expiration_time: b.entries[i]
6782 .expiration_time,
6783 },
6784 b.get_lsn(i),
6785 b.get_full_key(i).unwrap_or_default(),
6786 )
6787 })
6788 .collect();
6789 Some(full_entries)
6790 }
6791 _ => None,
6792 };
6793 }
6794
6795 let next = match &*guard {
6796 TreeNode::Internal(n) => {
6797 if forward {
6798 n.get_child(0)?
6799 } else {
6800 n.get_child(n.entries.len().saturating_sub(1))?
6801 }
6802 }
6803 _ => return None,
6804 };
6805 // Take child read lock BEFORE releasing parent's.
6806 let next_guard = next.read_arc();
6807 drop(guard);
6808 guard = next_guard;
6809 }
6810 }
6811}
6812
6813// ============================================================================
6814// Tree statistics
6815// ============================================================================
6816
/// Structural counters gathered by a full walk of the tree.
///
/// Plays the role of JE's `TreeWalkerStatsAccumulator`. All counters are
/// zero for a `Default` value.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct TreeStats {
    /// How many BINs (bottom internal nodes) were visited.
    pub n_bins: u64,
    /// How many upper INs were visited.
    pub n_ins: u64,
    /// Sum of the entry counts of every visited node (INs and BINs alike).
    pub n_entries: u64,
    /// Tree height (1 = root is a BIN, 2 = one level above BINs, …).
    pub height: u32,
}
6831
6832impl Tree {
6833 /// Walks the entire tree and collects structural statistics.
6834 ///
6835 /// `TreeWalkerStatsAccumulator` pattern — performs a simple
6836 /// recursive DFS and counts INs, BINs, entries, and tree height.
6837 pub fn collect_stats(&self) -> TreeStats {
6838 let mut stats = TreeStats::default();
6839 if let Some(root) = self.get_root() {
6840 Self::collect_stats_recursive(&root, &mut stats, 0);
6841 }
6842 stats
6843 }
6844
6845 fn collect_stats_recursive(
6846 node_arc: &Arc<RwLock<TreeNode>>,
6847 stats: &mut TreeStats,
6848 depth: u32,
6849 ) {
6850 let guard = node_arc.read();
6851
6852 let current_height = depth + 1;
6853 if current_height > stats.height {
6854 stats.height = current_height;
6855 }
6856
6857 match &*guard {
6858 TreeNode::Bottom(b) => {
6859 stats.n_bins += 1;
6860 stats.n_entries += b.entries.len() as u64;
6861 }
6862 TreeNode::Internal(n) => {
6863 stats.n_ins += 1;
6864 stats.n_entries += n.entries.len() as u64;
6865 // Collect child arcs before releasing the guard.
6866 let children: Vec<Arc<RwLock<TreeNode>>> =
6867 n.resident_children();
6868 // Release guard before recursing to avoid lock ordering issues.
6869 drop(guard);
6870 for child in children {
6871 Self::collect_stats_recursive(&child, stats, depth + 1);
6872 }
6873 }
6874 }
6875 }
6876
6877 /// Collects all dirty BINs as (Arc to node, db_id) pairs.
6878 ///
6879 /// The checkpoint path calls this to enumerate BINs that need to be
6880 /// logged. For each dirty BIN the checkpoint decides — based on the
6881 /// BIN-delta threshold — whether to write a full `BIN` entry or a
6882 /// `BINDelta` entry.
6883 ///
6884 /// `Checkpointer.processINList()` which iterates the dirty
6885 /// IN list accumulated during normal operation.
6886 pub fn collect_dirty_bins(
6887 &self,
6888 db_id: u64,
6889 ) -> Vec<(u64, Arc<RwLock<TreeNode>>)> {
6890 let mut result = Vec::new();
6891 if let Some(root) = self.get_root() {
6892 Self::collect_dirty_bins_recursive(&root, db_id, &mut result);
6893 }
6894 result
6895 }
6896
6897 fn collect_dirty_bins_recursive(
6898 node_arc: &Arc<RwLock<TreeNode>>,
6899 db_id: u64,
6900 out: &mut Vec<(u64, Arc<RwLock<TreeNode>>)>,
6901 ) {
6902 let guard = node_arc.read();
6903 match &*guard {
6904 TreeNode::Bottom(b) => {
6905 // Include this BIN if it is dirty or has any dirty slots.
6906 if b.dirty || b.dirty_count() > 0 {
6907 out.push((db_id, Arc::clone(node_arc)));
6908 }
6909 }
6910 TreeNode::Internal(n) => {
6911 let children: Vec<Arc<RwLock<TreeNode>>> =
6912 n.resident_children();
6913 drop(guard);
6914 for child in children {
6915 Self::collect_dirty_bins_recursive(&child, db_id, out);
6916 } // guard already dropped
6917 }
6918 }
6919 }
6920
6921 /// Collect all BINs that have at least one `known_deleted` slot.
6922 ///
6923 /// INCompressor queue-drain scan in the: the daemon iterates
6924 /// the in-memory IN list and identifies BINs that still hold zombie deleted
6925 /// slots. Each returned `Arc` can be passed directly to `compress_bin()`.
6926 pub fn collect_bins_with_known_deleted(
6927 &self,
6928 ) -> Vec<Arc<RwLock<TreeNode>>> {
6929 let mut result = Vec::new();
6930 if let Some(root) = self.get_root() {
6931 Self::collect_bins_with_known_deleted_recursive(&root, &mut result);
6932 }
6933 result
6934 }
6935
6936 fn collect_bins_with_known_deleted_recursive(
6937 node_arc: &Arc<RwLock<TreeNode>>,
6938 out: &mut Vec<Arc<RwLock<TreeNode>>>,
6939 ) {
6940 let guard = node_arc.read();
6941 match &*guard {
6942 TreeNode::Bottom(b) => {
6943 if b.entries.iter().any(|e| e.known_deleted) {
6944 out.push(Arc::clone(node_arc));
6945 }
6946 }
6947 TreeNode::Internal(n) => {
6948 let children: Vec<Arc<RwLock<TreeNode>>> =
6949 n.resident_children();
6950 drop(guard);
6951 for child in children {
6952 Self::collect_bins_with_known_deleted_recursive(
6953 &child, out,
6954 );
6955 }
6956 }
6957 }
6958 }
6959
6960 /// Collect all dirty upper (non-BIN) internal nodes, sorted ascending by
6961 /// level (bottom-up order, BIN level excluded).
6962 ///
6963 /// Serialise an upper-IN node (level > 1) by node_id for off-heap storage.
6964 ///
6965 /// Traverses the tree to find the internal node whose matches,
6966 /// then calls to produce a compact byte
6967 /// representation. Returns if the node is not found or is a BIN
6968 /// (BINs are not upper INs).
6969 ///
6970 /// Mirrors `OffHeapAllocator` serialises the same bytes that would be written
6971 /// to the log, allowing the evictor to store upper-INs off-heap and avoid
6972 /// log-file reads on the next traversal.
6973 pub fn serialize_upper_in(&self, node_id: u64) -> Option<Vec<u8>> {
6974 let root = self.get_root()?;
6975 Self::find_and_serialize_upper_in(&root, node_id)
6976 }
6977
6978 fn find_and_serialize_upper_in(
6979 node_arc: &Arc<RwLock<TreeNode>>,
6980 target_id: u64,
6981 ) -> Option<Vec<u8>> {
6982 let guard = node_arc.read();
6983 match &*guard {
6984 TreeNode::Bottom(_) => None, // BINs are not upper INs
6985 TreeNode::Internal(n) => {
6986 if n.node_id == target_id {
6987 // Serialise InNodeStub for off-heap storage.
6988 // Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE)
6989 // then per-entry: key_len(u32BE) | key | lsn(u64BE)
6990 let mut buf = Vec::new();
6991 buf.extend_from_slice(&n.node_id.to_be_bytes());
6992 buf.extend_from_slice(&n.level.to_be_bytes());
6993 buf.extend_from_slice(
6994 &(n.entries.len() as u32).to_be_bytes(),
6995 );
6996 for (i, e) in n.entries.iter().enumerate() {
6997 buf.extend_from_slice(
6998 &(e.key.len() as u32).to_be_bytes(),
6999 );
7000 buf.extend_from_slice(&e.key);
7001 buf.extend_from_slice(
7002 &n.get_lsn(i).as_u64().to_be_bytes(),
7003 );
7004 }
7005 return Some(buf);
7006 }
7007 // Recurse into children before releasing the guard so we
7008 // hold the minimum read-lock duration.
7009 let children: Vec<Arc<RwLock<TreeNode>>> =
7010 n.resident_children();
7011 drop(guard);
7012 for child in &children {
7013 if let Some(bytes) =
7014 Self::find_and_serialize_upper_in(child, target_id)
7015 {
7016 return Some(bytes);
7017 }
7018 }
7019 None
7020 }
7021 }
7022 }
7023
7024 /// Upper-IN traversal in `Checkpointer.processINList()` from
7025 /// — visits all `TreeNode::Internal` nodes whose `dirty` flag is set
7026 /// and returns them together with their level, sorted lowest-level-first
7027 /// so the checkpointer can log them bottom-up. The root is always the
7028 /// last entry (highest level), which must be logged `Provisional::No`.
7029 pub fn collect_dirty_upper_ins(
7030 &self,
7031 _db_id: u64,
7032 ) -> Vec<(i32, Arc<RwLock<TreeNode>>)> {
7033 let mut result: Vec<(i32, Arc<RwLock<TreeNode>>)> = Vec::new();
7034 if let Some(root) = self.get_root() {
7035 Self::collect_dirty_upper_ins_recursive(&root, &mut result);
7036 }
7037 result.sort_by_key(|(level, _)| *level);
7038 result
7039 }
7040
7041 fn collect_dirty_upper_ins_recursive(
7042 node_arc: &Arc<RwLock<TreeNode>>,
7043 out: &mut Vec<(i32, Arc<RwLock<TreeNode>>)>,
7044 ) {
7045 let guard = node_arc.read();
7046 match &*guard {
7047 TreeNode::Bottom(_) => {
7048 // BINs are handled by flush_dirty_bins_internal; skip here.
7049 }
7050 TreeNode::Internal(n) => {
7051 let is_dirty = n.dirty;
7052 // REC-AA: return the node's ACTUAL tree level (n.level, in
7053 // MAIN_LEVEL|n units), not a root-relative depth. The level
7054 // must be on the same scale as a BIN's `level` (BIN_LEVEL =
7055 // MAIN_LEVEL|1) so that the checkpointer's flush-level
7056 // computation and the evictor's `node_level < flush_level`
7057 // comparison are meaningful. With a root-relative depth the
7058 // root had the SMALLEST value (0) and the IN above the BINs
7059 // the LARGEST, inverting the provisional/non-provisional
7060 // boundary; with n.level the root has the largest level, as JE
7061 // expects.
7062 let level = n.level;
7063 let children: Vec<Arc<RwLock<TreeNode>>> =
7064 n.resident_children();
7065 drop(guard);
7066 // Recurse into children first (bottom-up ordering).
7067 for child in &children {
7068 Self::collect_dirty_upper_ins_recursive(child, out);
7069 }
7070 // Add this node after children (so parent comes after all descendants).
7071 if is_dirty {
7072 out.push((level, Arc::clone(node_arc)));
7073 }
7074 }
7075 }
7076 }
7077
7078 // ========================================================================
7079 // Tree.java ports: 8 additional tree methods (Task #82)
7080 // ========================================================================
7081
/// Returns `true` if the root slot currently holds a node, i.e. the root is
/// loaded in memory.
///
/// Takes a read lock on the root pointer for the duration of the check only;
/// the answer may be stale by the time the caller acts on it.
pub fn is_root_resident(&self) -> bool {
    self.root.read().is_some()
}
7088
/// Returns a clone of the root node `Arc` if a root is present, or `None`.
///
/// Only the root pointer is read-locked (briefly); the returned node itself
/// is not locked.
pub fn get_resident_root_in(&self) -> Option<Arc<RwLock<TreeNode>>> {
    self.root.read().clone()
}
7095
/// Returns the BIN that should contain a slot for `key` (the "parent" of
/// LN slots).
///
/// Descends from the root, at each internal node choosing the child slot
/// given by `upper_in_floor_index`, until a BIN is reached.
///
/// Locking on the descent:
/// - When the chosen child is resident, its read guard (`read_arc()`) is
///   taken *before* the parent's guard is dropped (hand-over-hand).
/// - When the chosen child is not resident, the parent's guard is dropped
///   first and the child is obtained through `child_at_or_fetch`; only then
///   is the child read-locked.
///
/// The BIN `Arc` is returned with no lock held; the caller must take
/// whatever lock it needs to operate on the returned BIN.
///
/// Returns `None` if the tree has no root, if an internal node on the path
/// has no entries, or if `child_at_or_fetch` cannot supply the child.
pub fn get_parent_bin_for_child_ln(
    &self,
    key: &[u8],
) -> Option<Arc<RwLock<TreeNode>>> {
    let root = self.get_root()?;
    // `current_arc` always names the node that `guard` has read-locked.
    let mut current_arc: Arc<RwLock<TreeNode>> = root.clone();
    let mut guard: parking_lot::ArcRwLockReadGuard<
        parking_lot::RawRwLock,
        TreeNode,
    > = root.read_arc();

    loop {
        if guard.is_bin() {
            // Reached the leaf level: unlock and hand the BIN back.
            drop(guard);
            return Some(current_arc);
        }

        // Remembered so the non-resident path can address the parent after
        // its guard has been released.
        let parent_arc = current_arc.clone();
        let next_idx = match &*guard {
            TreeNode::Internal(n) => {
                if n.entries.is_empty() {
                    return None;
                }
                let idx = self.upper_in_floor_index(&n.entries, key);
                match n.get_child(idx) {
                    Some(c) => {
                        // Resident child: lock it, then release the parent.
                        let next_guard = c.read_arc();
                        drop(guard);
                        current_arc = c;
                        guard = next_guard;
                        continue;
                    }
                    None => idx, // EV-14/EV-13: re-fetch below.
                }
            }
            TreeNode::Bottom(_) => {
                unreachable!("is_bin() returned false above")
            }
        };
        // Non-resident child: the parent's read guard is released first,
        // then `child_at_or_fetch` supplies the child (presumably loading
        // it — confirm against that helper) and the child is read-locked.
        // This step is therefore NOT hand-over-hand.
        drop(guard);
        let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
        let next_guard = child.read_arc();
        current_arc = child;
        guard = next_guard;
    }
}
7155
/// Returns the BIN where `key` should be inserted.
///
/// Pure delegation to `get_parent_bin_for_child_ln`: the result, including
/// every `None` case, is exactly what that method returns for the same key.
/// It exists as a separate method only to match the expected API surface.
pub fn find_bin_for_insert(
    &self,
    key: &[u8],
) -> Option<Arc<RwLock<TreeNode>>> {
    self.get_parent_bin_for_child_ln(key)
}
7170
/// Search for a BIN; named after the "splits allowed" (preemptive
/// splitting) descent.
///
/// This is a thin wrapper: it forwards to `search()` and returns that
/// method's `Option<SearchResult>` unchanged. No splitting is performed
/// here; per the original note the split-allowed descent is carried out by
/// `insert()` internally, and this method only exposes the same result type
/// for callers that just need to locate the BIN.
///
/// Returns `None` whenever `search()` does (e.g. for an empty tree).
pub fn search_splits_allowed(&self, key: &[u8]) -> Option<SearchResult> {
    self.search(key)
}
7183
7184 /// Traverses the entire tree and returns every IN and BIN node as a flat
7185 /// list.
7186 ///
7187 /// . Used by recovery to rebuild
7188 /// the in-memory IN list after log replay. The walk is a BFS from the
7189 /// root; every `Arc<RwLock<TreeNode>>` encountered (both Internal and
7190 /// Bottom variants) is included in the result.
7191 pub fn rebuild_in_list(&self) -> Vec<Arc<RwLock<TreeNode>>> {
7192 let mut result = Vec::new();
7193 if let Some(root) = self.get_root() {
7194 Self::rebuild_in_list_recursive(&root, &mut result);
7195 }
7196 result
7197 }
7198
7199 fn rebuild_in_list_recursive(
7200 node_arc: &Arc<RwLock<TreeNode>>,
7201 out: &mut Vec<Arc<RwLock<TreeNode>>>,
7202 ) {
7203 // Push this node unconditionally — both INs and BINs belong in the list.
7204 out.push(Arc::clone(node_arc));
7205
7206 let guard = node_arc.read();
7207
7208 if let TreeNode::Internal(n) = &*guard {
7209 // Collect child arcs while holding the guard, then drop it before
7210 // recursing to avoid holding multiple locks simultaneously.
7211 let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
7212 drop(guard);
7213 for child in children {
7214 Self::rebuild_in_list_recursive(&child, out);
7215 }
7216 }
7217 // BIN nodes are leaves — no children to recurse into.
7218 }
7219
7220 /// Validates internal tree consistency.
7221 ///
7222 /// . Primarily a debug/test tool.
7223 ///
7224 /// Rules checked:
7225 /// - An empty tree (no root) is trivially valid → returns `true`.
7226 /// - A non-empty tree must have a non-null root.
7227 /// - Every Internal node must have at least one entry.
7228 /// - Every child pointer that is `Some` must be readable (lock must be
7229 /// acquirable — i.e., no poisoned locks).
7230 ///
7231 /// Returns `true` if no inconsistencies are detected, `false` otherwise.
7232 pub fn validate_in_list(&self) -> bool {
7233 match self.get_root() {
7234 None => true, // empty tree is always valid
7235 Some(root) => Self::validate_node(&root),
7236 }
7237 }
7238
7239 fn validate_node(node_arc: &Arc<RwLock<TreeNode>>) -> bool {
7240 let guard = node_arc.read();
7241
7242 match &*guard {
7243 TreeNode::Bottom(_bin) => {
7244 // BIN nodes are always structurally valid at this level.
7245 true
7246 }
7247 TreeNode::Internal(n) => {
7248 // An Internal node must have at least one entry.
7249 if n.entries.is_empty() {
7250 return false;
7251 }
7252 // Collect child arcs before dropping the guard.
7253 let children: Vec<Arc<RwLock<TreeNode>>> =
7254 n.resident_children();
7255 drop(guard);
7256 // Recursively validate every resident child.
7257 for child in children {
7258 if !Self::validate_node(&child) {
7259 return false;
7260 }
7261 }
7262 true
7263 }
7264 }
7265 }
7266
7267 /// Traverses the tree to find the parent IN that contains `child_node_id`
7268 /// as one of its child slots.
7269 ///
7270 /// . Used by the cleaner
7271 /// migration path to re-insert migrated INs after eviction/fetch.
7272 ///
7273 /// Returns `(parent_arc, slot_index)` where `slot_index` is the position
7274 /// in the parent's `entries` vector whose child matches `child_node_id`,
7275 /// or `None` if no such parent is found.
7276 pub fn get_parent_in_for_child_in(
7277 &self,
7278 child_node_id: u64,
7279 ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7280 let root = self.get_root()?;
7281 Self::find_parent_of_node_id(&root, child_node_id)
7282 }
7283
7284 /// Recursive DFS helper for `get_parent_in_for_child_in`.
7285 ///
7286 /// Scans every entry in each Internal node. When a child's node_id
7287 /// matches `target_id` the parent arc and slot index are returned.
7288 fn find_parent_of_node_id(
7289 node_arc: &Arc<RwLock<TreeNode>>,
7290 target_id: u64,
7291 ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7292 let guard = node_arc.read();
7293
7294 let TreeNode::Internal(n) = &*guard else {
7295 // BIN nodes have no IN children — cannot be a parent of another IN.
7296 return None;
7297 };
7298
7299 // Check whether any child of this IN has the target node_id.
7300 let mut children: Vec<(usize, Arc<RwLock<TreeNode>>)> = Vec::new();
7301 for slot in 0..n.entries.len() {
7302 if let Some(child_arc) = n.child_ref(slot) {
7303 // Read the child's node_id under a separate lock (acquire child
7304 // while parent guard is still held — this is intentional for
7305 // the ID comparison only; we release both immediately after).
7306 let child_id = {
7307 let cg = child_arc.read();
7308 match &*cg {
7309 TreeNode::Internal(cn) => cn.node_id,
7310 TreeNode::Bottom(cb) => cb.node_id,
7311 }
7312 };
7313
7314 if child_id == target_id {
7315 // Found — return a clone of this node as parent.
7316 let parent_clone = Arc::clone(node_arc);
7317 return Some((parent_clone, slot));
7318 }
7319
7320 // Not found at this slot; schedule this child for recursion.
7321 children.push((slot, Arc::clone(child_arc)));
7322 }
7323 }
7324 // Release parent guard before recursing.
7325 drop(guard);
7326
7327 // Recurse into each Internal child.
7328 for (_slot, child_arc) in children {
7329 if let Some(result) =
7330 Self::find_parent_of_node_id(&child_arc, target_id)
7331 {
7332 return Some(result);
7333 }
7334 }
7335
7336 None
7337 }
7338
7339 /// Propagates the dirty flag upward from `node_arc` to the root.
7340 ///
7341 /// Implicit dirty propagation: after modifying any node,
7342 /// all ancestors on the path to the root must also be marked dirty so
7343 /// the checkpointer logs them.
7344 ///
7345 /// In this happens through `IN.setDirty(true)` calls at each level
7346 /// during split/insert callbacks. Here we walk the weak parent chain.
7347 /// Reconstitute a BIN-delta by merging it onto a base full BIN.
7348 ///
7349 /// Implements JE `BINDelta.reconstituteBIN(databaseImpl)` for the recovery
7350 /// path where the log manager is not available as a `LogManager` but as
7351 /// raw serialized bytes.
7352 ///
7353 /// Algorithm:
7354 /// 1. Deserialise `base_bytes` as a full `BinStub`.
7355 /// 2. Apply `delta_bytes` slots onto the base using `BinStub::apply_delta`
7356 /// (raw slot overlay).
7357 /// 3. Recompute key prefix so prefix-compressed entries are consistent.
7358 ///
7359 /// Returns `None` if either byte slice is malformed.
7360 ///
7361 /// JE `BINDelta.reconstituteBIN` / `BINDelta.applyDelta`
7362 /// (DRIFT-10 / Stage 3).
7363 pub fn reconstitute_bin_delta(
7364 base_bytes: &[u8],
7365 delta_bytes: &[u8],
7366 ) -> Option<BinStub> {
7367 let mut base = BinStub::deserialize_full(base_bytes)?;
7368 // Apply the delta slots onto the base.
7369 // Note: BinStub::apply_delta uses slot-index addressing into base.entries,
7370 // extending with new entries when the slot_idx >= base.entries.len().
7371 // After apply_delta we recompute the key prefix to fix prefix compression.
7372 BinStub::apply_delta(&mut base, delta_bytes)?;
7373 // Recompute prefix so prefix-compressed BINs are consistent after merge.
7374 base.recompute_key_prefix();
7375 base.is_delta = false;
7376 base.dirty = false;
7377 Some(base)
7378 }
7379
7380 pub fn propagate_dirty_to_root(node_arc: &Arc<RwLock<TreeNode>>) {
7381 let parent_weak = { node_arc.read().get_parent() };
7382
7383 if let Some(parent_arc) = parent_weak.and_then(|w| w.upgrade()) {
7384 {
7385 let mut g = parent_arc.write();
7386 g.set_dirty(true);
7387 }
7388 // Recurse further up.
7389 Self::propagate_dirty_to_root(&parent_arc);
7390 }
7391 }
7392
7393 // ========================================================================
7394 // IN-redo: JE RecoveryManager.recoverIN / recoverRootIN / recoverChildIN
7395 // ========================================================================
7396
7397 /// Deserialise an upper-IN node from bytes produced by
7398 /// `TreeNode::write_to_bytes()` / `flush_one_tree_upper_ins`.
7399 ///
7400 /// Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE) | dirty(u8)
7401 /// | per-entry: key_len(u16BE) | key | lsn(u64BE)
7402 ///
7403 /// JE `INFileReader.getIN(db)` / `IN.readFromLog`.
7404 pub fn deserialize_upper_in(bytes: &[u8]) -> Option<InNodeStub> {
7405 if bytes.len() < 13 {
7406 return None;
7407 }
7408 let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
7409 let level = i32::from_be_bytes(bytes[8..12].try_into().ok()?);
7410 let n_entries =
7411 u32::from_be_bytes(bytes[12..16].try_into().ok()?) as usize;
7412 // dirty byte (1 byte after n_entries)
7413 if bytes.len() < 17 {
7414 return None;
7415 }
7416 let mut pos = 17usize; // skip node_id(8) + level(4) + n_entries(4) + dirty(1)
7417 let mut entries = Vec::with_capacity(n_entries);
7418 let mut lsns: Vec<Lsn> = Vec::with_capacity(n_entries);
7419 for _ in 0..n_entries {
7420 if pos + 2 > bytes.len() {
7421 return None;
7422 }
7423 let key_len =
7424 u16::from_be_bytes(bytes[pos..pos + 2].try_into().ok()?)
7425 as usize;
7426 pos += 2;
7427 if pos + key_len > bytes.len() {
7428 return None;
7429 }
7430 let key = bytes[pos..pos + key_len].to_vec();
7431 pos += key_len;
7432 if pos + 8 > bytes.len() {
7433 return None;
7434 }
7435 let lsn = noxu_util::Lsn::from_u64(u64::from_be_bytes(
7436 bytes[pos..pos + 8].try_into().ok()?,
7437 ));
7438 pos += 8;
7439 entries.push(InEntry { key });
7440 lsns.push(lsn); // T-3
7441 }
7442 Some(InNodeStub {
7443 node_id,
7444 level,
7445 entries,
7446 // T-4: a freshly deserialized IN has no resident children.
7447 targets: TargetRep::None,
7448 dirty: false,
7449 generation: 0,
7450 parent: None,
7451 lsn_rep: LsnRep::from_lsns(&lsns), // T-3
7452 })
7453 }
7454
7455 /// Deserialise a BIN from bytes produced by `BinStub::serialize_full()`.
7456 ///
7457 /// Thin wrapper so the recovery path does not need to import `BinStub`
7458 /// directly from callers that only have the raw bytes.
7459 ///
7460 /// JE `INFileReader.getIN(db)` for a BIN entry.
7461 pub fn deserialize_bin(bytes: &[u8]) -> Option<BinStub> {
7462 let mut bin = BinStub::deserialize_full(bytes)?;
7463 bin.dirty = false; // freshly loaded from log — clean for now
7464 Some(bin)
7465 }
7466
7467 /// Apply a logged IN/BIN to the in-memory tree during the recovery redo pass.
7468 ///
7469 /// Implements JE `RecoveryManager.recoverIN`:
7470 /// - `is_root` nodes are handled by `recover_root_in`.
7471 /// - non-root nodes are handled by `recover_child_in`.
7472 ///
7473 /// `log_lsn` is the LSN at which this IN/BIN was logged. The currency
7474 /// check in `recover_child_in` uses this to decide whether to replace the
7475 /// in-memory slot (tree slot LSN < log_lsn → replace; equal → noop;
7476 /// greater → skip).
7477 ///
7478 /// JE `RecoveryManager.recoverIN` / `replayOneIN`
7479 /// (RecoveryManager.java ~lines 1200–1280).
7480 pub fn recover_in_redo(
7481 &self,
7482 log_lsn: noxu_util::Lsn,
7483 is_root: bool,
7484 is_bin: bool,
7485 node_data: &[u8],
7486 ) -> InRedoResult {
7487 if is_bin {
7488 let Some(bin) = Self::deserialize_bin(node_data) else {
7489 return InRedoResult::DeserializeFailed;
7490 };
7491 if is_root {
7492 self.recover_root_bin(log_lsn, bin)
7493 } else {
7494 self.recover_child_bin(log_lsn, bin)
7495 }
7496 } else {
7497 let Some(upper) = Self::deserialize_upper_in(node_data) else {
7498 return InRedoResult::DeserializeFailed;
7499 };
7500 if is_root {
7501 self.recover_root_upper_in(log_lsn, upper)
7502 } else {
7503 self.recover_child_upper_in(log_lsn, upper)
7504 }
7505 }
7506 }
7507
7508 /// Recover a root BIN.
7509 ///
7510 /// If no root exists or the existing root is older (lower LSN), install
7511 /// this BIN as the new root.
7512 ///
7513 /// JE `RecoveryManager.recoverRootIN` / `RootUpdater.doWork`
7514 /// (RecoveryManager.java ~lines 1293–1410).
7515 fn recover_root_bin(
7516 &self,
7517 log_lsn: noxu_util::Lsn,
7518 bin: BinStub,
7519 ) -> InRedoResult {
7520 let mut root_guard = self.root.write();
7521 let existing_lsn = *self.root_log_lsn.read();
7522 match &*root_guard {
7523 None => {
7524 // No root — install this BIN as the root.
7525 // JE: `root == null` case in `RootUpdater.doWork`.
7526 let node = TreeNode::Bottom(bin);
7527 *root_guard = Some(Arc::new(RwLock::new(node)));
7528 *self.root_log_lsn.write() = log_lsn;
7529 InRedoResult::Inserted
7530 }
7531 Some(_) => {
7532 // JE: `originalLsn = root.getLsn()`; replace if logLsn > originalLsn.
7533 if log_lsn > existing_lsn {
7534 let node = TreeNode::Bottom(bin);
7535 *root_guard = Some(Arc::new(RwLock::new(node)));
7536 *self.root_log_lsn.write() = log_lsn;
7537 InRedoResult::Replaced
7538 } else {
7539 InRedoResult::Skipped
7540 }
7541 }
7542 }
7543 }
7544
7545 /// Recover a root upper IN.
7546 ///
7547 /// JE `RecoveryManager.recoverRootIN` for a non-BIN root.
7548 fn recover_root_upper_in(
7549 &self,
7550 log_lsn: noxu_util::Lsn,
7551 upper: InNodeStub,
7552 ) -> InRedoResult {
7553 let mut root_guard = self.root.write();
7554 let existing_lsn = *self.root_log_lsn.read();
7555 match &*root_guard {
7556 None => {
7557 let node = TreeNode::Internal(upper);
7558 *root_guard = Some(Arc::new(RwLock::new(node)));
7559 *self.root_log_lsn.write() = log_lsn;
7560 InRedoResult::Inserted
7561 }
7562 Some(_) => {
7563 if log_lsn > existing_lsn {
7564 let node = TreeNode::Internal(upper);
7565 *root_guard = Some(Arc::new(RwLock::new(node)));
7566 *self.root_log_lsn.write() = log_lsn;
7567 InRedoResult::Replaced
7568 } else {
7569 InRedoResult::Skipped
7570 }
7571 }
7572 }
7573 }
7574
7575 /// Recover a non-root BIN.
7576 ///
7577 /// Implements the three-case currency check from JE
7578 /// `RecoveryManager.recoverChildIN`
7579 /// (RecoveryManager.java lines 1412–1500):
7580 ///
7581 /// 1. Node not in tree: skip (parent logged a later structure that already
7582 /// omits this node, or node was deleted).
7583 /// 2. Physical match (slot LSN == log_lsn): noop — already current.
7584 /// 3. Logical match: another version of the node is in the slot.
7585 /// Replace if tree slot LSN < log_lsn (tree is older), skip otherwise.
7586 fn recover_child_bin(
7587 &self,
7588 log_lsn: noxu_util::Lsn,
7589 bin: BinStub,
7590 ) -> InRedoResult {
7591 let node_id = bin.node_id;
7592 let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7593 else {
7594 // Case 1: not in tree.
7595 return InRedoResult::NotInTree;
7596 };
7597 let mut parent = parent_arc.write();
7598 let TreeNode::Internal(ref mut p) = *parent else {
7599 return InRedoResult::NotInTree;
7600 };
7601 let tree_lsn = p.get_lsn(slot); // T-3
7602 if tree_lsn == log_lsn {
7603 // Case 2: physical match — noop.
7604 InRedoResult::Skipped
7605 } else if tree_lsn < log_lsn {
7606 // Case 3: logical match, tree is older — replace.
7607 // JE `parent.recoverIN(idx, inFromLog, logLsn, lastLoggedSize)`.
7608 let new_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin)));
7609 // Set parent back-pointer on the new node.
7610 {
7611 let mut ng = new_arc.write();
7612 if let TreeNode::Bottom(ref mut b) = *ng {
7613 b.parent = Some(Arc::downgrade(&parent_arc));
7614 }
7615 }
7616 p.set_child(slot, Some(new_arc));
7617 p.set_lsn(slot, log_lsn); // T-3
7618 InRedoResult::Replaced
7619 } else {
7620 // tree_lsn > log_lsn: tree already holds a newer version.
7621 InRedoResult::Skipped
7622 }
7623 }
7624
7625 /// Recover a non-root upper IN.
7626 ///
7627 /// JE `RecoveryManager.recoverChildIN` for a non-BIN node.
7628 fn recover_child_upper_in(
7629 &self,
7630 log_lsn: noxu_util::Lsn,
7631 upper: InNodeStub,
7632 ) -> InRedoResult {
7633 let node_id = upper.node_id;
7634 let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7635 else {
7636 return InRedoResult::NotInTree;
7637 };
7638 let mut parent = parent_arc.write();
7639 let TreeNode::Internal(ref mut p) = *parent else {
7640 return InRedoResult::NotInTree;
7641 };
7642 let tree_lsn = p.get_lsn(slot); // T-3
7643 if tree_lsn == log_lsn {
7644 InRedoResult::Skipped
7645 } else if tree_lsn < log_lsn {
7646 let new_arc = Arc::new(RwLock::new(TreeNode::Internal(upper)));
7647 {
7648 let mut ng = new_arc.write();
7649 if let TreeNode::Internal(ref mut n) = *ng {
7650 n.parent = Some(Arc::downgrade(&parent_arc));
7651 }
7652 }
7653 p.set_child(slot, Some(new_arc));
7654 p.set_lsn(slot, log_lsn); // T-3
7655 InRedoResult::Replaced
7656 } else {
7657 InRedoResult::Skipped
7658 }
7659 }
7660}
7661
/// Outcome of a single `recover_in_redo` call.
///
/// JE traces the same outcomes in `RecoveryManager` debug logging.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InRedoResult {
    /// The tree had no root and the node was installed as the new root.
    Inserted,
    /// The node replaced an older version (root or child slot) in the tree.
    Replaced,
    /// The node was not applied: the tree already holds an equal or newer
    /// version.
    Skipped,
    /// No resident parent slot refers to this node (e.g. the parent logged a
    /// later structure that excludes it).
    NotInTree,
    /// The `node_data` bytes could not be deserialised.
    DeserializeFailed,
}
7678
/// Process-wide counter holding the NEXT node id to hand out.
///
/// This is the SINGLE source of node ids for the whole tree subsystem. The
/// BIN constructor (`bin.rs`) and `node.rs` route through
/// `generate_node_id`, so that after crash recovery a freshly allocated id
/// is always strictly greater than every id present in the recovered log.
///
/// JE ref: `NodeSequence.getNextLocalNodeId` (a single per-env counter) and
/// `IN.nodeId` allocation; `NodeSequence.initRealNodeId` seeds the counter
/// from the recovered `CheckpointEnd.lastLocalNodeId`. The env seeds this
/// counter post-recovery via `seed_node_id_counter`.
static NODE_ID_COUNTER: std::sync::atomic::AtomicU64 =
    std::sync::atomic::AtomicU64::new(1);

/// Allocates and returns a unique node id (the counter's current value),
/// advancing the counter by one.
pub fn generate_node_id() -> u64 {
    NODE_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
}

/// Returns the node id that would be generated next, without allocating it.
///
/// Used by recovery seeding and by tests to assert that no node id is
/// reused after a restart.
pub fn peek_next_node_id_counter() -> u64 {
    NODE_ID_COUNTER.load(std::sync::atomic::Ordering::SeqCst)
}

/// Seeds the counter so that the next generated id is `> last_node_id`.
///
/// Called by `EnvironmentImpl` after recovery with the recovered
/// `use_max_node_id`, mirroring `NodeSequence.initRealNodeId` /
/// `setLastNodeId`: post-restart allocation must never reuse a node id that
/// is already in the log.
///
/// Monotonic: the counter is raised to `last_node_id + 1` (saturating) only
/// if it is currently below that value; it is never lowered.
pub fn seed_node_id_counter(last_node_id: u64) {
    let floor = last_node_id.saturating_add(1);
    // Atomic "raise to at least `floor`".
    NODE_ID_COUNTER.fetch_max(floor, std::sync::atomic::Ordering::SeqCst);
}
7728
7729#[cfg(test)]
7730mod tests {
7731 use super::*;
7732
7733 // ====================================================================
7734 // T-3: LsnRep packed-LSN encoding (IN.entryLsnByteArray / getLsn /
7735 // setLsnInternal, IN.java:1752-1935).
7736 // ====================================================================
7737
/// A freshly created rep is the zero-byte `Empty` variant, and both its
/// first and last slot read back as `NULL_LSN`.
#[test]
fn lsnrep_empty_is_zero_bytes() {
    let empty = LsnRep::new(64);
    assert!(matches!(empty, LsnRep::Empty));
    assert_eq!(empty.memory_size(), 0);
    for &slot in &[0usize, 63] {
        assert_eq!(empty.get(slot), NULL_LSN);
    }
}
7747
/// Eight LSNs in the same file pack into the `Compact` rep, read back
/// exactly, and cost 4 bytes per slot.
#[test]
fn lsnrep_compact_roundtrip_same_file() {
    let expected: Vec<Lsn> =
        (0..8u32).map(|i| Lsn::new(7, 1000 + i)).collect();

    let mut rep = LsnRep::new(8);
    for (slot, lsn) in expected.iter().enumerate() {
        rep.set(slot, *lsn, 8);
    }

    assert!(matches!(rep, LsnRep::Compact { .. }));
    for (slot, lsn) in expected.iter().enumerate() {
        assert_eq!(rep.get(slot), *lsn);
    }
    // 4 bytes per slot: 32 bytes, versus 64 for raw u64 values.
    assert_eq!(rep.memory_size(), 8 * 4);
}
7763
/// Mixing `NULL_LSN` slots with real LSNs keeps the rep `Compact`, and every
/// slot (NULL or not) reads back unchanged.
#[test]
fn lsnrep_null_does_not_force_long() {
    let expected = [Lsn::new(3, 50), NULL_LSN, Lsn::new(3, 60), NULL_LSN];

    let mut rep = LsnRep::new(4);
    for (slot, lsn) in expected.iter().enumerate() {
        rep.set(slot, *lsn, 4);
    }

    assert!(
        matches!(rep, LsnRep::Compact { .. }),
        "NULL slots must NOT force the Long rep"
    );
    for (slot, lsn) in expected.iter().enumerate() {
        assert_eq!(rep.get(slot), *lsn);
    }
}
7782
/// Storing an LSN with a lower file number than those already present keeps
/// the rep `Compact` (the array is re-based) and preserves every slot.
#[test]
fn lsnrep_rebase_on_lower_file_number() {
    // The last value has the lowest file number (8) and is set last.
    let expected = [Lsn::new(10, 5), Lsn::new(12, 6), Lsn::new(8, 7)];

    let mut rep = LsnRep::new(3);
    for (slot, lsn) in expected.iter().enumerate() {
        rep.set(slot, *lsn, 3);
    }

    assert!(matches!(rep, LsnRep::Compact { .. }));
    for (slot, lsn) in expected.iter().enumerate() {
        assert_eq!(rep.get(slot), *lsn);
    }
}
7797
/// File numbers 1 and 1000 (a spread of 999, beyond the 127 limit) force
/// the `Long` fallback, which still round-trips both slots.
#[test]
fn lsnrep_mutates_to_long_on_wide_file_range() {
    let low = Lsn::new(1, 5);
    let high = Lsn::new(1000, 6);

    let mut rep = LsnRep::new(2);
    rep.set(0, low, 2);
    rep.set(1, high, 2);

    assert!(matches!(rep, LsnRep::Long(_)));
    assert_eq!(rep.get(0), low);
    assert_eq!(rep.get(1), high);
}
7809
/// A file offset of 0x00ff_ffff (above MAX_FILE_OFFSET, 0xfffffe) forces
/// the `Long` fallback, and the large-offset slot reads back intact.
#[test]
fn lsnrep_mutates_to_long_on_large_offset() {
    let oversized = Lsn::new(1, 0x00ff_ffff);

    let mut rep = LsnRep::new(2);
    rep.set(0, Lsn::new(1, 10), 2);
    rep.set(1, oversized, 2);

    assert!(matches!(rep, LsnRep::Long(_)));
    assert_eq!(rep.get(1), oversized);
}
7819
/// `insert_shift` opens a slot and moves later slots up; `remove_shift`
/// closes it again and restores the original sequence.
#[test]
fn lsnrep_insert_and_remove_shift() {
    let original = [Lsn::new(2, 1), Lsn::new(2, 2), Lsn::new(2, 3)];
    let inserted = Lsn::new(2, 99);
    let mut rep = LsnRep::from_lsns(&original);

    // Open slot 1 and fill it.
    rep.insert_shift(1, 4);
    rep.set(1, inserted, 4);
    let after_insert = [original[0], inserted, original[1], original[2]];
    for (slot, lsn) in after_insert.iter().enumerate() {
        assert_eq!(rep.get(slot), *lsn);
    }

    // Close slot 1 again.
    rep.remove_shift(1);
    for (slot, lsn) in original.iter().enumerate() {
        assert_eq!(rep.get(slot), *lsn);
    }
}
7841
/// A new tree is empty, reports the database id it was created with, and
/// has recorded no root splits.
#[test]
fn test_empty_tree() {
    let fresh = Tree::new(1, 128);
    assert!(fresh.is_empty());
    assert_eq!(fresh.get_database_id(), 1);
    assert_eq!(fresh.get_root_splits(), 0);
}
7849
/// REC-F2 reproduce-first: `redo_insert` must be idempotent with respect to
/// slot currency.
///
/// JE `RecoveryManager.redo()` (line ~2512/2544) replaces a slot only when
/// logrecLsn > treeLsn. Replaying an OLDER committed LN for the same key
/// must neither revert the slot's data nor move its LSN backward, while a
/// strictly NEWER LSN must still replace it.
#[test]
fn test_redo_insert_older_lsn_does_not_overwrite_newer_slot() {
    let tree = Tree::new(1, 128);
    let key = b"k".to_vec();
    let lsn_x = Lsn::new(5, 500);
    let lsn_y = Lsn::new(2, 200); // Y < X
    let lsn_z = Lsn::new(9, 900); // Z > X

    // Version at X first (e.g. the BIN-logged value), then an older replay
    // at Y for the same key.
    tree.redo_insert(&key, b"new", lsn_x).unwrap();
    tree.redo_insert(&key, b"old", lsn_y).unwrap();

    // The version written at X must survive, data and LSN alike.
    let after_old = tree.search_with_data(&key).expect("key present");
    assert!(after_old.found);
    assert_eq!(
        after_old.data.as_deref(),
        Some(&b"new"[..]),
        "older-LSN redo reverted committed data"
    );
    assert_eq!(
        after_old.lsn,
        lsn_x.as_u64(),
        "older-LSN redo reset slot LSN backward"
    );

    // Replace-only-when log_lsn > slot_lsn (JE lsnCmp > 0): a newer redo at
    // Z wins.
    tree.redo_insert(&key, b"newest", lsn_z).unwrap();
    let after_new = tree.search_with_data(&key).expect("key present");
    assert_eq!(after_new.data.as_deref(), Some(&b"newest"[..]));
    assert_eq!(after_new.lsn, lsn_z.as_u64());
}
7890
/// Inserting one key reports a new insert, makes the tree non-empty, and
/// the key can then be located by `search`.
#[test]
fn test_insert_single() {
    let tree = Tree::new(1, 128);
    let key = b"testkey".to_vec();

    let inserted =
        tree.insert(key.clone(), b"testdata".to_vec(), Lsn::new(1, 100));
    // `Ok(true)` means the key was newly inserted.
    assert!(matches!(inserted, Ok(true)));
    assert!(!tree.is_empty());

    // The key must now be reachable through a search.
    let located = tree.search(&key).expect("search result");
    assert!(located.exact_parent_found || !located.child_not_resident);
}
7910
/// Four distinct keys are each reported as new inserts, and `search`
/// returns a result for every one of them afterwards.
#[test]
fn test_insert_multiple() {
    let tree = Tree::new(1, 128);
    let keys: [&[u8]; 4] = [b"apple", b"banana", b"cherry", b"date"];

    for (i, key) in keys.iter().enumerate() {
        let payload = format!("data{}", i).into_bytes();
        let lsn = Lsn::new(1, 100 + (i as u32) * 10);
        let inserted = tree.insert(key.to_vec(), payload, lsn);
        // Every key is distinct, so each call is a new insert.
        assert!(matches!(inserted, Ok(true)));
    }

    for key in &keys {
        assert!(tree.search(key).is_some());
    }
}
7936
/// Re-inserting an existing key succeeds but is reported as an update
/// (`Ok(false)`) rather than a new insert (`Ok(true)`).
#[test]
fn test_insert_duplicate_key() {
    let tree = Tree::new(1, 128);
    let key = b"duplicate".to_vec();

    // First write of the key: a brand-new record.
    let first = tree.insert(key.clone(), b"first".to_vec(), Lsn::new(1, 100));
    assert!(matches!(first, Ok(true)));

    // Same key again with a later LSN: an update of the existing record.
    let second = tree.insert(key, b"second".to_vec(), Lsn::new(1, 200));
    assert!(matches!(second, Ok(false)));
}
7956
/// Searching a tree that has never been written to yields no result.
#[test]
fn test_search_empty_tree() {
    let tree = Tree::new(1, 128);
    let missing = b"noexist".to_vec();

    assert!(tree.search(&missing).is_none());
}
7965
/// `get_first_node` / `get_last_node` return nothing on an empty tree and
/// slot indices 0 and 2 once three keys have been inserted.
#[test]
fn test_first_and_last_node() {
    let tree = Tree::new(1, 128);

    // Nothing inserted yet: neither end of the tree exists.
    assert!(tree.get_first_node().is_none());
    assert!(tree.get_last_node().is_none());

    // Insert the single-byte keys "a", "b", "c".
    for (seq, byte) in (0u32..).zip(b"abc") {
        let payload = format!("data{}", seq).into_bytes();
        tree.insert(vec![*byte], payload, Lsn::new(1, 100 + seq * 10))
            .unwrap();
    }

    // Both ends now resolve, at the first and last of the three slots.
    assert_eq!(tree.get_first_node().map(|pos| pos.index), Some(0));
    assert_eq!(tree.get_last_node().map(|pos| pos.index), Some(2));
}
7991
/// Consecutive calls to `generate_node_id` return strictly increasing ids.
#[test]
fn test_node_id_generation() {
    // Array elements are evaluated left to right, so this is generation order.
    let ids = [generate_node_id(), generate_node_id(), generate_node_id()];

    assert!(ids.windows(2).all(|pair| pair[1] > pair[0]));
}
8001
/// `is_bin()` distinguishes the `Bottom` variant from `Internal`, and
/// `level()` reports the level each stub was constructed with.
#[test]
fn test_tree_node_is_bin() {
    // An empty bottom-level stub.
    let bottom_stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        cursor_count: 0,
        expiration_in_hours: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        lsn_rep: LsnRep::Empty,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let bin = TreeNode::Bottom(bottom_stub);
    assert!(bin.is_bin());
    assert_eq!(bin.level(), BIN_LEVEL);

    // An empty upper-level stub.
    let upper_stub = InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL + 2,
        parent: None,
        generation: 0,
        dirty: false,
        lsn_rep: LsnRep::Empty,
        targets: TargetRep::None,
        entries: vec![],
    };
    let internal = TreeNode::Internal(upper_stub);
    assert!(!internal.is_bin());
    assert_eq!(internal.level(), MAIN_LEVEL + 2);
}
8038
/// `find_entry` on a BIN holding `key0`..`key4`: an exact probe reports the
/// slot with the `EXACT_MATCH` bit set; a non-exact probe for an absent key
/// reports slot 2 without that bit.
#[test]
fn test_find_entry() {
    // Five sorted keys, each paired with a live slot carrying empty data.
    let keys: Vec<Vec<u8>> = (0..5)
        .map(|i| format!("key{}", i).into_bytes())
        .collect();
    let entries: Vec<_> = keys
        .iter()
        .map(|_| BinEntry {
            data: Some(vec![]),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        })
        .collect();

    let bin = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        cursor_count: 0,
        expiration_in_hours: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        lsn_rep: LsnRep::Empty,
        entries,
        keys: KeyRep::from_keys(keys),
        key_prefix: Vec::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });

    // Present key: low 16 bits carry the slot, EXACT_MATCH flags the hit.
    let hit = bin.find_entry(b"key2", false, true);
    assert_eq!(hit & 0xFFFF, 2);
    assert_ne!(hit & EXACT_MATCH, 0);

    // Absent key, non-exact mode: "key15" sorts between key1 and key2.
    let miss = bin.find_entry(b"key15", false, false);
    assert_eq!(miss & 0xFFFF, 2);
    assert_eq!(miss & EXACT_MATCH, 0);
}
8082
/// Inserting one record more than `max_entries_per_node` must succeed: the
/// tree is expected to split instead of returning an error, and the extra key
/// must remain findable by exact match.
#[test]
fn test_insert_until_full() {
    // Fanout 3 keeps the node small so the fourth insert exercises a split.
    let tree = Tree::new(1, 3);

    // Fill the node to capacity.
    for seq in 0..3u32 {
        let outcome = tree.insert(
            format!("key{}", seq).into_bytes(),
            format!("data{}", seq).into_bytes(),
            Lsn::new(1, 100 + seq),
        );
        assert!(outcome.is_ok(), "insert {} should succeed", seq);
    }

    // One past capacity: must split and still report a new insert.
    let overflow_key = b"key3".to_vec();
    let outcome =
        tree.insert(overflow_key.clone(), b"data3".to_vec(), Lsn::new(1, 103));
    assert!(
        outcome.is_ok(),
        "insert after full should trigger split and succeed"
    );
    assert!(outcome.unwrap(), "should be a new insert");

    // The key that triggered the split must still be reachable.
    let located = tree.search(&overflow_key);
    assert!(located.is_some(), "key3 must be searchable after split");
    assert!(
        located.unwrap().exact_parent_found,
        "key3 must be found exactly"
    );
}
8114
/// F8 regression: the memory counter must be balanced across an insert and a
/// delete of the same record. Insert charges
/// `key + data + BIN_ENTRY_OVERHEAD`; delete must credit the same amount so
/// the counter returns to its starting value. (Previously delete omitted the
/// data length, leaking `data_len` per delete and biasing the evictor's
/// over-budget view.)
#[test]
fn test_memory_counter_balanced_on_insert_delete_f8() {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicI64, Ordering};

    let counter = Arc::new(AtomicI64::new(0));
    let mut tree = Tree::new(1, 16);
    tree.set_memory_counter(Arc::clone(&counter));

    let key = b"a-key".to_vec();
    let data = vec![0u8; 200]; // non-trivial data length
    let expected_charge = (key.len() + data.len() + BIN_ENTRY_OVERHEAD) as i64;

    tree.insert(key.clone(), data, Lsn::new(0, 10)).unwrap();
    let charged = counter.load(Ordering::Relaxed);
    assert!(charged > 0, "insert must increase the counter");
    assert_eq!(
        charged, expected_charge,
        "insert accounts key + data + per-slot BinEntry overhead"
    );

    assert!(tree.delete(&key));
    let remaining = counter.load(Ordering::Relaxed);
    assert_eq!(
        remaining, 0,
        "F8: delete must subtract key + data + BIN_ENTRY_OVERHEAD, returning the counter to its pre-insert value (no data_len leak)"
    );
}
8146
/// EV-13 (pass-post): detaching a node must really release the parent IN's
/// strong `Arc` to the child, not just report bytes. Before the fix the
/// evictor credited the node's size and unlisted it from the LRU, yet the
/// parent slot kept a strong `Arc`, so the node was never freed (a phantom
/// free) and the budget was over-credited.
///
/// Checked here, after `detach_node_by_id`:
/// - the returned byte count equals the child's `budgeted_memory_size()`,
/// - the parent slot reports no child but keeps a non-null LSN,
/// - the test's own handle is the only remaining strong reference.
///
/// JE ref: `IN.detachNode` (`setTarget(idx, null)`) / `Evictor.evict`.
#[test]
fn test_ev13_detach_actually_frees_child() {
    // A tiny fanout forces a root split, giving a real IN parent with BIN
    // children of the kind the evictor would target.
    let tree = Tree::new(7, 4);
    for step in 0u8..12 {
        let lsn = Lsn::new(1, u32::from(step) + 1);
        tree.insert(vec![b'a' + step], vec![step; 8], lsn).unwrap();
    }

    // Locate the eviction target: the first resident child of the root IN.
    let root = tree.get_root().expect("tree must have a root");
    let parent_arc = Arc::clone(&root);
    let (child_idx, bin_id, expected_bytes) = {
        let root_guard = root.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("root must be an IN after split");
        };
        let (slot, resident) = root_in
            .first_resident_child()
            .expect("root must have a resident child");
        let child_guard = resident.read();
        let node_id = match &*child_guard {
            TreeNode::Bottom(bin) => bin.node_id,
            TreeNode::Internal(upper) => upper.node_id,
        };
        let measured = child_guard.budgeted_memory_size();
        (slot, node_id, measured)
    };

    // Keep an external strong handle on the child so the drop of the
    // parent's reference becomes observable through `strong_count`.
    let child_arc = match &*parent_arc.read() {
        TreeNode::Internal(parent_in) => {
            Arc::clone(parent_in.child_ref(child_idx).unwrap())
        }
        TreeNode::Bottom(_) => unreachable!(),
    };
    // Exactly two owners at this point: the parent slot and this test.
    assert_eq!(
        Arc::strong_count(&child_arc),
        2,
        "precondition: parent slot + test handle hold the child"
    );

    let freed = tree.detach_node_by_id(bin_id);

    // 1. The credited bytes are the measured size (no phantom credit).
    assert_eq!(
        freed, expected_bytes,
        "detach must credit the node's real measured heap size"
    );

    // 2. The parent slot no longer holds the child (JE setTarget(idx, null)),
    //    while the slot itself (key + LSN) stays so it can be re-fetched.
    match &*parent_arc.read() {
        TreeNode::Internal(parent_in) => {
            assert!(
                parent_in.child_is_none(child_idx),
                "EV-13: parent slot must be detached (child == None)"
            );
            assert!(
                !parent_in.get_lsn(child_idx).is_null(),
                "detach keeps the slot LSN so the node can be re-fetched"
            );
        }
        TreeNode::Bottom(_) => unreachable!(),
    }

    // 3. Only the test handle is left, so the parent really dropped its Arc
    //    and the node is freed once `child_arc` goes away. Before EV-13 this
    //    count stayed at 2 (the phantom free).
    assert_eq!(
        Arc::strong_count(&child_arc),
        1,
        "EV-13: detach must drop the parent's strong Arc (no phantom free)"
    );
}
8240
/// EV-13: `detach_node_by_id` must leave the memory counter untouched. The
/// evictor owns that bookkeeping (via `Arbiter::release_memory`), so a second
/// credit here would push `cache_usage` below reality.
#[test]
fn test_ev13_detach_does_not_touch_counter() {
    use std::sync::atomic::{AtomicI64, Ordering};

    let counter = Arc::new(AtomicI64::new(0));
    let mut tree = Tree::new(8, 4);
    tree.set_memory_counter(Arc::clone(&counter));
    for step in 0u8..12 {
        let lsn = Lsn::new(1, u32::from(step) + 1);
        tree.insert(vec![b'a' + step], vec![step; 8], lsn).unwrap();
    }
    let before = counter.load(Ordering::Relaxed);

    // Take the id of the first resident child of the root IN.
    let root = tree.get_root().unwrap();
    let bin_id = {
        let root_guard = root.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            unreachable!()
        };
        let first_child = root_in
            .resident_children()
            .into_iter()
            .next()
            .expect("resident child");
        let child_guard = first_child.read();
        match &*child_guard {
            TreeNode::Bottom(bin) => bin.node_id,
            TreeNode::Internal(upper) => upper.node_id,
        }
    };

    let freed = tree.detach_node_by_id(bin_id);
    assert!(freed > 0, "detach must free a resident child");

    let after = counter.load(Ordering::Relaxed);
    assert_eq!(
        after, before,
        "EV-13: detach must not change the counter (evictor credits once)"
    );
}
8284
/// EV-13: asking to detach the root, or an id the tree does not know, does
/// nothing and returns 0.
#[test]
fn test_ev13_detach_root_or_missing_is_noop() {
    let tree = Tree::new(9, 4);
    for step in 0u8..12 {
        let lsn = Lsn::new(1, u32::from(step) + 1);
        tree.insert(vec![b'a' + step], vec![step; 8], lsn).unwrap();
    }

    // The root's id, whichever variant the root happens to be.
    let root_id = {
        let root = tree.get_root().unwrap();
        let root_guard = root.read();
        match &*root_guard {
            TreeNode::Bottom(bin) => bin.node_id,
            TreeNode::Internal(upper) => upper.node_id,
        }
    };

    // Both requests must be rejected without freeing anything.
    let cases = [
        (root_id, "root has no parent IN -> detach is a no-op"),
        (u64::MAX, "unknown node id -> detach is a no-op"),
    ];
    for (node_id, reason) in cases {
        assert_eq!(tree.detach_node_by_id(node_id), 0, "{}", reason);
    }
}
8316
/// DBI-23 (pass-post): the incrementally maintained `memory_counter` must
/// track the tree's real in-memory heap closely, rather than the old
/// `key + data + 48` lower bound.
///
/// JE keeps `inMemorySize` (`IN.getBudgetedMemorySize`) in step with the
/// per-node `computeMemorySize`, so the over-budget arbiter sees the real
/// figure and eviction fires on time. The earlier Noxu live path
/// under-charged every BIN slot (48 vs the 64-byte `BinEntry` struct) and
/// ignored the fixed node-struct overhead, so the counter ran below the real
/// heap and the evictor under-fired.
///
/// The oracle is `total_budgeted_memory` (the authoritative walk-and-sum).
/// The live counter must lie within tolerance of it; the remaining gap is the
/// per-node fixed struct overhead (BinStub/InNodeStub), a small fraction for
/// non-trivial entries.
#[test]
fn test_dbi23_live_counter_approximates_real_heap() {
    use std::sync::atomic::{AtomicI64, Ordering};

    let counter = Arc::new(AtomicI64::new(0));
    let mut tree = Tree::new(42, 32);
    tree.set_memory_counter(Arc::clone(&counter));

    // Insert N entries with realistic sizes, accumulating the floor the live
    // counter must reach: key + data + BIN_ENTRY_OVERHEAD for every slot.
    // This floor deliberately does NOT assume the pre-compaction 64-byte slot:
    // after the T-2/T-3 compactions moved the per-slot key/LSN out of
    // `BinEntry` into the node-level reps, the per-slot live charge is
    // `key + data + size_of::<BinEntry>() + 4` (the packed LSN slot), with
    // the dominant key and data bytes still charged in full.
    let n = 400u32;
    let mut new_lower_bound = 0u64;
    for i in 0..n {
        let key = format!("key-{i:08}").into_bytes(); // 12 bytes
        let data = vec![0u8; 64]; // 64 bytes
        new_lower_bound += (key.len() + data.len() + BIN_ENTRY_OVERHEAD) as u64;
        tree.insert(key, data, Lsn::new(1, i + 1)).unwrap();
    }

    let real = tree.total_budgeted_memory();
    let live = counter.load(Ordering::Relaxed) as u64;

    assert!(
        live >= new_lower_bound,
        "DBI-23: live counter ({live}) must be >= the per-slot-correct \
         lower bound ({new_lower_bound})"
    );

    // Within tolerance of the real heap: at least 80% of it and never above
    // it. The residual gap is the per-node fixed struct overhead, which is
    // intentionally not tracked incrementally.
    let lower = real * 80 / 100;
    assert!(
        (lower..=real).contains(&live),
        "DBI-23: live counter ({live}) must approximate real heap ({real}) \
         within tolerance [{lower}, {real}]"
    );
}
8379
/// Deleting a stored key succeeds exactly once: the first delete returns
/// `true`, a repeat on the same key returns `false`.
#[test]
fn test_delete_existing_key() {
    let tree = Tree::new(1, 128);
    let key: &[u8] = b"remove_me";
    tree.insert(key.to_vec(), b"val".to_vec(), Lsn::new(1, 10))
        .unwrap();

    // The record exists, so the first delete removes it.
    let first_attempt = tree.delete(key);
    assert!(first_attempt);

    // The key is gone now, so there is nothing left to delete.
    let second_attempt = tree.delete(key);
    assert!(!second_attempt);
}
8391
/// Deleting a key that was never inserted returns `false`, even when the
/// tree holds other records.
#[test]
fn test_delete_nonexistent_key() {
    let tree = Tree::new(1, 128);
    tree.insert(b"a".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let removed = tree.delete(b"zzz");
    assert!(!removed);
}
8399
/// Deleting from a tree that has never been written to returns `false`.
#[test]
fn test_delete_empty_tree() {
    let tree = Tree::new(1, 128);
    let removed = tree.delete(b"nothing");
    assert!(!removed);
}
8405
/// Deleting every record leaves the tree structurally present: `is_empty()`
/// stays `false`, but `get_first_node()` finds no entry.
#[test]
fn test_delete_all_entries_makes_bin_empty() {
    let tree = Tree::new(1, 128);
    for (seq, (key, value)) in (1u32..).zip([(b"x", b"1"), (b"y", b"2")]) {
        tree.insert(key.to_vec(), value.to_vec(), Lsn::new(1, seq))
            .unwrap();
    }

    // Remove both records, in insertion order.
    assert!(tree.delete(b"x"));
    assert!(tree.delete(b"y"));

    // A root (holding an empty BIN) remains, so the tree is not "empty".
    assert!(!tree.is_empty());
    // With no entries left there is no first node to return.
    assert!(tree.get_first_node().is_none());
}
8420
/// A fresh tree has no root; after `set_root` the root is reported present.
#[test]
fn test_set_root_and_get_root() {
    let tree = Tree::new(1, 128);
    assert!(tree.get_root().is_none());

    // An empty bottom-level node with a freshly generated id.
    let empty_bin = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        cursor_count: 0,
        expiration_in_hours: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        lsn_rep: LsnRep::Empty,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    tree.set_root(TreeNode::Bottom(empty_bin));
    assert!(tree.get_root().is_some());
}
8447
8448 // ========================================================================
8449 // Split / multi-level insert tests (new)
8450 // ========================================================================
8451
/// Inserting enough keys to fill the root IN makes the root itself split,
/// producing a tree with three or more levels.
///
/// Sizing argument for `max_entries_per_node = 4`:
/// - each BIN holds 4 entries before it is split;
/// - the level-2 root IN holds up to 4 BIN children;
/// - filling those 4 BINs (16 entries) and adding a 17th forces the root IN
///   to split, creating a level-3 root.
#[test]
fn test_insert_forces_root_split() {
    let tree = Tree::new(1, 4);

    // 20 inserts: past the 17 that the sizing argument above calls for.
    for seq in 0u32..20 {
        let outcome = tree.insert(
            format!("key{:04}", seq).into_bytes(),
            format!("data{}", seq).into_bytes(),
            Lsn::new(1, 100 + seq),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", seq);
    }

    // The split counter must have moved.
    let root_splits = tree.get_root_splits();
    assert!(
        root_splits > 0,
        "expected at least one root split after 20 inserts with fanout 4"
    );

    // The root must now sit above level 2, i.e. the tree has 3+ levels.
    let root_level = tree.get_root().unwrap().read().level();
    assert!(
        root_level > (MAIN_LEVEL | 2),
        "root level {} must be > level-2 after root split",
        root_level
    );
}
8489
/// Bulk insert of 1000 keys in ascending order; every key must afterwards be
/// found by an exact-match search.
#[test]
fn test_insert_many_keys() {
    let tree = Tree::new(1, 8);
    let n = 1000u32;
    let key_for = |i: u32| format!("key{:08}", i).into_bytes();

    for i in 0..n {
        let outcome =
            tree.insert(key_for(i), format!("data{}", i).into_bytes(), Lsn::new(1, i));
        assert!(outcome.is_ok(), "insert {} must succeed", i);
    }

    // Every inserted key must resolve to an exact hit.
    for i in 0..n {
        let located = tree.search(&key_for(i));
        assert!(
            matches!(located, Some(ref hit) if hit.exact_parent_found),
            "key{:08} must be found after bulk insert",
            i
        );
    }
}
8515
/// Inserts 500 keys in descending order (a simple stand-in for a non-sorted
/// sequence) and checks that every key is found by an exact-match search.
#[test]
fn test_insert_random_keys() {
    let tree = Tree::new(1, 8);
    let n = 500u32;
    let key_for = |i: u32| format!("rkey{:08}", i).into_bytes();

    // Highest key first, so each new key sorts before everything stored.
    for i in (0..n).rev() {
        let outcome =
            tree.insert(key_for(i), format!("data{}", i).into_bytes(), Lsn::new(1, i));
        assert!(outcome.is_ok(), "insert {} must succeed", i);
    }

    for i in 0..n {
        let located = tree.search(&key_for(i));
        assert!(
            matches!(located, Some(ref hit) if hit.exact_parent_found),
            "rkey{:08} must be found",
            i
        );
    }
}
8542
/// No key may be lost to a split: after many split-inducing inserts, every
/// inserted key must still be found by an exact-match search.
#[test]
fn test_split_preserves_all_keys() {
    // Tiny fanout to maximise split frequency.
    let tree = Tree::new(1, 3);
    let n = 60u32;

    let keys: Vec<Vec<u8>> = (0..n)
        .map(|i| format!("sk{:04}", i).into_bytes())
        .collect();

    for (i, key) in (0..n).zip(&keys) {
        let outcome =
            tree.insert(key.clone(), format!("d{}", i).into_bytes(), Lsn::new(1, i));
        assert!(outcome.is_ok(), "insert {} must not fail", i);
    }

    // With all inserts (and the splits they caused) done, nothing is missing.
    for key in &keys {
        let located = tree.search(key);
        assert!(
            matches!(located, Some(ref hit) if hit.exact_parent_found),
            "key {:?} must survive all splits",
            std::str::from_utf8(key).unwrap_or("?")
        );
    }
}
8572
/// The tree must gain levels as inserts cause splits: after 40 inserts with
/// fanout 4 the root has split at least once and sits above level 2.
#[test]
fn test_tree_height_grows() {
    let tree = Tree::new(1, 4);

    // With fanout 4 a level-2 root IN holds 4 children; 40 keys are enough to
    // make the root itself split, so a level-3 node appears.
    let n = 40u32;
    for i in 0..n {
        tree.insert(
            format!("hk{:08}", i).into_bytes(),
            format!("d{}", i).into_bytes(),
            Lsn::new(1, i),
        )
        .unwrap();
    }

    // The split counter must have moved.
    let root_splits = tree.get_root_splits();
    assert!(
        root_splits > 0,
        "expected root to have split at least once for {} keys with fanout 4",
        n
    );

    // The root must be above level 2, i.e. the tree grew past two levels.
    let level_2 = MAIN_LEVEL | 2;
    let root_level = tree.get_root().unwrap().read().level();
    assert!(
        root_level > level_2,
        "root level {} must be > {} after enough inserts",
        root_level,
        level_2
    );
}
8607
/// Exact-mode `find_entry` on an Internal node with keys `k0`..`k3`: a
/// present key reports its slot with `EXACT_MATCH` set; an absent key
/// returns -1.
#[test]
fn test_find_entry_on_internal_node() {
    let entries: Vec<_> = (0..4)
        .map(|i| InEntry { key: format!("k{}", i).into_bytes() })
        .collect();
    let internal = TreeNode::Internal(InNodeStub {
        node_id: 1,
        level: MAIN_LEVEL + 2,
        parent: None,
        generation: 0,
        dirty: false,
        lsn_rep: LsnRep::Empty,
        targets: TargetRep::None,
        entries,
    });

    // Present key: flagged as exact, slot in the low 16 bits.
    let hit = internal.find_entry(b"k2", false, true);
    assert_ne!(hit & EXACT_MATCH, 0);
    assert_eq!(hit & 0xFFFF, 2);

    // Absent key in exact mode: no slot at all.
    let miss = internal.find_entry(b"kx", false, true);
    assert_eq!(miss, -1);
}
8634
/// St-H5: non-exact `find_entry` on an Internal node must return the FLOOR
/// child slot (the largest entry ≤ key), not the insertion point. The node
/// holds `k0`, `k1`, `k2`, `k3`; slot 0 is the leftmost child.
#[test]
fn test_find_entry_internal_nonexact_returns_floor() {
    let entries: Vec<_> = (0..4)
        .map(|i| InEntry { key: format!("k{}", i).into_bytes() })
        .collect();
    let internal = TreeNode::Internal(InNodeStub {
        node_id: 1,
        level: MAIN_LEVEL + 2,
        parent: None,
        generation: 0,
        dirty: false,
        lsn_rep: LsnRep::Empty,
        targets: TargetRep::None,
        entries,
    });

    // (probe, expected floor slot)
    let floor_cases = [
        // Below every separator: the leftmost child.
        (&b"a"[..], 0),
        // Between k1 and k2: floors to k1.
        (&b"k1x"[..], 1),
        // Above every separator: the last slot, k3.
        (&b"zzz"[..], 3),
    ];
    for (probe, floor_slot) in floor_cases {
        assert_eq!(internal.find_entry(probe, false, false) & 0xFFFF, floor_slot);
    }

    // An exact match is still reported as such, at its own slot.
    let hit = internal.find_entry(b"k2", false, false);
    assert_ne!(hit & EXACT_MATCH, 0);
    assert_eq!(hit & 0xFFFF, 2);
}
8666
8667 // ========================================================================
8668 // New tests: dirty tracking, generation, parent pointers, log size, stats
8669 // ========================================================================
8670
/// After an insert, the BIN that received the record must be dirty.
///
/// JE counterpart per the original note: `Tree.insertLN()` calls
/// `bin.setDirty(true)` after each insert.
#[test]
fn test_insert_marks_bin_dirty() {
    let tree = Tree::new(1, 128);
    tree.insert(b"key1".to_vec(), b"val1".to_vec(), Lsn::new(1, 1))
        .unwrap();

    // The root is an upper IN; the BIN is the child in its slot 0.
    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root")
        };
        root_in.get_child(0).unwrap()
    };

    assert!(bin_arc.read().is_dirty(), "BIN must be dirty after insert");
}
8693
/// Overwriting an existing key leaves its BIN dirty.
#[test]
fn test_update_keeps_bin_dirty() {
    let tree = Tree::new(1, 128);
    // Same key written twice: the second write is an update.
    for (seq, value) in (1u32..).zip([b"v1", b"v2"]) {
        tree.insert(b"k".to_vec(), value.to_vec(), Lsn::new(1, seq))
            .unwrap();
    }

    // The BIN is the child in slot 0 of the root IN.
    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root")
        };
        root_in.get_child(0).unwrap()
    };

    assert!(bin_arc.read().is_dirty(), "BIN must be dirty after update");
}
8713
/// Deleting a key must mark its BIN dirty, even when the dirty flag had been
/// cleared beforehand.
#[test]
fn test_delete_marks_bin_dirty() {
    let tree = Tree::new(1, 128);
    tree.insert(b"del".to_vec(), b"val".to_vec(), Lsn::new(1, 1))
        .unwrap();

    // Fetches the BIN in slot 0 of the (Internal) root, afresh on each call.
    let first_bin = || {
        let root_arc = tree.get_root().unwrap();
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root")
        };
        root_in.get_child(0).unwrap()
    };

    // Clear the flag by hand so the delete has to be what sets it again.
    {
        let bin_arc = first_bin();
        bin_arc.write().set_dirty(false);
        assert!(!bin_arc.read().is_dirty());
    }

    tree.delete(b"del");

    let bin_arc = first_bin();
    assert!(bin_arc.read().is_dirty(), "BIN must be dirty after delete");
}
8746
/// After the first insert, the BIN's parent pointer must resolve to the very
/// same node as the tree's root IN.
#[test]
fn test_bin_parent_pointer_set_on_initial_insert() {
    let tree = Tree::new(1, 128);
    tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
        .unwrap();

    // The BIN is the child in slot 0 of the root IN.
    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root")
        };
        root_in.get_child(0).unwrap()
    };

    let Some(parent_weak) = bin_arc.read().get_parent() else {
        panic!("BIN must have a parent pointer")
    };

    // The weak pointer must upgrade, and to the root allocation itself.
    let parent_arc = parent_weak.upgrade().unwrap();
    assert!(
        Arc::ptr_eq(&parent_arc, &root_arc),
        "BIN parent must be the root IN"
    );
}
8772
/// `set_dirty` followed by `is_dirty` reads back the written value on both
/// node variants.
#[test]
fn test_dirty_flag_roundtrip() {
    let mut bin_node = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        cursor_count: 0,
        expiration_in_hours: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        lsn_rep: LsnRep::Empty,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });
    // Starts clean, then follows each write: set, then cleared again.
    assert!(!bin_node.is_dirty());
    for flag in [true, false] {
        bin_node.set_dirty(flag);
        assert_eq!(bin_node.is_dirty(), flag);
    }

    let mut in_node = TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        parent: None,
        generation: 0,
        dirty: false,
        lsn_rep: LsnRep::Empty,
        targets: TargetRep::None,
        entries: vec![],
    });
    // Starts clean and becomes dirty once set.
    assert!(!in_node.is_dirty());
    in_node.set_dirty(true);
    assert!(in_node.is_dirty());
}
8814
/// `set_generation` followed by `get_generation` reads back the written
/// value on both node variants.
#[test]
fn test_generation_roundtrip() {
    let mut bin_node = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        cursor_count: 0,
        expiration_in_hours: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        lsn_rep: LsnRep::Empty,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });
    // The constructed value is visible first, then the overwritten one.
    let initial = bin_node.get_generation();
    assert_eq!(initial, 0);
    bin_node.set_generation(42);
    let updated = bin_node.get_generation();
    assert_eq!(updated, 42);

    let mut in_node = TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        parent: None,
        generation: 0,
        dirty: false,
        lsn_rep: LsnRep::Empty,
        targets: TargetRep::None,
        entries: vec![],
    });
    in_node.set_generation(99);
    let updated = in_node.get_generation();
    assert_eq!(updated, 99);
}
8853
/// `log_size()` must equal the length of what `write_to_bytes()` produces,
/// for a BIN and for an IN that each carry two entries.
#[test]
fn test_log_size_matches_bytes_len() {
    // BIN: one slot with data, one without.
    let with_data = BinEntry {
        data: Some(b"d1".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let without_data = BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let bin_node = TreeNode::Bottom(BinStub {
        node_id: 7,
        level: BIN_LEVEL,
        parent: None,
        generation: 5,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        cursor_count: 0,
        expiration_in_hours: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        lsn_rep: LsnRep::Empty,
        entries: vec![with_data, without_data],
        keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
        key_prefix: Vec::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });
    let bin_bytes = bin_node.write_to_bytes();
    assert_eq!(bin_node.log_size(), bin_bytes.len());

    // IN: an empty first key followed by a non-empty one.
    let in_node = TreeNode::Internal(InNodeStub {
        node_id: 8,
        level: MAIN_LEVEL | 2,
        parent: None,
        generation: 0,
        dirty: false,
        lsn_rep: LsnRep::Empty,
        targets: TargetRep::None,
        entries: vec![
            InEntry { key: vec![] },
            InEntry { key: b"mid".to_vec() },
        ],
    });
    let in_bytes = in_node.write_to_bytes();
    assert_eq!(in_node.log_size(), in_bytes.len());
}
8907
/// The serialized form starts with the node id in big-endian order and
/// carries the dirty flag at byte offset 16.
#[test]
fn test_write_to_bytes_encodes_node_id_and_dirty() {
    const NODE_ID: u64 = 0xDEAD_BEEF_0000_0001;

    let node = TreeNode::Bottom(BinStub {
        node_id: NODE_ID,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        cursor_count: 0,
        expiration_in_hours: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        lsn_rep: LsnRep::Empty,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });
    let bytes = node.write_to_bytes();

    // Bytes 0..8 hold the node id, most significant byte first.
    assert_eq!(bytes[0..8], NODE_ID.to_be_bytes());

    // Offset 16 follows node_id (8 bytes), level (4) and n_entries (4).
    let dirty_byte = bytes[16];
    assert_eq!(dirty_byte, 1u8, "dirty flag must be 1");
}
8936
/// A BIN holding one entry must report a strictly larger `log_size()` than
/// an otherwise identical BIN with no entries.
#[test]
fn test_log_size_grows_with_entries() {
    // Both nodes differ only in id, slot vector and key representation.
    let make_bin = |node_id, entries, keys| {
        TreeNode::Bottom(BinStub {
            node_id,
            level: BIN_LEVEL,
            entries,
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: NULL_LSN,
            last_delta_lsn: NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
            lsn_rep: LsnRep::Empty,
            keys,
            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        })
    };

    let empty = make_bin(1, vec![], KeyRep::new());

    let single_slot = BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let with_entry = make_bin(
        2,
        vec![single_slot],
        KeyRep::from_keys(vec![b"longkey_here".to_vec()]),
    );

    let grew = with_entry.log_size() > empty.log_size();
    assert!(grew, "log_size must grow when entries are added");
}
8986
/// `Tree::propagate_dirty_to_root` called on a BIN must leave the root IN,
/// reached through the BIN's parent link, marked dirty.
#[test]
fn test_propagate_dirty_to_root() {
    // Hand-assembled two-level tree. The BIN is created first; its parent
    // link is patched in once the root exists.
    let leaf_stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(leaf_stub)));

    let root_stub = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&bin_arc))]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(root_stub)));

    // Link child -> parent with a weak reference.
    let root_link = Arc::downgrade(&root_arc);
    bin_arc.write().set_parent(Some(root_link));

    // Precondition: the root starts clean.
    let dirty_before = root_arc.read().is_dirty();
    assert!(!dirty_before);

    Tree::propagate_dirty_to_root(&bin_arc);

    let dirty_after = root_arc.read().is_dirty();
    assert!(
        dirty_after,
        "root must be dirty after propagate_dirty_to_root"
    );
}
9036
/// A freshly constructed tree reports `TreeStats::default()` from
/// `collect_stats()`.
#[test]
fn test_collect_stats_empty_tree() {
    let expected = TreeStats::default();
    let tree = Tree::new(1, 128);
    assert_eq!(tree.collect_stats(), expected);
}
9044
/// After a single insert the stats must show one BIN, one upper IN, a height
/// of 2 and at least one entry.
#[test]
fn test_collect_stats_single_insert() {
    let tree = Tree::new(1, 128);
    let outcome = tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1));
    outcome.unwrap();

    let TreeStats {
        n_bins,
        n_ins,
        height,
        n_entries,
        ..
    } = tree.collect_stats();

    assert_eq!(n_bins, 1, "must have 1 BIN");
    assert_eq!(n_ins, 1, "must have 1 upper IN");
    assert_eq!(height, 2, "single-entry tree has height 2");
    assert!(n_entries >= 1, "must have at least 1 entry total");
}
9056
/// collect_stats() with many inserts: the tree is non-trivial and the total
/// entry count is at least the number of inserted keys.
///
/// `n_entries` counts slots in upper INs as well as BINs, so it is only a
/// lower-bound check: every inserted key occupies one BIN slot, hence
/// `n_entries >= n`.
#[test]
fn test_collect_stats_many_inserts() {
    let tree = Tree::new(1, 8);
    let n = 50u32;
    for i in 0..n {
        let key = format!("sk{:04}", i).into_bytes();
        tree.insert(key, b"v".to_vec(), Lsn::new(1, i)).unwrap();
    }
    let stats = tree.collect_stats();

    assert!(stats.n_bins > 0, "must have at least one BIN");
    assert!(stats.height >= 2, "multi-entry tree has height >= 2");

    // The former `n_entries - n_ins` "rough check" was dropped: it was OR-ed
    // with this very condition (so it could never change the outcome) and its
    // unsigned subtraction could itself panic on underflow.
    assert!(
        stats.n_entries >= u64::from(n),
        "entry count must account for all inserts"
    );
}
9081
9082 // ========================================================================
9083 // Tests: B-tree merge / compress
9084 // ========================================================================
9085
/// After deleting most keys from a tree, compress() must reduce the BIN
/// count by merging under-full siblings.
///
/// Strategy: build a multi-BIN tree from 64 sorted keys, delete all but
/// four widely spaced keys, then verify that compress() lowers `n_bins` and
/// that every surviving key is still findable. Exact BIN counts are not
/// hard-coded because the preemptive splitting strategy determines the
/// split points.
#[test]
fn test_compress_merges_underfull_bins() {
    // Survivors: every 16th key, i.e. cm0000, cm0016, cm0032, cm0048.
    const KEEP: [u32; 4] = [0, 16, 32, 48];
    let key_of = |i: u32| format!("cm{:04}", i).into_bytes();

    let tree = Tree::new(1, 8);

    // Insert 64 sorted keys to build a multi-BIN tree.
    let n = 64u32;
    for i in 0..n {
        tree.insert(key_of(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    let stats_full = tree.collect_stats();
    assert!(
        stats_full.n_bins >= 2,
        "must have multiple BINs after 64 inserts"
    );

    // Delete everything that is not a designated survivor.
    for i in (0..n).filter(|i| !KEEP.contains(i)) {
        tree.delete(&key_of(i));
    }

    let stats_sparse = tree.collect_stats();
    assert!(
        stats_sparse.n_bins >= 2,
        "should still have multiple BINs before compress"
    );

    // With only four survivors most BINs hold 0-1 entries, so compress()
    // is required to lower the BIN count.
    tree.compress();

    let stats_after = tree.collect_stats();
    assert!(
        stats_after.n_bins < stats_sparse.n_bins,
        "compress must reduce BIN count (was {}, now {})",
        stats_sparse.n_bins,
        stats_after.n_bins
    );

    // Surviving keys must still be findable.
    for i in KEEP {
        assert!(
            tree.search(&key_of(i)).is_some_and(|sr| sr.exact_parent_found),
            "key cm{:04} must survive compress",
            i
        );
    }
}
9151
/// compress() must never lose entries, and must not increase the BIN count,
/// on a tree from which nothing has been deleted.
///
/// 32 sorted keys are inserted with `max_entries = 8`, so splits do occur
/// (they are preemptive and cannot be prevented here). The test therefore
/// does not assert that compress() is a strict no-op; it asserts only that
/// every key is still findable afterwards and that `n_bins` did not grow.
#[test]
fn test_compress_no_op_when_full() {
    let tree = Tree::new(1, 8);
    let n = 32u32;
    for i in 0..n {
        let key = format!("fn{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    let stats_before = tree.collect_stats();
    tree.compress();
    let stats_after = tree.collect_stats();

    // All keys still findable.
    for i in 0..n {
        let key = format!("fn{:04}", i).into_bytes();
        assert!(
            tree.search(&key).is_some_and(|sr| sr.exact_parent_found),
            "key fn{:04} must be findable after compress",
            i
        );
    }

    // BIN count must not increase.
    assert!(
        stats_after.n_bins <= stats_before.n_bins,
        "compress must not increase BIN count"
    );
}
9188
/// Smoke test: running `compress()` on a tree with no entries must complete
/// without panicking.
#[test]
fn test_compress_empty_tree() {
    let empty = Tree::new(1, 4);
    // Reaching the end of the test is the success criterion.
    empty.compress();
}
9195
/// Deterministic regression test for the check-then-act race on the BIN/IN
/// split path (`.agent/archived-audits/bench/bug-bin-split-concurrency.md`).
///
/// Background: `insert_recursive_inner` tests
/// `child.get_n_entries() >= max_entries` while holding only a PARENT READ
/// lock, releases it (the split needs `parent.write()`), and then invokes
/// `split_child`. Between release and re-acquire another thread (a second
/// splitter, or the INCompressor merging and CLEARING a sibling via
/// `compress_node`'s `lb.entries.clear()`) may leave the child no longer
/// full, or completely empty. Before the fix, `split_child` built a
/// `SplitEntries` from the stale child and
/// `SplitEntries::get_key(split_index)` panicked with
/// "index out of bounds: len is 0".
///
/// The interleaving is reproduced without threads: build a level-2 tree,
/// empty one BIN child in place (standing in for the racing merge), and call
/// `split_child` on it directly. The fixed `split_child` re-validates
/// fullness under the child write lock and returns `Ok(())`; the unfixed one
/// panics in `get_key`.
///
/// JE-faithful: `IN.split` re-checks `needsSplitting()` after latching the
/// node it will split (IN.java IN.split / IN.needsSplitting).
#[test]
fn split_child_is_noop_when_child_no_longer_full() {
    let max_entries = 8usize;
    let tree = Tree::new(1, max_entries);

    // 64 sorted keys are enough to force a split, so the root becomes an
    // Internal node with BIN children.
    for i in 0..64u32 {
        let key = format!("k{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    let root_arc = tree.get_root().expect("root resident");

    // Slot 0 is as good as any resident child: the failure mode depends only
    // on the child being empty at split time, not on how it became empty.
    let child_index = 0usize;
    let child_arc = match &*root_arc.read() {
        TreeNode::Internal(n) => n.get_child(0).expect("resident child at slot 0"),
        TreeNode::Bottom(_) => {
            panic!("expected a level-2 tree (root should be Internal)")
        }
    };

    // Stand-in for the racing merge: wipe the child's slots in place, the
    // same way `compress_node` clears a merged-away left sibling. This is
    // the stale state a late `split_child` observes after the fullness check
    // already passed under the (now released) parent read lock.
    {
        let mut child_guard = child_arc.write();
        match &mut *child_guard {
            TreeNode::Internal(upper) => {
                upper.entries.clear();
                upper.lsn_rep = LsnRep::Empty;
                upper.targets = TargetRep::None;
            }
            TreeNode::Bottom(bin) => {
                bin.entries.clear();
                bin.lsn_rep = LsnRep::Empty;
                bin.keys = KeyRep::new();
            }
        }
        assert_eq!(child_guard.get_n_entries(), 0, "child must now be empty");
    }

    // Enter the split path directly. Pre-fix: panic in
    // `SplitEntries::get_key(0)` on the empty vec. Post-fix: fullness is
    // re-validated under the child write lock and Ok(()) is returned.
    let res = Tree::split_child(
        &root_arc,
        child_index,
        max_entries,
        Lsn::new(1, 999),
        SplitHint::Normal,
        b"k0000",
        None,  // no comparator
        false, // key_prefixing off
        None,  // no InListListener
    );
    assert!(
        res.is_ok(),
        "split_child on an emptied (no-longer-full) child must be a benign \
         no-op, got {:?}",
        res
    );
}
9290
/// After deleting every key except the last one, compress() must lower the
/// BIN count, and the one surviving key must remain findable.
///
/// Only a decrease of `n_bins` is asserted; the exact resulting count is not
/// pinned.
#[test]
fn test_compress_removes_empty_bin_from_parent() {
    let tree = Tree::new(1, 4);
    // Insert enough keys to generate multiple BINs.
    let n = 16u32;
    for i in 0..n {
        let key = format!("ep{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    let stats_before = tree.collect_stats();
    assert!(stats_before.n_bins >= 2, "need multiple BINs for this test");

    // Delete everything except the very last key.
    for i in 0..n - 1 {
        let key = format!("ep{:04}", i).into_bytes();
        tree.delete(&key);
    }

    tree.compress();

    let stats_after = tree.collect_stats();
    assert!(
        stats_after.n_bins < stats_before.n_bins,
        "compress must reduce BIN count after mass deletion"
    );

    // The surviving key must still be findable.
    let last_key = format!("ep{:04}", n - 1).into_bytes();
    assert!(
        tree.search(&last_key).is_some_and(|sr| sr.exact_parent_found),
        "last key must survive after compress"
    );
}
9327
9328 // ========================================================================
9329 // IC-1: prune_empty_bin must NOT remove a live entry when the BIN was
9330 // repopulated between the compressor observing it empty and the prune.
9331 // (Tree corruption / lost-write regression test.)
9332 // ========================================================================
9333
9334 /// Find a BIN arc that is currently empty (0 entries) and is NOT the
9335 /// root, returning it together with the `id_key` the compressor would
9336 /// have captured (here we just use any key that routes to that BIN).
9337 fn first_empty_non_root_bin(tree: &Tree) -> Option<Arc<RwLock<TreeNode>>> {
9338 let root = tree.get_root()?;
9339 for node in tree.rebuild_in_list() {
9340 if Arc::ptr_eq(&node, &root) {
9341 continue; // skip root (single-BIN tree is never pruned)
9342 }
9343 let is_empty_bin = {
9344 let g = node.read();
9345 matches!(&*g, TreeNode::Bottom(b) if b.entries.is_empty())
9346 };
9347 if is_empty_bin {
9348 return Some(node);
9349 }
9350 }
9351 None
9352 }
9353
/// IC-1 (fail-pre / pass-post): the old `compress_bin` prune step called
/// `self.delete(&id_key)`, which re-descends by key. If a concurrent
/// insert repopulated the empty BIN with a LIVE entry under that same
/// `id_key`, `self.delete` would silently remove the live entry (a lost
/// write). `prune_empty_bin` re-validates `n_entries == 0` under the
/// parent latch and must REMOVE NOTHING when the BIN is non-empty.
///
/// JE `Tree.delete` / `searchDeletableSubTree` (Tree.java ~line 755-800):
/// `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort prune).
///
/// If deleting the lowest four keys does not leave an empty non-root BIN,
/// the scenario cannot be set up and the test returns early (vacuous pass).
#[test]
fn test_ic1_prune_empty_bin_aborts_when_repopulated() {
    let tree = Tree::new(1, 4);
    let n = 16u32;
    for i in 0..n {
        let key = format!("ic{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    assert!(
        tree.collect_stats().n_bins >= 2,
        "need multiple BINs for this test"
    );

    // Delete the lowest four keys (ic0000..ic0003), which are expected to
    // share the first BIN, so that BIN ends up with 0 entries.
    for i in 0..4 {
        let key = format!("ic{:04}", i).into_bytes();
        tree.delete(&key);
    }

    // Locate the now-empty BIN; bail out if this layout did not produce one.
    let Some(empty_bin) = first_empty_non_root_bin(&tree) else {
        return;
    };

    // SIMULATE THE RACE: a concurrent insert repopulates the empty BIN with
    // a LIVE entry *before* the prune runs. The entry is inserted directly
    // into the BIN arc to model an insert that lands after the compressor
    // observed the BIN as empty.
    let live_key = format!("ic{:04}", 1).into_bytes(); // was deleted above
    {
        let mut g = empty_bin.write();
        if let TreeNode::Bottom(b) = &mut *g {
            // T-2/T-3: route through the insert helper so entries/keys/
            // lsn_rep stay in lock step.
            b.insert_with_prefix(
                live_key.clone(),
                Lsn::new(1, 1),
                Some(vec![0xAB]),
            );
        }
    }

    // The id_key the compressor would use: the full key of slot 0.
    let id_key = {
        let g = empty_bin.read();
        match &*g {
            TreeNode::Bottom(b) => b.get_full_key(0).unwrap(),
            _ => unreachable!(),
        }
    };

    // Prune must ABORT (return false) because the BIN is no longer empty,
    // and must NOT remove the live entry.
    let pruned = tree.prune_empty_bin(&id_key);
    assert!(!pruned, "IC-1: prune must abort when the BIN was repopulated");

    // The live entry must still be present in the BIN. Full keys are
    // compared so the check stays valid whether or not the BIN currently
    // carries a key prefix.
    let still_there = {
        let g = empty_bin.read();
        match &*g {
            TreeNode::Bottom(b) => (0..b.entries.len())
                .any(|i| b.get_full_key(i).is_some_and(|k| k == live_key)),
            _ => false,
        }
    };
    assert!(
        still_there,
        "IC-1: prune must not remove the repopulated live entry"
    );
}
9439
/// IC-1 companion case: while a cursor is pinned on an empty BIN,
/// `prune_empty_bin` must refuse to prune it (return `false`).
/// JE: `bin.nCursors() > 0` → CURSORS_EXIST.
///
/// Returns early without asserting if no empty non-root BIN can be found.
#[test]
fn test_ic1_prune_empty_bin_aborts_with_cursor() {
    let key_for = |i: u32| format!("cu{:04}", i).into_bytes();

    let tree = Tree::new(1, 4);
    for i in 0..16u32 {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    for i in 0..4 {
        tree.delete(&key_for(i));
    }

    let Some(empty_bin) = first_empty_non_root_bin(&tree) else {
        return;
    };

    // Pin the BIN for the duration of the prune attempt.
    Tree::pin_bin(&empty_bin);

    // Routing key: the first deleted key is used to reach this BIN.
    let id_key = key_for(0);
    let pruned = tree.prune_empty_bin(&id_key);
    assert!(
        !pruned,
        "IC-1: prune must abort when a cursor is parked on the BIN"
    );

    Tree::unpin_bin(&empty_bin);
}
9468
/// IC-1 success case: with an empty BIN that nobody has pinned or
/// repopulated, `prune_empty_bin` must return `true`, the BIN count must
/// drop, and every key that was not deleted must still be findable.
///
/// Returns early without asserting if no empty non-root BIN can be found.
#[test]
fn test_ic1_prune_empty_bin_succeeds_when_truly_empty() {
    let key_for = |i: u32| format!("ok{:04}", i).into_bytes();

    let tree = Tree::new(1, 4);
    for i in 0..16u32 {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    for i in 0..4 {
        tree.delete(&key_for(i));
    }

    let bins_before = tree.collect_stats().n_bins;

    // The handle is only needed to prove an empty BIN exists; it is kept
    // alive until the end of the test.
    let Some(_held_bin) = first_empty_non_root_bin(&tree) else {
        return;
    };

    // Route by the lowest deleted key, which is used to reach the emptied BIN.
    let id_key = key_for(0);
    let pruned = tree.prune_empty_bin(&id_key);
    assert!(pruned, "IC-1: prune must succeed on a truly empty BIN");

    let bins_after = tree.collect_stats().n_bins;
    assert!(
        bins_after < bins_before,
        "IC-1: pruned BIN slot must be removed from the parent (was {}, now {})",
        bins_before,
        bins_after
    );

    // Keys 4..16 were never deleted and must all still resolve.
    for i in 4..16u32 {
        let found = tree
            .search(&key_for(i))
            .is_some_and(|s| s.exact_parent_found);
        assert!(found, "surviving key ok{:04} must remain after prune", i);
    }
}
9512
9513 // ========================================================================
9514 // Tests: latch-coupling validation (validate_parent_child /
9515 // search_with_coupling)
9516 // ========================================================================
9517
/// `Tree::validate_parent_child` accepts a link when slot 0 of the parent
/// holds the very `Arc` passed as the expected child.
#[test]
fn test_validate_parent_child_correct_link() {
    let bin_stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin_stub)));

    // Parent with a single slot whose target is the BIN above.
    let root_stub = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&bin_arc))]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(root_stub)));

    let linked = Tree::validate_parent_child(&root_arc, 0, &bin_arc);
    assert!(
        linked,
        "link must be valid when parent slot 0 points at bin_arc"
    );
}
9557
/// `Tree::validate_parent_child` rejects slot 0 of a parent that has no
/// entries at all.
#[test]
fn test_validate_parent_child_out_of_range() {
    // Parent with zero slots and no targets.
    let root_stub = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![],
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(root_stub)));

    // An unrelated BIN to use as the "expected child".
    let other_stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let other_arc = Arc::new(RwLock::new(TreeNode::Bottom(other_stub)));

    let linked = Tree::validate_parent_child(&root_arc, 0, &other_arc);
    assert!(!linked, "link must be invalid when parent has no entries");
}
9595
/// `Tree::validate_parent_child` rejects a link when the parent slot holds a
/// different `Arc` than the one passed as the expected child.
#[test]
fn test_validate_parent_child_wrong_child() {
    // Two independent, structurally identical empty BINs.
    let new_bin = || {
        Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
            node_id: generate_node_id(),
            level: BIN_LEVEL,
            entries: vec![],
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: NULL_LSN,
            last_delta_lsn: NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
            lsn_rep: LsnRep::Empty,
            keys: KeyRep::new(),
            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        })))
    };
    let bin_a = new_bin();
    let bin_b = new_bin();

    // The parent's only slot targets `bin_a`.
    let root_stub = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_a)]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(root_stub)));

    // Asking about `bin_b` must therefore fail.
    let linked = Tree::validate_parent_child(&root_arc, 0, &bin_b);
    assert!(
        !linked,
        "link must be invalid when parent slot points at a different Arc"
    );
}
9652
/// search_with_coupling must report an exact match for every key that was
/// inserted into the tree.
#[test]
fn test_search_with_coupling_finds_existing_key() {
    let tree = Tree::new(1, 8);
    for i in 0u32..20 {
        let key = format!("c{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    for i in 0u32..20 {
        let key = format!("c{:04}", i).into_bytes();
        // A hit requires both a result and `exact_parent_found == true`.
        assert!(
            tree.search_with_coupling(&key)
                .is_some_and(|sr| sr.exact_parent_found),
            "search_with_coupling must find c{:04}",
            i
        );
    }
}
9672
/// Looking up a key that was never inserted must not yield an exact match:
/// `search_with_coupling` may return `None`, or a result whose
/// `exact_parent_found` is `false`.
#[test]
fn test_search_with_coupling_missing_key() {
    let tree = Tree::new(1, 8);
    tree.insert(b"hello".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();

    let exact_hit = match tree.search_with_coupling(b"zzz") {
        Some(result) => result.exact_parent_found,
        None => false,
    };
    assert!(
        !exact_hit,
        "search_with_coupling must not find a key that was never inserted"
    );
}
9686
/// With nothing inserted, `search_with_coupling` yields `None`.
#[test]
fn test_search_with_coupling_empty_tree() {
    let tree = Tree::new(1, 8);
    let outcome = tree.search_with_coupling(b"k");
    assert!(outcome.is_none());
}
9693
9694 // ========================================================================
9695 // Tests: BIN-delta reconstitution (apply_delta_to_bin / mutate_to_full_bin)
9696 // ========================================================================
9697
/// Applying a delta to a full BIN overwrites slots whose key already exists
/// and adds slots for keys that do not, keeping the keys sorted.
///
/// BIN.applyDelta(): delta entries are authoritative and supersede
/// full-BIN entries at the same key.
#[test]
fn test_apply_delta_to_bin_updates_and_inserts() {
    // Slot with embedded data and all flags cleared.
    let slot = |payload: &[u8]| BinEntry {
        data: Some(payload.to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };

    // Base BIN: keys "a" and "c".
    let mut base = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![slot(b"old_a"), slot(b"old_c")],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"a".to_vec(), b"c".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Delta: overwrite "a", introduce "b".
    let update_a = (b"a".to_vec(), Lsn::new(1, 10), Some(b"new_a".to_vec()));
    let insert_b = (b"b".to_vec(), Lsn::new(1, 20), Some(b"new_b".to_vec()));
    let delta_entries = vec![update_a, insert_b];

    Tree::apply_delta_to_bin(&mut base, delta_entries);

    assert!(base.dirty, "base must be dirty after applying delta");

    // Materialize the full keys (T-2: keys live in the rep).
    let mut full_keys: Vec<Vec<u8>> = Vec::with_capacity(base.entries.len());
    for i in 0..base.entries.len() {
        full_keys.push(base.get_full_key(i).unwrap_or_default());
    }

    // "a" carries the delta's payload.
    let a_idx = full_keys.iter().position(|k| k == b"a").unwrap();
    let a_data = base.entries[a_idx].data.as_deref();
    assert_eq!(a_data, Some(&b"new_a"[..]));

    // "b" was added by the delta.
    assert!(full_keys.contains(&b"b".to_vec()));

    // "c" was not touched and is still there.
    assert!(full_keys.contains(&b"c".to_vec()));

    // Key order must still be ascending.
    let sorted = {
        let mut copy = full_keys.clone();
        copy.sort();
        copy
    };
    assert_eq!(
        full_keys, sorted,
        "entries must remain sorted after delta apply"
    );
}
9773
/// Applying an empty delta leaves the entry count unchanged but still marks
/// the BIN dirty.
#[test]
fn test_apply_delta_to_bin_empty_delta() {
    let only_slot = BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let mut base = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![only_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"x".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let n_before = base.entries.len();
    Tree::apply_delta_to_bin(&mut base, Vec::new());
    let n_after = base.entries.len();

    assert_eq!(n_after, n_before, "empty delta must not change entry count");
    assert!(base.dirty, "dirty must be set even for empty delta apply");
}
9809
/// `Tree::mutate_to_full_bin` turns a delta BIN into a full BIN by combining
/// it with its base.
///
/// BIN.mutateToFullBIN(BIN fullBIN): afterwards `is_delta` must be cleared,
/// the node must be dirty, and the slots must be the sorted union of base
/// and delta, with the delta's payload kept for keys present in both.
#[test]
fn test_mutate_to_full_bin_merges_delta_and_base() {
    // Slot with embedded data and all flags cleared.
    let slot = |payload: &[u8]| BinEntry {
        data: Some(payload.to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    // Both stubs share node id 2 and differ only in slots, keys and the
    // dirty / is_delta flags.
    let stub = |entries, keys, dirty, is_delta| BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty,
        is_delta,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Base (full, clean): keys "aa" and "cc".
    let base = stub(
        vec![slot(b"base_aa"), slot(b"base_cc")],
        KeyRep::from_keys(vec![b"aa".to_vec(), b"cc".to_vec()]),
        false,
        false,
    );

    // Delta (dirty): overwrites "aa" and introduces "bb".
    let mut delta = stub(
        vec![slot(b"delta_aa"), slot(b"delta_bb")],
        KeyRep::from_keys(vec![b"aa".to_vec(), b"bb".to_vec()]),
        true,
        true,
    );

    Tree::mutate_to_full_bin(&mut delta, base);

    // The node is now a full, dirty BIN.
    assert!(
        !delta.is_delta,
        "is_delta must be false after mutate_to_full_bin"
    );
    assert!(delta.dirty, "must be dirty after mutation");

    // Materialize the full keys (T-2: keys live in the rep).
    let mut dk: Vec<Vec<u8>> = Vec::with_capacity(delta.entries.len());
    for i in 0..delta.entries.len() {
        dk.push(delta.get_full_key(i).unwrap_or_default());
    }

    // "aa": the delta's payload is kept.
    let aa_idx = dk.iter().position(|k| k == b"aa").unwrap();
    let aa_data = delta.entries[aa_idx].data.as_deref();
    assert_eq!(aa_data, Some(&b"delta_aa"[..]));

    // "bb" comes from the delta, "cc" from the base.
    assert!(dk.contains(&b"bb".to_vec()));
    assert!(dk.contains(&b"cc".to_vec()));

    // Exactly three slots, ascending by key.
    assert_eq!(delta.entries.len(), 3);
    let sorted = {
        let mut copy = dk.clone();
        copy.sort();
        copy
    };
    assert_eq!(dk, sorted, "entries must be sorted after mutation");
}
9915
/// `Tree::bin_is_delta` reports whatever the BIN's `is_delta` field holds,
/// both before and after the field is flipped.
#[test]
fn test_bin_is_delta_flag() {
    // An empty full BIN is enough; only the flag matters here.
    let mut node = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let reported_before = Tree::bin_is_delta(&node);
    assert!(!reported_before);

    node.is_delta = true;
    let reported_after = Tree::bin_is_delta(&node);
    assert!(reported_after);
}
9941
9942 // ========================================================================
9943 // Tests: mutate_to_full_bin_from_log
9944 // ========================================================================
9945
/// Calling mutate_to_full_bin_from_log on a BIN that is already full must
/// leave both the flag and the entry list untouched.
#[test]
fn test_mutate_to_full_bin_from_log_already_full() {
    // A log manager backed by a throw-away directory.
    let scratch = tempfile::tempdir().unwrap();
    let file_manager =
        noxu_log::FileManager::new(scratch.path(), false, 10_000_000, 100)
            .unwrap();
    let lm = noxu_log::LogManager::new(
        std::sync::Arc::new(file_manager),
        3,
        1024 * 1024,
        4096,
    );

    let only_entry = BinEntry {
        data: Some(b"v1".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let mut bin = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![only_entry],
        key_prefix: Vec::new(),
        dirty: false,
        // Already a full BIN, so there is nothing to reconstitute.
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"key1".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    Tree::mutate_to_full_bin_from_log(&mut bin, &lm);

    // Still full, still one entry.
    assert!(!bin.is_delta);
    assert_eq!(bin.entries.len(), 1);
}
9986
/// A delta whose last_full_lsn is NULL_LSN is promoted in place.
///
/// With no full BIN LSN recorded there is no base image to merge, so the
/// test expects is_delta to be cleared and the delta's own entry to
/// survive unchanged.
#[test]
fn test_mutate_to_full_bin_from_log_null_lsn() {
    // A log manager backed by a throw-away directory.
    let scratch = tempfile::tempdir().unwrap();
    let file_manager =
        noxu_log::FileManager::new(scratch.path(), false, 10_000_000, 100)
            .unwrap();
    let lm = noxu_log::LogManager::new(
        std::sync::Arc::new(file_manager),
        3,
        1024 * 1024,
        4096,
    );

    let delta_entry = BinEntry {
        data: Some(b"delta_a".to_vec()),
        known_deleted: false,
        dirty: true,
        expiration_time: 0,
    };
    let mut delta = BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries: vec![delta_entry],
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: true,
        // No full BIN was ever logged for this node.
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"a".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    Tree::mutate_to_full_bin_from_log(&mut delta, &lm);

    // Flag cleared; the lone delta entry is retained verbatim.
    assert!(
        !delta.is_delta,
        "is_delta must be false after null-lsn promotion"
    );
    assert_eq!(delta.entries.len(), 1);
    let kept_payload = delta.entries[0].data.as_deref();
    assert_eq!(kept_payload, Some(&b"delta_a"[..]));
}
10035
/// End-to-end check of the log-backed delta → full BIN mutation.
///
/// A full BIN is serialized and appended to a LogManager; the LSN it was
/// written at is stored in a delta's last_full_lsn. After
/// mutate_to_full_bin_from_log the delta must hold the base-only key, the
/// delta-only key, and the delta's payload for the key both images share.
#[test]
fn test_mutate_to_full_bin_from_log_reads_and_merges() {
    // A log manager backed by a throw-away directory.
    let scratch = tempfile::tempdir().unwrap();
    let file_manager =
        noxu_log::FileManager::new(scratch.path(), false, 10_000_000, 100)
            .unwrap();
    let lm = noxu_log::LogManager::new(
        std::sync::Arc::new(file_manager),
        3,
        1024 * 1024,
        4096,
    );

    // Slot factory: live, non-expiring slot with the given payload and
    // per-slot dirty bit.
    let slot = |payload: &[u8], dirty: bool| BinEntry {
        data: Some(payload.to_vec()),
        known_deleted: false,
        dirty,
        expiration_time: 0,
    };

    // The full image that goes into the log.
    let full_bin = BinStub {
        node_id: 42,
        level: BIN_LEVEL,
        entries: vec![slot(b"base_val", false), slot(b"base_shared", false)],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![
            b"base_only".to_vec(),
            b"shared_key".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let payload = full_bin.serialize_full();
    let logged = lm.log(
        noxu_log::LogEntryType::BIN,
        &payload,
        noxu_log::Provisional::No,
        true,
        false,
    );
    let full_lsn = logged.expect("write full BIN to log");
    lm.flush_no_sync().expect("flush log");

    // The delta points back at the logged full image.
    let mut delta = BinStub {
        node_id: 42,
        level: BIN_LEVEL,
        entries: vec![
            // Replaces the base payload stored under "shared_key".
            slot(b"delta_shared", true),
            // Exists only in the delta.
            slot(b"delta_val", true),
        ],
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: true,
        last_full_lsn: full_lsn,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![
            b"shared_key".to_vec(),
            b"delta_only".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    Tree::mutate_to_full_bin_from_log(&mut delta, &lm);

    assert!(
        !delta.is_delta,
        "is_delta must be false after log-based mutation"
    );
    assert!(delta.dirty, "must be dirty after mutation");

    // Payload stored in the first slot whose full key equals `wanted`.
    let payload_for = |wanted: &[u8]| -> Option<Vec<u8>> {
        for slot_idx in 0..delta.entries.len() {
            if delta.get_full_key(slot_idx).as_deref() == Some(wanted) {
                return delta.entries[slot_idx].data.clone();
            }
        }
        None
    };

    assert_eq!(
        payload_for(b"base_only"),
        Some(b"base_val".to_vec()),
        "base-only key must be present"
    );
    assert_eq!(
        payload_for(b"shared_key"),
        Some(b"delta_shared".to_vec()),
        "delta must win on shared_key"
    );
    assert_eq!(
        payload_for(b"delta_only"),
        Some(b"delta_val".to_vec()),
        "delta-only key must be present"
    );
    assert_eq!(delta.entries.len(), 3, "must have exactly 3 entries");

    // Slots must be ordered by full key.
    let mut full_keys: Vec<Vec<u8>> =
        Vec::with_capacity(delta.entries.len());
    for slot_idx in 0..delta.entries.len() {
        full_keys.push(delta.get_full_key(slot_idx).unwrap());
    }
    let sorted_keys = {
        let mut copy = full_keys.clone();
        copy.sort();
        copy
    };
    assert_eq!(full_keys, sorted_keys, "entries must be in sorted order");
}
10177
10178 // ========================================================================
10179 // Tests: deserialize_full key prefix recomputation
10180 // ========================================================================
10181
/// A BIN round-tripped through serialize_full / deserialize_full comes back
/// with its common key prefix established.
///
/// Counterpart of IN.recalcKeyPrefix() being invoked after a node is
/// materialized from the log: three keys sharing "pfx:" are serialized and
/// the loaded BIN must report that prefix while still yielding full keys.
#[test]
fn test_deserialize_full_recomputes_key_prefix() {
    // Payload-less, clean slot; the test only cares about keys.
    let blank = || BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };

    let mut source = BinStub {
        node_id: 99,
        level: BIN_LEVEL,
        entries: vec![blank(), blank(), blank()],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![
            b"pfx:alpha".to_vec(),
            b"pfx:beta".to_vec(),
            b"pfx:gamma".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Sanity check on the fixture before it is serialized.
    source.recompute_key_prefix();
    assert_eq!(source.key_prefix, b"pfx:");

    let payload = source.serialize_full();
    let loaded = BinStub::deserialize_full(&payload)
        .expect("deserialization must succeed");

    assert_eq!(
        loaded.key_prefix, b"pfx:",
        "key prefix must be recomputed after deserialize_full"
    );

    // Each slot must still expand to a full key carrying the prefix.
    let slot_count = loaded.entries.len();
    for i in 0..slot_count {
        let full_key = loaded.get_full_key(i).unwrap();
        assert!(
            full_key.starts_with(b"pfx:"),
            "full key {i} must start with prefix"
        );
    }
}
10255
/// A one-key BIN round-tripped through serialize_full / deserialize_full
/// must come back with an empty key_prefix and its key intact.
///
/// With a single key there is no second key to share a prefix with.
#[test]
fn test_deserialize_full_single_entry_no_prefix() {
    let lone_slot = BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let source = BinStub {
        node_id: 7,
        level: BIN_LEVEL,
        entries: vec![lone_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"solo".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let loaded = BinStub::deserialize_full(&source.serialize_full())
        .expect("deserialization must succeed");

    assert!(
        loaded.key_prefix.is_empty(),
        "single-entry BIN must have empty prefix"
    );
    let round_tripped = loaded.get_full_key(0).unwrap();
    assert_eq!(round_tripped, b"solo");
}
10295
10296 // ========================================================================
10297 // Tests: get_next_bin / get_prev_bin
10298 // ========================================================================
10299
/// get_next_bin returns the entries of the next BIN to the right.
///
/// Tree.getNextBin() / getNextIN(forward=true).
///
/// Eight ascending keys are inserted into a tree built with `Tree::new(1, 4)`
/// (the same shape the sibling next/prev tests rely on to obtain more than
/// one BIN). The test then requires that the BIN following the one holding
/// "n0000" exists, is non-empty, and contains only keys greater than
/// "n0000".
#[test]
fn test_get_next_bin_basic() {
    let tree = Tree::new(1, 4);

    // Insert 8 sorted keys so the data spreads over several BINs.
    for i in 0u32..8 {
        let key = format!("n{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    // There is deliberately no "skip if the tree has a single BIN" escape
    // hatch: such an early return would let the test pass without checking
    // anything. A missing sibling is reported by the assertion below.
    let next = tree.get_next_bin(b"n0000");
    assert!(
        next.is_some(),
        "must return a next BIN for a key in the leftmost BIN"
    );

    let entries = next.unwrap();
    assert!(!entries.is_empty(), "next BIN must not be empty");

    // The returned BIN lies to the right, so every key in it must sort
    // strictly after the search key.
    for (_, _, k) in &entries {
        assert!(
            k.as_slice() > b"n0000" as &[u8],
            "next BIN entries must all be > the search key"
        );
    }
}
10337
/// A key located in the rightmost BIN has no right-hand sibling, so
/// get_next_bin must yield None for it.
#[test]
fn test_get_next_bin_at_rightmost_returns_none() {
    let tree = Tree::new(1, 4);
    (0u32..8).for_each(|i| {
        let key = format!("r{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    // "r0007" is the largest key inserted, hence it sits in the last BIN.
    let sibling = tree.get_next_bin(b"r0007");
    assert!(
        sibling.is_none(),
        "must return None for a key in the rightmost BIN"
    );
}
10353
/// get_prev_bin hands back the entries of the neighbouring BIN on the left.
///
/// Tree.getPrevBin() / getNextIN(forward=false).
#[test]
fn test_get_prev_bin_basic() {
    let tree = Tree::new(1, 4);
    (0u32..8).for_each(|i| {
        let key = format!("p{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    // "p0004" is expected to live past the first BIN, so a left sibling
    // must exist.
    let anchor: &[u8] = b"p0004";
    let entries = match tree.get_prev_bin(anchor) {
        Some(found) => found,
        None => panic!("must return a prev BIN for a key in the second BIN"),
    };
    assert!(!entries.is_empty(), "prev BIN must not be empty");

    // Everything in the left sibling sorts before the anchor key.
    for (_, _, k) in entries.iter() {
        assert!(
            k.as_slice() < anchor,
            "prev BIN entries must all be < the current BIN"
        );
    }
}
10382
/// A key located in the leftmost BIN has no left-hand sibling, so
/// get_prev_bin must yield None for it.
#[test]
fn test_get_prev_bin_at_leftmost_returns_none() {
    let tree = Tree::new(1, 4);
    (0u32..8).for_each(|i| {
        let key = format!("q{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    // "q0000" is the smallest key inserted, hence it sits in the first BIN.
    let sibling = tree.get_prev_bin(b"q0000");
    assert!(
        sibling.is_none(),
        "must return None for a key in the leftmost BIN"
    );
}
10398
/// Hopping right with get_next_bin and then left with get_prev_bin lands
/// back on keys that sort before the BIN boundary that was crossed.
#[test]
fn test_next_prev_bin_are_symmetric() {
    let tree = Tree::new(1, 4);
    (0u32..8).for_each(|i| {
        let key = format!("s{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    // Step right from the BIN holding "s0000" and take the smallest key of
    // the BIN we arrive in: that is the boundary key.
    let right_bin = tree.get_next_bin(b"s0000").unwrap();
    let boundary_key = right_bin
        .iter()
        .map(|(_, _, k)| k)
        .min()
        .cloned()
        .unwrap();

    // Step left again from the boundary key and take the largest key seen.
    let left_bin = tree.get_prev_bin(&boundary_key).unwrap();
    let left_max_key = left_bin
        .iter()
        .map(|(_, _, k)| k)
        .max()
        .cloned()
        .unwrap();

    // Even the largest key on the left must sort before the boundary.
    assert!(
        left_max_key < boundary_key,
        "prev BIN entries must be smaller than the boundary key"
    );
}
10426
/// With nothing inserted, get_next_bin has no BIN to return and yields None.
#[test]
fn test_get_next_bin_empty_tree() {
    let empty = Tree::new(1, 8);
    let sibling = empty.get_next_bin(b"any");
    assert!(sibling.is_none());
}
10433
/// With nothing inserted, get_prev_bin has no BIN to return and yields None.
#[test]
fn test_get_prev_bin_empty_tree() {
    let empty = Tree::new(1, 8);
    let sibling = empty.get_prev_bin(b"any");
    assert!(sibling.is_none());
}
10440
10441 // =========================================================================
10442 // R3 fix: get_next_bin / get_prev_bin honour the custom comparator
10443 // =========================================================================
10444
/// R3 regression test: BIN-to-BIN navigation must follow the tree's custom
/// comparator, not raw byte order.
///
/// Before the fix, the static `get_adjacent_bin_attempt` routed through
/// internal nodes with a raw `<=` byte comparison, so it descended into the
/// wrong child whenever comparator order differed from byte order.
///
/// The tree is built with max_entries = 4 so that it splits and an internal
/// node exists to route through, and the comparator is a plain reversal
/// (larger bytes sort first).
///
/// Invariant checked: walking forward with repeated `get_next_bin` calls
/// visits BINs in COMPARATOR order, i.e. the first key of each successive
/// BIN is smaller in byte order than the previous one.
#[test]
fn test_get_next_prev_bin_custom_comparator_order() {
    // Descending comparator: swap the operands of the natural ordering.
    let reverse_cmp: KeyComparatorFn =
        Arc::new(|a: &[u8], b: &[u8]| b.cmp(a));

    // Fanout 4 forces splits, so routing through an IN is exercised.
    let mut tree = Tree::new(1, 4);
    tree.set_comparator(reverse_cmp);

    // "a".."i": ascending by byte value, descending under the comparator.
    let keys: &[&[u8]] =
        &[b"a", b"b", b"c", b"d", b"e", b"f", b"g", b"h", b"i"];
    for (i, k) in keys.iter().enumerate() {
        let lsn = Lsn::from_u64((i + 1) as u64);
        tree.insert(k.to_vec(), vec![i as u8], lsn).unwrap();
    }

    // Start from 0xff, which is byte-wise larger than every inserted key
    // and therefore sorts first under the reversed comparator. Each hop
    // records the first key of the BIN reached and continues from there.
    let mut anchor: Vec<u8> = vec![0xff];
    let mut bin_first_keys: Vec<Vec<u8>> = Vec::new();
    while let Some(entries) = tree.get_next_bin(&anchor) {
        match entries.first() {
            Some((_, _, head_key)) => {
                anchor = head_key.clone();
                bin_first_keys.push(anchor.clone());
            }
            // An empty BIN gives nothing to continue from.
            None => break,
        }
    }

    // The split must have produced several BINs to hop across.
    assert!(
        bin_first_keys.len() >= 2,
        "R3: expected multiple BINs after split, got {}",
        bin_first_keys.len()
    );

    // Under the reversed comparator each BIN starts at a byte-wise smaller
    // key than the one before it.
    for pair in bin_first_keys.windows(2) {
        let (earlier, later) = (&pair[0], &pair[1]);
        assert!(
            earlier > later,
            "R3: BIN boundary keys must be descending (comparator order); \
             got {:?} then {:?}",
            earlier,
            later
        );
    }
}
10536 // ========================================================================
10537
/// A BIN picks up a key prefix once it holds two keys that share one.
///
/// After the first insert the prefix is still empty; after a second key
/// with the same leading bytes the shared part "record:" becomes the
/// BIN's key_prefix.
#[test]
fn test_binstub_prefix_established_on_insert() {
    // Start from a completely empty full BIN.
    let mut node = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: Vec::new(),
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let first_key = b"record:aaa".to_vec();
    node.insert_with_prefix(first_key, Lsn::new(1, 1), None);
    assert!(node.key_prefix.is_empty(), "single entry: no prefix yet");

    let second_key = b"record:bbb".to_vec();
    node.insert_with_prefix(second_key, Lsn::new(1, 2), None);
    assert_eq!(
        &node.key_prefix, b"record:",
        "common prefix 'record:' must be extracted"
    );
}
10570
/// `get_full_key` reconstructs the original key for every slot of a BIN
/// that has an active key prefix.
#[test]
fn test_binstub_get_full_key_roundtrip() {
    // Start from a completely empty full BIN.
    let mut node = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: Vec::new(),
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Already in ascending order, so slot i is expected to hold keys[i].
    let keys: [&[u8]; 3] = [b"pfx:first", b"pfx:second", b"pfx:third"];
    keys.iter().for_each(|k| {
        node.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
    });

    assert!(!node.key_prefix.is_empty(), "prefix must be set");

    for (i, expected) in keys.iter().copied().enumerate() {
        let full = node.get_full_key(i).expect("must return full key");
        assert_eq!(
            full.as_slice(),
            expected,
            "get_full_key({}) must return full key",
            i
        );
    }
}
10615
/// `find_entry_compressed` locates a present key at the right slot and
/// reports an absent key as not found, on a BIN built through
/// `insert_with_prefix` from keys sharing "db:".
#[test]
fn test_binstub_find_entry_compressed() {
    // Start from a completely empty full BIN.
    let mut node = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: Vec::new(),
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let seeded: [&[u8]; 3] = [b"db:alpha", b"db:beta", b"db:gamma"];
    for key in seeded {
        node.insert_with_prefix(key.to_vec(), Lsn::new(1, 1), None);
    }

    // Present key: middle of the three, hence slot 1.
    let hit = node.find_entry_compressed(b"db:beta");
    assert!(hit.1, "db:beta must be found");
    assert_eq!(hit.0, 1, "db:beta must be at index 1");

    // Absent key: only the found flag is checked.
    let miss = node.find_entry_compressed(b"db:zzz");
    assert!(!miss.1, "db:zzz must not be found");
}
10652
/// Insert and search stay correct when every key shares a long prefix.
#[test]
fn test_tree_insert_search_with_prefix_compression() {
    let tree = Tree::new(1, 8);
    let n = 200u32;

    // Builds the i-th key; all keys share "namespace:entity:".
    let key_for = |i: u32| format!("namespace:entity:{:06}", i).into_bytes();

    for i in 0..n {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    // Each inserted key must resolve to an exact match.
    for i in 0..n {
        let located = matches!(
            tree.search(&key_for(i)),
            Some(hit) if hit.exact_parent_found
        );
        assert!(located, "key namespace:entity:{:06} must be found", i);
    }
}
10677
/// Keys sharing a prefix remain findable after the BINs holding them split.
#[test]
fn test_prefix_preserved_across_bin_split() {
    // Fanout 4 makes 20 inserts split repeatedly.
    let tree = Tree::new(1, 4);

    // Builds the i-th key; all keys share "pfx:key:".
    let key_for = |i: u32| format!("pfx:key:{:04}", i).into_bytes();

    for i in 0u32..20 {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    // Whichever half of a split a key ended up in, search must find it.
    for i in 0u32..20 {
        let located = matches!(
            tree.search(&key_for(i)),
            Some(hit) if hit.exact_parent_found
        );
        assert!(located, "pfx:key:{:04} must be found after splits", i);
    }
}
10700
/// `compress_key` followed by `decompress_key` returns the key it started
/// with, on a BIN that already has a key prefix.
#[test]
fn test_binstub_compress_decompress_roundtrip() {
    // Start from a completely empty full BIN.
    let mut node = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: Vec::new(),
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Two keys with shared leading bytes give the BIN a prefix.
    let seeded: [&[u8]; 2] = [b"myapp:user:1", b"myapp:user:2"];
    for key in seeded {
        node.insert_with_prefix(key.to_vec(), Lsn::new(1, 1), None);
    }
    assert!(!node.key_prefix.is_empty());

    // Round-trip a key that is not stored in the BIN.
    let full_key = b"myapp:user:3";
    let recovered = node.decompress_key(&node.compress_key(full_key));
    assert_eq!(
        recovered.as_slice(),
        full_key,
        "compress→decompress must be identity"
    );
}
10739
/// get_next_bin still finds a right-hand sibling once the root has split.
///
/// Twenty keys at fanout 4 are inserted, the test confirms at least one
/// root split happened, and a single hop from the BIN holding "t0000" must
/// yield at least one key.
#[test]
fn test_get_next_bin_three_level_tree() {
    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        let key = format!("t{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });
    assert!(tree.get_root_splits() > 0, "tree must have grown to 3 levels");

    // One hop to the right of "t0000"; a missing sibling leaves the list
    // empty and trips the assertion below.
    let visited: Vec<Vec<u8>> = match tree.get_next_bin(b"t0000") {
        Some(entries) => entries.into_iter().map(|(_, _, k)| k).collect(),
        None => Vec::new(),
    };

    assert!(
        !visited.is_empty(),
        "should have visited at least one key via get_next_bin in 3-level tree"
    );
}
10771
10772 // ========================================================================
10773 // ========================================================================
10774
/// Inserts a handful of keys of different lengths and, after each insert,
/// re-checks that every key inserted so far can be found.
#[test]
fn test_je_simple_tree_creation() {
    let tree = Tree::new(1, 128);
    let keys: &[&[u8]] = &[b"aaaaa", b"aaaab", b"aaaa", b"aaa"];

    for i in 0..keys.len() {
        tree.insert(keys[i].to_vec(), vec![i as u8], Lsn::new(1, i as u32))
            .unwrap();

        // Keys 0..=i are in the tree now; none may have become unreachable.
        for prev in keys[..=i].iter().copied() {
            let located = matches!(
                tree.search(prev),
                Some(hit) if hit.exact_parent_found
            );
            assert!(
                located,
                "key {:?} must be findable after {} inserts",
                std::str::from_utf8(prev).unwrap_or("?"),
                i + 1
            );
        }
    }
}
10798
/// Inserts N keys and checks all are found, then deletes the keys at even
/// positions and checks that exactly the odd-position keys remain findable.
#[test]
fn test_je_insert_then_delete_then_search() {
    let tree = Tree::new(1, 8);
    let n = 20usize;

    let mut keys: Vec<Vec<u8>> = Vec::with_capacity(n);
    for i in 0..n {
        keys.push(format!("key{:04}", i).into_bytes());
    }

    // Phase 1: insert everything.
    for (i, k) in keys.iter().enumerate() {
        tree.insert(k.clone(), vec![i as u8], Lsn::new(1, i as u32))
            .unwrap();
    }

    // Phase 2: every key resolves to an exact match.
    for k in keys.iter() {
        let located =
            matches!(tree.search(k), Some(hit) if hit.exact_parent_found);
        assert!(
            located,
            "key {:?} must be found after insert",
            std::str::from_utf8(k).unwrap_or("?")
        );
    }

    // Phase 3: remove positions 0, 2, 4, ...
    for victim in keys.iter().step_by(2) {
        tree.delete(victim);
    }

    // Phase 4: even positions are gone, odd positions are untouched.
    for (i, key) in keys.iter().enumerate() {
        let found =
            matches!(tree.search(key), Some(hit) if hit.exact_parent_found);
        let was_deleted = i % 2 == 0;
        if was_deleted {
            assert!(!found, "deleted key {:?} must not be found", i);
        } else {
            assert!(found, "kept key {:?} must still be found", i);
        }
    }
}
10842
/// Inserts N keys in descending order, then checks that each key is
/// findable and that a BIN returned by get_next_bin lists its keys in
/// ascending order.
#[test]
fn test_je_range_scan_sorted_ascending() {
    let n = 40usize;
    let tree = Tree::new(1, 4);

    // Builds the i-th key of the fixture.
    let key_for = |i: usize| format!("scan{:04}", i).into_bytes();

    // Descending insertion order.
    for i in (0..n).rev() {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i as u32))
            .unwrap();
    }

    // Reference list of every key, sorted.
    let mut expected: Vec<Vec<u8>> = (0..n).map(key_for).collect();
    expected.sort();

    // Point lookups: each key resolves to an exact match.
    for key in expected.iter() {
        let located =
            matches!(tree.search(key), Some(hit) if hit.exact_parent_found);
        assert!(
            located,
            "key {:?} must be findable",
            std::str::from_utf8(key).unwrap_or("?")
        );
    }

    // Fixture sanity check: the reference list itself is strictly
    // ascending (zero-padded numbers sort the same way as their values).
    for pair in expected.windows(2) {
        let (lo, hi) = (&pair[0], &pair[1]);
        assert!(
            lo < hi,
            "keys must be in strict ascending order: {:?} < {:?}",
            std::str::from_utf8(lo).unwrap_or("?"),
            std::str::from_utf8(hi).unwrap_or("?")
        );
    }

    // If the BIN holding the first key has a right-hand sibling, the keys
    // it returns must be in non-decreasing order.
    let first_key = key_for(0);
    if let Some(entries) = tree.get_next_bin(&first_key) {
        for pair in entries.windows(2) {
            assert!(
                pair[0].2.as_slice() <= pair[1].2.as_slice(),
                "BIN entries from get_next_bin must be in ascending order"
            );
        }
    }
}
10897
10898 /// insert N keys in
10899 /// ascending order and verify the tree height stays bounded (≤ 10 levels)
10900 /// and all keys are findable.
10901 #[test]
10902 fn test_je_ascending_insert_balance() {
10903 let n = 128usize;
10904 let tree = Tree::new(1, 8);
10905
10906 for i in 0..n {
10907 let key = format!("asc{:06}", i).into_bytes();
10908 tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10909 .unwrap();
10910 }
10911
10912 let stats = tree.collect_stats();
10913 assert!(
10914 stats.height <= 10,
10915 "tree height after {} ascending inserts with fanout 8 must be <= 10, got {}",
10916 n,
10917 stats.height
10918 );
10919
10920 for i in 0..n {
10921 let key = format!("asc{:06}", i).into_bytes();
10922 let sr = tree.search(&key);
10923 assert!(
10924 sr.is_some() && sr.unwrap().exact_parent_found,
10925 "key asc{:06} must be findable after ascending inserts",
10926 i
10927 );
10928 }
10929 }
10930
10931 /// insert N keys in
10932 /// descending order and verify the tree height stays bounded (≤ 10 levels)
10933 /// and all keys are findable.
10934 #[test]
10935 fn test_je_descending_insert_balance() {
10936 let n = 128usize;
10937 let tree = Tree::new(1, 8);
10938
10939 for i in (0..n).rev() {
10940 let key = format!("dsc{:06}", i).into_bytes();
10941 tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10942 .unwrap();
10943 }
10944
10945 let stats = tree.collect_stats();
10946 assert!(
10947 stats.height <= 10,
10948 "tree height after {} descending inserts with fanout 8 must be <= 10, got {}",
10949 n,
10950 stats.height
10951 );
10952
10953 for i in 0..n {
10954 let key = format!("dsc{:06}", i).into_bytes();
10955 let sr = tree.search(&key);
10956 assert!(
10957 sr.is_some() && sr.unwrap().exact_parent_found,
10958 "key dsc{:06} must be findable after descending inserts",
10959 i
10960 );
10961 }
10962 }
10963
10964 /// SplitTest invariant: after many splits induced by a small
10965 /// fanout no key is lost.
10966 #[test]
10967 fn test_je_split_no_key_lost() {
10968 let tree = Tree::new(1, 4);
10969 let n = 20usize;
10970
10971 for i in 0..n {
10972 let key = format!("sp{:04}", i).into_bytes();
10973 tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10974 }
10975
10976 for i in 0..n {
10977 let key = format!("sp{:04}", i).into_bytes();
10978 let sr = tree.search(&key);
10979 assert!(
10980 sr.is_some() && sr.unwrap().exact_parent_found,
10981 "key sp{:04} must survive all splits",
10982 i
10983 );
10984 }
10985 }
10986
10987 /// SplitTest invariant: after a BIN split both halves exist and
10988 /// all original keys are findable.
10989 #[test]
10990 fn test_je_split_produces_two_halves() {
10991 // fanout=4: fill one BIN then overflow it to force a split.
10992 let tree = Tree::new(1, 4);
10993 let n = 5usize; // one more than fanout → forces at least one split
10994
10995 for i in 0..n {
10996 let key = format!("half{:04}", i).into_bytes();
10997 tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10998 }
10999
11000 let stats = tree.collect_stats();
11001 assert!(
11002 stats.n_bins >= 2,
11003 "after splitting a full BIN there must be >= 2 BINs, got {}",
11004 stats.n_bins
11005 );
11006
11007 for i in 0..n {
11008 let key = format!("half{:04}", i).into_bytes();
11009 let sr = tree.search(&key);
11010 assert!(
11011 sr.is_some() && sr.unwrap().exact_parent_found,
11012 "key half{:04} must be findable in one of the two halves",
11013 i
11014 );
11015 }
11016 }
11017
11018 /// SplitTest invariant: root splits are tracked and the tree
11019 /// grows in height as keys accumulate.
11020 #[test]
11021 fn test_je_root_split_creates_new_root() {
11022 // fanout=4, 20 keys: forces multiple root splits.
11023 let tree = Tree::new(1, 4);
11024
11025 for i in 0u32..20 {
11026 let key = format!("rs{:04}", i).into_bytes();
11027 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
11028 }
11029
11030 assert!(
11031 tree.get_root_splits() > 0,
11032 "expected at least one root split after 20 inserts with fanout 4"
11033 );
11034
11035 let stats = tree.collect_stats();
11036 assert!(
11037 stats.height >= 3,
11038 "tree must be at least 3 levels tall after root splits, got {}",
11039 stats.height
11040 );
11041
11042 // Every inserted key must still be findable.
11043 for i in 0u32..20 {
11044 let key = format!("rs{:04}", i).into_bytes();
11045 let sr = tree.search(&key);
11046 assert!(
11047 sr.is_some() && sr.unwrap().exact_parent_found,
11048 "key rs{:04} must be findable after root splits",
11049 i
11050 );
11051 }
11052 }
11053
11054 // ========================================================================
11055 // Tests: compress_bin / maybe_compress_bin_and_parent
11056 // INCompressor.compressBin / lazyCompress tests
11057 // ========================================================================
11058
11059 /// compress_bin removes known-deleted slots from a BIN.
11060 ///
11061 /// INCompressor.compressBin(): after compression, slots with
11062 /// `known_deleted = true` must be gone and the BIN must be dirty.
11063 #[test]
11064 fn test_compress_bin_removes_deleted_slots() {
11065 let _lsn = Lsn::new(1, 1);
11066 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11067 node_id: generate_node_id(),
11068 level: BIN_LEVEL,
11069 entries: vec![
11070 BinEntry {
11071 data: Some(b"live".to_vec()),
11072 known_deleted: false,
11073 dirty: false,
11074 expiration_time: 0,
11075 },
11076 BinEntry {
11077 data: None,
11078 known_deleted: true,
11079 dirty: false,
11080 expiration_time: 0,
11081 },
11082 BinEntry {
11083 data: Some(b"live2".to_vec()),
11084 known_deleted: false,
11085 dirty: false,
11086 expiration_time: 0,
11087 },
11088 BinEntry {
11089 data: None,
11090 known_deleted: true,
11091 dirty: false,
11092 expiration_time: 0,
11093 },
11094 ],
11095 key_prefix: Vec::new(),
11096 dirty: false,
11097 is_delta: false,
11098 last_full_lsn: NULL_LSN,
11099 last_delta_lsn: NULL_LSN,
11100 generation: 0,
11101 parent: None,
11102 expiration_in_hours: true,
11103 cursor_count: 0,
11104 prohibit_next_delta: false,
11105 lsn_rep: LsnRep::Empty,
11106 keys: KeyRep::from_keys(vec![
11107 b"a".to_vec(),
11108 b"b".to_vec(),
11109 b"c".to_vec(),
11110 b"d".to_vec(),
11111 ]),
11112 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11113 })));
11114
11115 // Wire a minimal parent IN so compress_bin can prune if needed.
11116 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11117 node_id: generate_node_id(),
11118 level: MAIN_LEVEL | 2,
11119 entries: vec![InEntry { key: vec![] }],
11120 targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11121 dirty: false,
11122 generation: 0,
11123 parent: None,
11124 lsn_rep: LsnRep::Empty,
11125 })));
11126 {
11127 let mut g = bin_arc.write();
11128 g.set_parent(Some(Arc::downgrade(&root_arc)));
11129 }
11130
11131 let tree = Tree::new(1, 128);
11132 *tree.root.write() = Some(root_arc);
11133
11134 let result = tree.compress_bin(&bin_arc);
11135 assert!(
11136 result,
11137 "compress_bin must return true when slots were removed"
11138 );
11139
11140 let g = bin_arc.read();
11141 match &*g {
11142 TreeNode::Bottom(b) => {
11143 assert_eq!(
11144 b.entries.len(),
11145 2,
11146 "2 live entries must remain after compress"
11147 );
11148 assert!(
11149 b.entries.iter().all(|e| !e.known_deleted),
11150 "no deleted slots must remain"
11151 );
11152 assert!(b.dirty, "BIN must be dirty after compression");
11153 }
11154 _ => panic!("expected BIN"),
11155 }
11156 }
11157
11158 /// IC-3 HEADLINE (fail-pre / pass-post): the compressor must SKIP a
11159 /// `known_deleted` slot that is still write-locked by an in-flight txn,
11160 /// while removing committed/unlocked `known_deleted` slots in the SAME
11161 /// BIN. Mirrors JE `BIN.compress` (BIN.java:1141-1172), which calls
11162 /// `lockManager.isLockUncontended(lsn)` and does `continue` on a contended
11163 /// slot.
11164 ///
11165 /// Pre-fix: `compress_bin` had no lock check, so a write-locked tombstone
11166 /// would have been physically removed (the slot a live txn references is
11167 /// gone -> corruption). Post-fix: the `is_locked` predicate keeps it.
11168 #[test]
11169 fn test_ic3_compress_skips_write_locked_slot() {
11170 // Slot 1 (key "b", lsn 1:200) is a write-locked tombstone; slot 3
11171 // (key "d", lsn 1:400) is a committed/unlocked tombstone. Slots 0
11172 // and 2 are live.
11173 let locked_lsn = Lsn::new(1, 200);
11174 let unlocked_lsn = Lsn::new(1, 400);
11175 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11176 node_id: generate_node_id(),
11177 level: BIN_LEVEL,
11178 entries: vec![
11179 BinEntry {
11180 data: Some(b"live".to_vec()),
11181 known_deleted: false,
11182 dirty: false,
11183 expiration_time: 0,
11184 },
11185 BinEntry {
11186 data: None,
11187 known_deleted: true, // write-locked tombstone -> KEEP
11188 dirty: false,
11189 expiration_time: 0,
11190 },
11191 BinEntry {
11192 data: Some(b"live2".to_vec()),
11193 known_deleted: false,
11194 dirty: false,
11195 expiration_time: 0,
11196 },
11197 BinEntry {
11198 data: None,
11199 known_deleted: true, // committed tombstone -> REMOVE
11200 dirty: false,
11201 expiration_time: 0,
11202 },
11203 ],
11204 key_prefix: Vec::new(),
11205 dirty: false,
11206 is_delta: false,
11207 last_full_lsn: NULL_LSN,
11208 last_delta_lsn: NULL_LSN,
11209 generation: 0,
11210 parent: None,
11211 expiration_in_hours: true,
11212 cursor_count: 0,
11213 prohibit_next_delta: false,
11214 lsn_rep: LsnRep::from_lsns(&[
11215 Lsn::new(1, 100),
11216 locked_lsn,
11217 Lsn::new(1, 300),
11218 unlocked_lsn,
11219 ]),
11220 keys: KeyRep::from_keys(vec![
11221 b"a".to_vec(),
11222 b"b".to_vec(),
11223 b"c".to_vec(),
11224 b"d".to_vec(),
11225 ]),
11226 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11227 })));
11228 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11229 node_id: generate_node_id(),
11230 level: MAIN_LEVEL | 2,
11231 entries: vec![InEntry { key: vec![] }],
11232 targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11233 dirty: false,
11234 generation: 0,
11235 parent: None,
11236 lsn_rep: LsnRep::Empty,
11237 })));
11238 {
11239 let mut g = bin_arc.write();
11240 g.set_parent(Some(Arc::downgrade(&root_arc)));
11241 }
11242 let tree = Tree::new(1, 128);
11243 *tree.root.write() = Some(root_arc);
11244
11245 // Predicate: only `locked_lsn` is write-locked (stub LockManager).
11246 let locked_u64 = locked_lsn.as_u64();
11247 let is_locked = move |lsn: u64| lsn == locked_u64;
11248
11249 let result =
11250 tree.compress_bin_with_lock_check(&bin_arc, Some(&is_locked));
11251 assert!(result, "compress removed the unlocked tombstone -> true");
11252
11253 let g = bin_arc.read();
11254 match &*g {
11255 TreeNode::Bottom(b) => {
11256 // 2 live + 1 write-locked tombstone kept; the committed
11257 // tombstone (lsn 1:400) removed.
11258 assert_eq!(
11259 b.entries.len(),
11260 3,
11261 "write-locked tombstone must be KEPT; only the unlocked one removed"
11262 );
11263 let kept_locked = (0..b.entries.len()).any(|i| {
11264 b.entries[i].known_deleted && b.get_lsn(i) == locked_lsn
11265 });
11266 assert!(kept_locked, "the write-locked tombstone must remain");
11267 let unlocked_gone =
11268 (0..b.entries.len()).all(|i| b.get_lsn(i) != unlocked_lsn);
11269 assert!(
11270 unlocked_gone,
11271 "the unlocked tombstone must be removed"
11272 );
11273 }
11274 _ => panic!("expected BIN"),
11275 }
11276 }
11277
11278 /// IC-3 (no predicate): with `is_locked = None` behavior is unchanged —
11279 /// ALL `known_deleted` slots are removed (the historical safe path).
11280 #[test]
11281 fn test_ic3_compress_no_predicate_removes_all_tombstones() {
11282 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11283 node_id: generate_node_id(),
11284 level: BIN_LEVEL,
11285 entries: vec![
11286 BinEntry {
11287 data: Some(b"live".to_vec()),
11288 known_deleted: false,
11289 dirty: false,
11290 expiration_time: 0,
11291 },
11292 BinEntry {
11293 data: None,
11294 known_deleted: true,
11295 dirty: false,
11296 expiration_time: 0,
11297 },
11298 BinEntry {
11299 data: None,
11300 known_deleted: true,
11301 dirty: false,
11302 expiration_time: 0,
11303 },
11304 ],
11305 key_prefix: Vec::new(),
11306 dirty: false,
11307 is_delta: false,
11308 last_full_lsn: NULL_LSN,
11309 last_delta_lsn: NULL_LSN,
11310 generation: 0,
11311 parent: None,
11312 expiration_in_hours: true,
11313 cursor_count: 0,
11314 prohibit_next_delta: false,
11315 lsn_rep: LsnRep::from_lsns(&[
11316 Lsn::new(1, 100),
11317 Lsn::new(1, 200),
11318 Lsn::new(1, 300),
11319 ]),
11320 keys: KeyRep::from_keys(vec![
11321 b"a".to_vec(),
11322 b"b".to_vec(),
11323 b"c".to_vec(),
11324 ]),
11325 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11326 })));
11327 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11328 node_id: generate_node_id(),
11329 level: MAIN_LEVEL | 2,
11330 entries: vec![InEntry { key: vec![] }],
11331 targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11332 dirty: false,
11333 generation: 0,
11334 parent: None,
11335 lsn_rep: LsnRep::Empty,
11336 })));
11337 {
11338 let mut g = bin_arc.write();
11339 g.set_parent(Some(Arc::downgrade(&root_arc)));
11340 }
11341 let tree = Tree::new(1, 128);
11342 *tree.root.write() = Some(root_arc);
11343
11344 let result = tree.compress_bin(&bin_arc); // None predicate path
11345 assert!(result, "all tombstones removed -> true");
11346 let g = bin_arc.read();
11347 match &*g {
11348 TreeNode::Bottom(b) => {
11349 assert_eq!(b.entries.len(), 1, "only the live slot remains");
11350 assert!(b.entries.iter().all(|e| !e.known_deleted));
11351 }
11352 _ => panic!("expected BIN"),
11353 }
11354 }
11355
11356 /// compress_bin on a BIN with no deleted slots returns false.
11357 ///
11358 /// INCompressor: if no slots were removed, compression made no
11359 /// progress and returns false.
11360 #[test]
11361 fn test_compress_bin_no_deleted_slots_returns_false() {
11362 let _lsn = Lsn::new(1, 1);
11363 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11364 node_id: generate_node_id(),
11365 level: BIN_LEVEL,
11366 entries: vec![BinEntry {
11367 data: Some(b"d".to_vec()),
11368 known_deleted: false,
11369 dirty: false,
11370 expiration_time: 0,
11371 }],
11372 key_prefix: Vec::new(),
11373 dirty: false,
11374 is_delta: false,
11375 last_full_lsn: NULL_LSN,
11376 last_delta_lsn: NULL_LSN,
11377 generation: 0,
11378 parent: None,
11379 expiration_in_hours: true,
11380 cursor_count: 0,
11381 prohibit_next_delta: false,
11382 lsn_rep: LsnRep::Empty,
11383 keys: KeyRep::from_keys(vec![b"x".to_vec()]),
11384 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11385 })));
11386
11387 let tree = Tree::new(1, 128);
11388 let result = tree.compress_bin(&bin_arc);
11389 assert!(
11390 !result,
11391 "compress_bin must return false when no slots were removed"
11392 );
11393 }
11394
11395 /// compress_bin on a BIN-delta is a no-op.
11396 ///
11397 /// INCompressor.compressBin(): "if (bin.isBINDelta()) return".
11398 #[test]
11399 fn test_compress_bin_skips_delta() {
11400 let _lsn = Lsn::new(1, 1);
11401 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11402 node_id: generate_node_id(),
11403 level: BIN_LEVEL,
11404 entries: vec![BinEntry {
11405 data: None,
11406 known_deleted: true,
11407 dirty: false,
11408 expiration_time: 0,
11409 }],
11410 key_prefix: Vec::new(),
11411 dirty: false,
11412 is_delta: true, // delta BIN — must be skipped
11413 last_full_lsn: NULL_LSN,
11414 last_delta_lsn: NULL_LSN,
11415 generation: 0,
11416 parent: None,
11417 expiration_in_hours: true,
11418 cursor_count: 0,
11419 prohibit_next_delta: false,
11420 lsn_rep: LsnRep::Empty,
11421 keys: KeyRep::from_keys(vec![b"k".to_vec()]),
11422 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11423 })));
11424
11425 let tree = Tree::new(1, 128);
11426 let result = tree.compress_bin(&bin_arc);
11427 assert!(!result, "compress_bin must not compress a BIN-delta");
11428
11429 // The slot must still be there.
11430 let g = bin_arc.read();
11431 match &*g {
11432 TreeNode::Bottom(b) => assert_eq!(
11433 b.entries.len(),
11434 1,
11435 "slot must not be removed from delta"
11436 ),
11437 _ => panic!("expected BIN"),
11438 }
11439 }
11440
11441 /// compress_bin prunes an empty BIN from the tree.
11442 ///
11443 /// INCompressor.pruneBIN(): when all slots are deleted and
11444 /// compression empties the BIN, it must be removed from the parent IN.
11445 #[test]
11446 fn test_compress_bin_prunes_empty_bin() {
11447 let _lsn = Lsn::new(1, 1);
11448 // Insert a live key so the tree can be searched to prune.
11449 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11450 node_id: generate_node_id(),
11451 level: BIN_LEVEL,
11452 entries: vec![BinEntry {
11453 data: None,
11454 known_deleted: true,
11455 dirty: false,
11456 expiration_time: 0,
11457 }],
11458 key_prefix: Vec::new(),
11459 dirty: false,
11460 is_delta: false,
11461 last_full_lsn: NULL_LSN,
11462 last_delta_lsn: NULL_LSN,
11463 generation: 0,
11464 parent: None,
11465 expiration_in_hours: true,
11466 cursor_count: 0,
11467 prohibit_next_delta: false,
11468 lsn_rep: LsnRep::Empty,
11469 keys: KeyRep::from_keys(vec![b"only".to_vec()]),
11470 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11471 })));
11472
11473 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11474 node_id: generate_node_id(),
11475 level: MAIN_LEVEL | 2,
11476 entries: vec![InEntry { key: vec![] }],
11477 targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11478 dirty: false,
11479 generation: 0,
11480 parent: None,
11481 lsn_rep: LsnRep::Empty,
11482 })));
11483 {
11484 let mut g = bin_arc.write();
11485 g.set_parent(Some(Arc::downgrade(&root_arc)));
11486 }
11487
11488 let tree = Tree::new(1, 128);
11489 *tree.root.write() = Some(root_arc);
11490
11491 let result = tree.compress_bin(&bin_arc);
11492 assert!(result, "compress_bin must return true when pruning");
11493
11494 // BIN must be empty after compression.
11495 let g = bin_arc.read();
11496 match &*g {
11497 TreeNode::Bottom(b) => {
11498 assert_eq!(b.entries.len(), 0, "all slots must be removed")
11499 }
11500 _ => panic!("expected BIN"),
11501 }
11502 }
11503
11504 /// maybe_compress_bin_and_parent returns false when no deleted slots exist.
11505 ///
11506 /// INCompressor.lazyCompress(): skip BINs with no defunct slots.
11507 #[test]
11508 fn test_maybe_compress_skips_clean_bin() {
11509 let _lsn = Lsn::new(1, 1);
11510 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11511 node_id: generate_node_id(),
11512 level: BIN_LEVEL,
11513 entries: vec![BinEntry {
11514 data: Some(b"v".to_vec()),
11515 known_deleted: false,
11516 dirty: false,
11517 expiration_time: 0,
11518 }],
11519 key_prefix: Vec::new(),
11520 dirty: false,
11521 is_delta: false,
11522 last_full_lsn: NULL_LSN,
11523 last_delta_lsn: NULL_LSN,
11524 generation: 0,
11525 parent: None,
11526 expiration_in_hours: true,
11527 cursor_count: 0,
11528 prohibit_next_delta: false,
11529 lsn_rep: LsnRep::Empty,
11530 keys: KeyRep::from_keys(vec![b"live".to_vec()]),
11531 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11532 })));
11533
11534 let tree = Tree::new(1, 128);
11535 let result = tree.maybe_compress_bin_and_parent(&bin_arc);
11536 assert!(
11537 !result,
11538 "maybe_compress must return false when no deleted slots exist"
11539 );
11540 }
11541
11542 /// maybe_compress_bin_and_parent triggers compression when deleted slots exist.
11543 ///
11544 /// INCompressor.lazyCompress(): when defunct slots are found,
11545 /// call bin.compress() to remove them.
11546 #[test]
11547 fn test_maybe_compress_triggers_when_deleted_slots_exist() {
11548 let _lsn = Lsn::new(1, 1);
11549 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11550 node_id: generate_node_id(),
11551 level: BIN_LEVEL,
11552 entries: vec![
11553 BinEntry {
11554 data: Some(b"v".to_vec()),
11555 known_deleted: false,
11556 dirty: false,
11557 expiration_time: 0,
11558 },
11559 BinEntry {
11560 data: None,
11561 known_deleted: true,
11562 dirty: false,
11563 expiration_time: 0,
11564 },
11565 ],
11566 key_prefix: Vec::new(),
11567 dirty: false,
11568 is_delta: false,
11569 last_full_lsn: NULL_LSN,
11570 last_delta_lsn: NULL_LSN,
11571 generation: 0,
11572 parent: None,
11573 expiration_in_hours: true,
11574 cursor_count: 0,
11575 prohibit_next_delta: false,
11576 lsn_rep: LsnRep::Empty,
11577 keys: KeyRep::from_keys(vec![b"live".to_vec(), b"dead".to_vec()]),
11578 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11579 })));
11580
11581 let tree = Tree::new(1, 128);
11582 let result = tree.maybe_compress_bin_and_parent(&bin_arc);
11583 assert!(
11584 result,
11585 "maybe_compress must return true when deleted slots were removed"
11586 );
11587
11588 let g = bin_arc.read();
11589 match &*g {
11590 TreeNode::Bottom(b) => {
11591 assert_eq!(b.entries.len(), 1, "only live entry must remain");
11592 assert_eq!(b.get_full_key(0).unwrap(), b"live");
11593 }
11594 _ => panic!("expected BIN"),
11595 }
11596 }
11597
11598 // ========================================================================
11599 // Tests: INCompressorTest / EmptyBINTest ports
11600 // INCompressorTest (compress_bin semantics, prefix recompute, live-slot preservation)
11601 // EmptyBINTest (empty-BIN scan, all-deleted compress, search returns NotFound)
11602 // ========================================================================
11603
11604 ///
11605 /// Insert two live keys and one deleted key into a BIN wired into a tree.
11606 /// After compress_bin the deleted slot must be gone; the live slots remain.
11607 /// The parent IN entry count must not change.
11608 #[test]
11609 fn test_incompressor_live_slots_preserved_after_compress() {
11610 let _lsn = Lsn::new(1, 100);
11611
11612 // BIN with 3 entries: two live, one known-deleted.
11613 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11614 node_id: generate_node_id(),
11615 level: BIN_LEVEL,
11616 entries: vec![
11617 BinEntry {
11618 data: Some(b"d0".to_vec()),
11619 known_deleted: false,
11620 dirty: false,
11621 expiration_time: 0,
11622 },
11623 BinEntry {
11624 data: Some(b"d1".to_vec()),
11625 known_deleted: false,
11626 dirty: false,
11627 expiration_time: 0,
11628 },
11629 BinEntry {
11630 data: None,
11631 known_deleted: true,
11632 dirty: false,
11633 expiration_time: 0,
11634 },
11635 ],
11636 key_prefix: Vec::new(),
11637 dirty: false,
11638 is_delta: false,
11639 last_full_lsn: NULL_LSN,
11640 last_delta_lsn: NULL_LSN,
11641 generation: 0,
11642 parent: None,
11643 expiration_in_hours: true,
11644 cursor_count: 0,
11645 prohibit_next_delta: false,
11646 lsn_rep: LsnRep::Empty,
11647 keys: KeyRep::from_keys(vec![
11648 b"\x00".to_vec(),
11649 b"\x01".to_vec(),
11650 b"\x02".to_vec(),
11651 ]),
11652 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11653 })));
11654
11655 // Parent IN with two children: the BIN above plus a placeholder sibling.
11656 let sibling_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11657 node_id: generate_node_id(),
11658 level: BIN_LEVEL,
11659 entries: vec![BinEntry {
11660 data: Some(b"s".to_vec()),
11661 known_deleted: false,
11662 dirty: false,
11663 expiration_time: 0,
11664 }],
11665 key_prefix: Vec::new(),
11666 dirty: false,
11667 is_delta: false,
11668 last_full_lsn: NULL_LSN,
11669 last_delta_lsn: NULL_LSN,
11670 generation: 0,
11671 parent: None,
11672 expiration_in_hours: true,
11673 cursor_count: 0,
11674 prohibit_next_delta: false,
11675 lsn_rep: LsnRep::Empty,
11676 keys: KeyRep::from_keys(vec![b"\x40".to_vec()]),
11677 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11678 })));
11679
11680 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11681 node_id: generate_node_id(),
11682 level: MAIN_LEVEL | 2,
11683 entries: vec![
11684 InEntry { key: vec![] },
11685 InEntry { key: b"\x40".to_vec() },
11686 ],
11687 targets: TargetRep::Sparse(vec![
11688 (0, bin_arc.clone()),
11689 (1, sibling_arc.clone()),
11690 ]),
11691 dirty: false,
11692 generation: 0,
11693 parent: None,
11694 lsn_rep: LsnRep::Empty,
11695 })));
11696 bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11697 sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11698
11699 let tree = Tree::new(1, 128);
11700 *tree.root.write() = Some(root_arc.clone());
11701
11702 let result = tree.compress_bin(&bin_arc);
11703 assert!(
11704 result,
11705 "compress_bin must return true when a deleted slot was removed"
11706 );
11707
11708 // Exactly 2 live entries must remain.
11709 let g = bin_arc.read();
11710 match &*g {
11711 TreeNode::Bottom(b) => {
11712 assert_eq!(b.entries.len(), 2, "2 live slots must remain");
11713 assert!(
11714 b.entries.iter().all(|e| !e.known_deleted),
11715 "no deleted slots may remain"
11716 );
11717 assert!(b.dirty, "BIN must be dirty after compression");
11718 }
11719 _ => panic!("expected BIN"),
11720 }
11721 drop(g);
11722
11723 // Parent IN must still have 2 entries (BIN was not emptied).
11724 let rg = root_arc.read();
11725 match &*rg {
11726 TreeNode::Internal(n) => {
11727 assert_eq!(
11728 n.entries.len(),
11729 2,
11730 "parent IN must still have 2 entries"
11731 );
11732 }
11733 _ => panic!("expected IN"),
11734 }
11735 }
11736
11737 ///
11738 /// After all slots in a BIN are deleted and compress() is called, the
11739 /// empty BIN must be removed from its parent IN (pruneBIN path).
11740 ///
11741 /// Uses tree.compress() which correctly invokes
11742 /// the pruneBIN / merge logic that removes empty BINs from the parent IN.
11743 #[test]
11744 fn test_incompressor_empty_bin_pruned_from_parent() {
11745 // Use a small node size so that a modest number of inserts produces
11746 // multiple BINs that can be pruned after all-delete.
11747 let tree = Tree::new(1, 4);
11748
11749 // Insert enough keys to create at least 2 BINs.
11750 for i in 0u32..12 {
11751 let key = format!("prune{:04}", i).into_bytes();
11752 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
11753 }
11754
11755 let stats_before = tree.collect_stats();
11756 assert!(stats_before.n_bins >= 2, "need multiple BINs to test pruning");
11757
11758 // Delete all keys in the first BIN (the lexicographically smallest ones).
11759 // This empties that BIN so compress() must prune it from the parent.
11760 for i in 0u32..4 {
11761 let key = format!("prune{:04}", i).into_bytes();
11762 tree.delete(&key);
11763 }
11764
11765 // compress() triggers pruneBIN for the now-empty BIN.
11766 tree.compress();
11767
11768 let stats_after = tree.collect_stats();
11769 assert!(
11770 stats_after.n_bins < stats_before.n_bins,
11771 "compress must reduce BIN count after emptying a BIN (pruneBIN path)"
11772 );
11773
11774 // Remaining keys must still be findable.
11775 for i in 4u32..12 {
11776 let key = format!("prune{:04}", i).into_bytes();
11777 let sr = tree.search(&key);
11778 assert!(
11779 sr.is_some() && sr.unwrap().exact_parent_found,
11780 "key prune{:04} must survive after compress",
11781 i
11782 );
11783 }
11784 }
11785
11786 /// BIN-delta is skipped by maybe_compress.
11787 ///
11788 /// INCompressor.lazyCompress() short-circuits for BIN-deltas:
11789 /// "if (in.isBINDelta()) return false".
11790 #[test]
11791 fn test_incompressor_maybe_compress_skips_bin_delta() {
11792 let _lsn = Lsn::new(1, 1);
11793 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11794 node_id: generate_node_id(),
11795 level: BIN_LEVEL,
11796 entries: vec![BinEntry {
11797 data: None,
11798 known_deleted: true,
11799 dirty: false,
11800 expiration_time: 0,
11801 }],
11802 key_prefix: Vec::new(),
11803 dirty: false,
11804 is_delta: true, // BIN-delta — must be skipped
11805 last_full_lsn: NULL_LSN,
11806 last_delta_lsn: NULL_LSN,
11807 generation: 0,
11808 parent: None,
11809 expiration_in_hours: true,
11810 cursor_count: 0,
11811 prohibit_next_delta: false,
11812 lsn_rep: LsnRep::Empty,
11813 keys: KeyRep::from_keys(vec![b"k".to_vec()]),
11814 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11815 })));
11816
11817 let tree = Tree::new(1, 128);
11818 // maybe_compress must return false without touching the BIN.
11819 assert!(
11820 !tree.maybe_compress_bin_and_parent(&bin_arc),
11821 "maybe_compress must return false for BIN-deltas"
11822 );
11823
11824 // Slot must still be present and still known-deleted.
11825 let g = bin_arc.read();
11826 match &*g {
11827 TreeNode::Bottom(b) => {
11828 assert_eq!(
11829 b.entries.len(),
11830 1,
11831 "slot must not be removed from delta BIN"
11832 );
11833 assert!(b.entries[0].known_deleted);
11834 }
11835 _ => panic!("expected BIN"),
11836 }
11837 }
11838
11839 /// Clean BIN (no deleted slots) is not compressed.
11840 ///
11841 /// INCompressor.lazyCompress() skips BINs that have no defunct slots.
11842 #[test]
11843 fn test_incompressor_clean_bin_not_compressed() {
11844 let _lsn = Lsn::new(1, 1);
11845 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11846 node_id: generate_node_id(),
11847 level: BIN_LEVEL,
11848 entries: vec![
11849 BinEntry {
11850 data: Some(b"a".to_vec()),
11851 known_deleted: false,
11852 dirty: false,
11853 expiration_time: 0,
11854 },
11855 BinEntry {
11856 data: Some(b"b".to_vec()),
11857 known_deleted: false,
11858 dirty: false,
11859 expiration_time: 0,
11860 },
11861 ],
11862 key_prefix: Vec::new(),
11863 dirty: false,
11864 is_delta: false,
11865 last_full_lsn: NULL_LSN,
11866 last_delta_lsn: NULL_LSN,
11867 generation: 0,
11868 parent: None,
11869 expiration_in_hours: true,
11870 cursor_count: 0,
11871 prohibit_next_delta: false,
11872 lsn_rep: LsnRep::Empty,
11873 keys: KeyRep::from_keys(vec![b"\x00".to_vec(), b"\x01".to_vec()]),
11874 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11875 })));
11876
11877 let tree = Tree::new(1, 128);
11878 assert!(
11879 !tree.maybe_compress_bin_and_parent(&bin_arc),
11880 "maybe_compress must return false when no deleted slots exist"
11881 );
11882
11883 // Both entries must remain untouched.
11884 let g = bin_arc.read();
11885 match &*g {
11886 TreeNode::Bottom(b) => {
11887 assert_eq!(b.entries.len(), 2, "no entries should be removed")
11888 }
11889 _ => panic!("expected BIN"),
11890 }
11891 }
11892
/// Keys sharing a common prefix survive compression intact.
///
/// The BIN holds "pfx:a" (known-deleted), "pfx:b" and "pfx:c", stored
/// without an initial key prefix. After `compress_bin` removes the deleted
/// slot, the two survivors must still resolve to their full keys.
///
/// In JE, `BIN.compress()` calls `recalcKeyPrefix()` so the smaller key set
/// may expose a longer common prefix. This test checks the outcome through
/// `get_full_key` only; it does not inspect the stored prefix itself.
#[test]
fn test_incompressor_prefix_recomputed_after_compress() {
    // Builds a clean slot; `known_deleted` slots carry no data in this fixture.
    let slot = |data: Option<Vec<u8>>, known_deleted: bool| BinEntry {
        data,
        known_deleted,
        dirty: false,
        expiration_time: 0,
    };

    // Three keys all starting with "pfx:". After deleting "pfx:a" the
    // remaining two ("pfx:b", "pfx:c") still share "pfx:" as prefix.
    // They are stored as raw keys (empty `key_prefix`) initially.
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![
            slot(None, true),
            slot(Some(b"B".to_vec()), false),
            slot(Some(b"C".to_vec()), false),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![
            b"pfx:a".to_vec(),
            b"pfx:b".to_vec(),
            b"pfx:c".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Wire up a parent so compress_bin can run normally.
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let result = tree.compress_bin(&bin_arc);
    assert!(
        result,
        "compress_bin must return true when one slot was removed"
    );

    let guard = bin_arc.read();
    let TreeNode::Bottom(b) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(b.entries.len(), 2, "2 live slots must remain");
    // The surviving keys are "pfx:b" and "pfx:c". Verify via get_full_key
    // rather than inspecting how prefix and suffixes are stored.
    let k0 = b.get_full_key(0).expect("slot 0 must exist");
    let k1 = b.get_full_key(1).expect("slot 1 must exist");
    assert!(
        (k0 == b"pfx:b" && k1 == b"pfx:c")
            || (k0 == b"pfx:c" && k1 == b"pfx:b"),
        "remaining keys must be pfx:b and pfx:c, got {:?} {:?}",
        k0,
        k1
    );
}
11992
/// Searching a multi-BIN tree: never-inserted keys are not found, inserted
/// keys are.
///
/// Modelled on the EmptyBINTest invariant "Tree search for any deleted key
/// returns NotFound". NOTE: despite the test name, this body does not
/// delete or compress anything; it uses keys that were never inserted as
/// the absent set. The name is kept so existing test filters keep working.
#[test]
fn test_emptybin_search_after_all_deleted_returns_not_found() {
    let lsn = Lsn::new(1, 1);

    // max_entries=4 matches NODE_MAX=4 from EmptyBINTest and is small
    // enough that the eight inserts below force splits.
    let tree = Tree::new(1, 4);

    // Insert keys 0..7 (single-byte keys).
    for i in 0u8..8 {
        tree.insert(vec![i], vec![i + 100], lsn)
            .expect("insert must succeed");
    }

    // Keys that were never inserted stand in for deleted keys here.
    let absent = [b"\xF0".as_ref(), b"\xF1".as_ref(), b"\xF2".as_ref()];
    for key in absent {
        // Not found means either None or a SearchResult with exact=false.
        let found = tree.search(key).is_some_and(|r| r.exact_parent_found);
        assert!(!found, "absent key {:?} must not be found", key);
    }

    // Keys that were inserted must still be findable.
    for i in 0u8..8 {
        let found = tree.search(&[i]).is_some_and(|r| r.exact_parent_found);
        assert!(found, "inserted key {} must be found", i);
    }
}
12037
/// Point lookups across a tree built with a tiny fan-out.
///
/// Twelve single-byte keys are inserted with max_entries=4 to force splits.
/// Every inserted key must then be found, and keys 200..210 (never
/// inserted) must not be.
///
/// NOTE: despite the test name, this body neither empties a BIN nor calls
/// `compress_bin`, and it performs lookups rather than a cursor scan. The
/// name is kept so existing test filters keep working.
#[test]
fn test_emptybin_forward_scan_skips_empty_bin() {
    let lsn = Lsn::new(1, 1);

    // A very small max_entries (4) forces splits quickly.
    let tree = Tree::new(1, 4);
    for i in 0u8..12 {
        tree.insert(vec![i], vec![i + 10], lsn)
            .expect("insert must succeed");
    }

    // All keys 0..12 must be findable.
    for i in 0u8..12 {
        let found = tree.search(&[i]).is_some_and(|r| r.exact_parent_found);
        assert!(found, "key {} must be found before any deletions", i);
    }

    // Keys that were never inserted must not be found.
    for i in 200u8..210 {
        let found = tree.search(&[i]).is_some_and(|r| r.exact_parent_found);
        assert!(
            !found,
            "key {} was never inserted and must not be found",
            i
        );
    }
}
12078
/// A BIN that still holds a live entry after compression is not pruned
/// from its parent.
///
/// This stands in for the JE scenario where a key is re-inserted into an
/// emptied BIN that is already on the compressor queue: `compress_bin` on a
/// BIN with one known-deleted and one live slot must drop only the deleted
/// slot and leave the parent IN with both of its entries.
#[test]
fn test_incompressor_node_not_empty_prevents_prune() {
    // Builds a clean slot; `known_deleted` slots carry no data in this fixture.
    let slot = |data: Option<Vec<u8>>, known_deleted: bool| BinEntry {
        data,
        known_deleted,
        dirty: false,
        expiration_time: 0,
    };
    // Builds a parentless, clean, non-delta BIN from slots and raw keys.
    let make_bin = |entries: Vec<BinEntry>, keys: Vec<Vec<u8>>| {
        Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
            node_id: generate_node_id(),
            level: BIN_LEVEL,
            entries,
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: NULL_LSN,
            last_delta_lsn: NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
            lsn_rep: LsnRep::Empty,
            keys: KeyRep::from_keys(keys),
            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        })))
    };

    // BIN with one deleted and one live entry.
    let bin_arc = make_bin(
        vec![slot(None, true), slot(Some(b"v".to_vec()), false)],
        vec![b"\x00".to_vec(), b"\x01".to_vec()],
    );
    // Sibling BIN with a single live entry.
    let sibling_arc = make_bin(
        vec![slot(Some(b"s".to_vec()), false)],
        vec![b"\x40".to_vec()],
    );

    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![
            InEntry { key: vec![] },
            InEntry { key: b"\x40".to_vec() },
        ],
        targets: TargetRep::Sparse(vec![
            (0, bin_arc.clone()),
            (1, sibling_arc.clone()),
        ]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
    sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc.clone());

    let result = tree.compress_bin(&bin_arc);
    assert!(
        result,
        "compress_bin must return true when one slot was removed"
    );

    // The live entry must remain. The guard is scoped so it is released
    // before the parent is inspected.
    {
        let guard = bin_arc.read();
        let TreeNode::Bottom(b) = &*guard else {
            panic!("expected BIN")
        };
        assert_eq!(b.entries.len(), 1, "one live slot must remain");
        assert_eq!(b.get_full_key(0).unwrap(), b"\x01");
    }

    // Parent IN must NOT have lost the BIN entry — the BIN is still non-empty.
    let root_guard = root_arc.read();
    let TreeNode::Internal(n) = &*root_guard else {
        panic!("expected IN")
    };
    assert_eq!(
        n.entries.len(),
        2,
        "parent IN must still have 2 entries (BIN was not emptied)"
    );
}
12199
/// Compressing a BIN with interleaved live and deleted slots removes every
/// deleted slot and keeps every live one.
///
/// In JE, `BIN.isDefunct(i)` is true for both KNOWN_DELETED and
/// PENDING_DELETED slots and compression must remove all defunct slots.
/// NOTE: this fixture only sets `known_deleted`; no pending-deleted state
/// is constructed here, so only the known-deleted half is exercised.
#[test]
fn test_incompressor_known_and_pending_deleted_removed() {
    // Live, clean slot holding `payload`.
    let live = |payload: &[u8]| BinEntry {
        data: Some(payload.to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    // Known-deleted slot with no data.
    let deleted = || BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        // Slots 0 and 2 are live; slots 1 and 3 are known-deleted.
        entries: vec![live(b"live"), deleted(), live(b"also-live"), deleted()],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![
            b"\x00".to_vec(),
            b"\x01".to_vec(),
            b"\x02".to_vec(),
            b"\x03".to_vec(),
        ]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let result = tree.compress_bin(&bin_arc);
    assert!(result, "compress_bin must return true");

    let guard = bin_arc.read();
    let TreeNode::Bottom(b) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(
        b.entries.len(),
        2,
        "only the 2 live entries must remain"
    );
    assert!(
        b.entries.iter().all(|e| !e.known_deleted),
        "no deleted entries must remain after compression"
    );
}
12296
12297 // =========================================================================
12298 // P1: Concurrent stress tests for single-pass latch-coupling in search()
12299 // =========================================================================
12300
/// Readers and a writer sharing one tree must neither panic nor deadlock.
///
/// Four reader threads search the 50 pre-populated keys while one writer
/// thread inserts keys 50..100; afterwards all 100 keys must be found.
///
/// NOTE(review): the writer takes the outer `std::sync::RwLock` write guard
/// once and holds it across its whole insert loop, so reader searches can
/// overlap each other but not an in-progress insert. Confirm whether that
/// is the intended amount of contention for the latch-coupling path.
#[test]
fn test_concurrent_search_while_inserting() {
    use std::sync::{Arc, Barrier};
    use std::thread;

    /// Zero-padded 8-digit decimal key for `i`.
    fn key_for(i: u32) -> Vec<u8> {
        format!("{:08}", i).into_bytes()
    }

    // The tree sits behind std::sync::RwLock to match the DatabaseImpl
    // usage pattern (DatabaseImpl holds Tree behind an RwLock).
    let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));

    // Seed 50 entries so the tree spans multiple BINs.
    {
        let guard = tree.write().unwrap();
        for i in 0u32..50 {
            guard
                .insert(key_for(i), vec![i as u8], noxu_util::NULL_LSN)
                .unwrap();
        }
    }

    // All five threads (4 readers + 1 writer) start together.
    let barrier = Arc::new(Barrier::new(5));

    // Readers: each one looks up every pre-populated key. The result is
    // deliberately ignored inside the thread; presence is asserted after
    // joining so that any failure message stays clean.
    let mut handles: Vec<thread::JoinHandle<()>> = (0..4)
        .map(|_| {
            let shared = Arc::clone(&tree);
            let gate = Arc::clone(&barrier);
            thread::spawn(move || {
                gate.wait();
                for i in 0u32..50 {
                    let key = key_for(i);
                    let guard = shared.read().unwrap();
                    let _ = guard.search(&key);
                }
            })
        })
        .collect();

    // Writer: inserts keys 50–99 under a single write guard.
    handles.push({
        let shared = Arc::clone(&tree);
        let gate = Arc::clone(&barrier);
        thread::spawn(move || {
            gate.wait();
            let guard = shared.write().unwrap();
            for i in 50u32..100 {
                guard
                    .insert(key_for(i), vec![i as u8], noxu_util::NULL_LSN)
                    .unwrap();
            }
        })
    });

    handles
        .into_iter()
        .for_each(|h| h.join().expect("thread panicked"));

    // After all threads finish, all 100 keys must be present.
    let guard = tree.read().unwrap();
    for i in 0u32..100 {
        let found = guard
            .search(&key_for(i))
            .is_some_and(|r| r.exact_parent_found);
        assert!(
            found,
            "key {:08} should be found after concurrent insert",
            i,
        );
    }
}
12377
/// Eight reader threads searching the same 100-key tree must not panic.
///
/// Pure read concurrency should be safe with or without the single-pass
/// fix; this test is a regression guard. Search results are ignored; only
/// clean thread completion is checked.
#[test]
fn test_concurrent_searches_no_panic() {
    use std::sync::Arc;
    use std::thread;

    let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));
    {
        let guard = tree.write().unwrap();
        for i in 0u32..100 {
            let key = format!("{:08}", i).into_bytes();
            guard.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
        }
    }

    let mut readers = Vec::with_capacity(8);
    for _ in 0..8 {
        let shared = Arc::clone(&tree);
        readers.push(thread::spawn(move || {
            for i in 0u32..100 {
                let key = format!("{:08}", i).into_bytes();
                let guard = shared.read().unwrap();
                let _ = guard.search(&key);
            }
        }));
    }

    for reader in readers {
        reader.join().expect("thread panicked");
    }
}
12412
12413 // ========================================================================
12414 // Tests: BIN-delta — dirty tracking, serialise, collect
12415 // ========================================================================
12416
/// A BIN built by `make_bin_for_delta_tests` starts with no dirty slots.
#[test]
fn test_dirty_count_zero_on_fresh_bin() {
    let slots = vec![
        (b"a".to_vec(), Lsn::new(1, 1), Some(b"v1".to_vec())),
        (b"b".to_vec(), Lsn::new(1, 2), Some(b"v2".to_vec())),
    ];
    let fresh = make_bin_for_delta_tests(slots);
    assert_eq!(fresh.dirty_count(), 0);
}
12425
/// Inserting into an empty BIN via `insert_with_prefix` yields one dirty
/// slot.
#[test]
fn test_insert_marks_slot_dirty() {
    // Empty, clean, non-delta BIN with no parent.
    let mut empty_bin = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        generation: 0,
        cursor_count: 0,
        parent: None,
        expiration_in_hours: true,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    empty_bin.insert_with_prefix(
        b"key".to_vec(),
        Lsn::new(1, 10),
        Some(b"val".to_vec()),
    );

    assert_eq!(empty_bin.dirty_count(), 1, "new slot should be dirty");
    assert!(empty_bin.entries[0].dirty);
}
12451
/// Re-inserting an existing key via `insert_with_prefix` marks that slot
/// dirty.
///
/// The BIN starts with a single clean slot under key "key"; after writing a
/// new value for the same key the slot must be dirty and `dirty_count` must
/// be 1.
#[test]
fn test_update_marks_slot_dirty() {
    let mut bin = BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries: vec![BinEntry {
            data: Some(b"old".to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"key".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Same key, new value and LSN: this is the update path.
    bin.insert_with_prefix(
        b"key".to_vec(),
        Lsn::new(1, 20),
        Some(b"new".to_vec()),
    );

    assert!(bin.entries[0].dirty, "updated slot should be dirty");
    assert_eq!(bin.dirty_count(), 1);
}
12486
/// `serialize_full` header and `clear_dirty_after_full_log` bookkeeping.
///
/// Decodes bytes 0..8 of the serialized form as a big-endian node id and
/// bytes 8..12 as a big-endian entry count, then checks that clearing after
/// a full log zeroes the dirty count, records the LSN in `last_full_lsn`
/// and clears the BIN-level dirty flag. The payload after the header is not
/// decoded here.
#[test]
fn test_serialize_full_roundtrip() {
    let first = BinEntry {
        data: Some(b"d1".to_vec()),
        known_deleted: false,
        dirty: true,
        expiration_time: 0,
    };
    let second = BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let mut bin = BinStub {
        node_id: 42,
        level: BIN_LEVEL,
        entries: vec![first, second],
        keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        generation: 0,
        cursor_count: 0,
        parent: None,
        expiration_in_hours: true,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let bytes = bin.serialize_full();
    let node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
    let n_entries = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
    assert_eq!(node_id, 42);
    assert_eq!(n_entries, 2);

    let logged_at = Lsn::new(2, 1);
    bin.clear_dirty_after_full_log(logged_at);
    assert_eq!(bin.dirty_count(), 0);
    assert_eq!(bin.last_full_lsn, Lsn::new(2, 1));
    assert!(!bin.dirty);
}
12530
/// `serialize_delta` emits only the dirty slot.
///
/// Of three slots only the middle one is dirty. The serialized delta must
/// carry node id 7 (bytes 0..8, big-endian), a dirty count of 1 (bytes
/// 8..12) and a value of 1 in bytes 12..16, which this test reads as the
/// slot index. `clear_dirty_after_delta_log` must then zero the dirty count
/// without touching `last_full_lsn`.
#[test]
fn test_serialize_delta_only_dirty_slots() {
    // Live slot with the given payload and dirty flag.
    let slot = |payload: &[u8], dirty: bool| BinEntry {
        data: Some(payload.to_vec()),
        known_deleted: false,
        dirty,
        expiration_time: 0,
    };
    let mut bin = BinStub {
        node_id: 7,
        level: BIN_LEVEL,
        entries: vec![slot(b"v1", false), slot(b"v2", true), slot(b"v3", false)],
        keys: KeyRep::from_keys(vec![
            b"a".to_vec(),
            b"b".to_vec(),
            b"c".to_vec(),
        ]),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        generation: 0,
        cursor_count: 0,
        parent: None,
        expiration_in_hours: true,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    let bytes = bin.serialize_delta();
    let node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
    let n_dirty = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
    assert_eq!(node_id, 7);
    assert_eq!(n_dirty, 1);
    let slot_idx = u32::from_be_bytes(bytes[12..16].try_into().unwrap());
    assert_eq!(slot_idx, 1);

    bin.clear_dirty_after_delta_log();
    assert_eq!(bin.dirty_count(), 0);
    assert_eq!(
        bin.last_full_lsn, NULL_LSN,
        "last_full_lsn unchanged by delta"
    );
}
12588
/// `collect_dirty_bins` reports BINs dirtied by inserts and stops reporting
/// them once `clear_dirty_after_full_log` has been applied to each.
#[test]
fn test_collect_dirty_bins_returns_dirty_bins_only() {
    let tree = Tree::new(1, 256);
    tree.insert(b"k1".to_vec(), b"v1".to_vec(), Lsn::new(1, 1)).unwrap();
    tree.insert(b"k2".to_vec(), b"v2".to_vec(), Lsn::new(1, 2)).unwrap();

    let first_pass = tree.collect_dirty_bins(1);
    assert!(!first_pass.is_empty(), "should have dirty BINs after inserts");

    // Pretend every collected BIN was just logged in full.
    first_pass.iter().for_each(|(_, bin_arc)| {
        let mut guard = bin_arc.write();
        if let TreeNode::Bottom(b) = &mut *guard {
            b.clear_dirty_after_full_log(Lsn::new(1, 100));
        }
    });

    let second_pass = tree.collect_dirty_bins(1);
    assert!(second_pass.is_empty(), "no dirty BINs after clearing");
}
12606
12607 fn make_bin_for_delta_tests(
12608 entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
12609 ) -> BinStub {
12610 let lsns: Vec<Lsn> = entries.iter().map(|(_, l, _)| *l).collect();
12611 let keys: Vec<Vec<u8>> =
12612 entries.iter().map(|(k, _, _)| k.clone()).collect();
12613 BinStub {
12614 node_id: 1,
12615 level: BIN_LEVEL,
12616 entries: entries
12617 .into_iter()
12618 .map(|(_key, _lsn, data)| BinEntry {
12619 data,
12620 known_deleted: false,
12621 dirty: false,
12622 expiration_time: 0,
12623 })
12624 .collect(),
12625 key_prefix: Vec::new(),
12626 dirty: false,
12627 is_delta: false,
12628 last_full_lsn: NULL_LSN,
12629 last_delta_lsn: NULL_LSN,
12630 generation: 0,
12631 parent: None,
12632 expiration_in_hours: true,
12633 cursor_count: 0,
12634 prohibit_next_delta: false,
12635 lsn_rep: LsnRep::from_lsns(&lsns),
12636 keys: KeyRep::from_keys(keys),
12637 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12638 }
12639 }
12640
12641 // ========================================================================
12642 // T-17: BinStub::should_log_delta — faithful JE BIN.shouldLogDelta
12643 // (BIN.java:1892). These pin the COUNT-based decision against the
12644 // CONFIGURABLE percent (not a dirty-fraction-vs-hardcoded-0.25 heuristic),
12645 // plus the isBINDelta fast path, the numDeltas<=0 guard, and the
12646 // isDeltaProhibited / lastFullLsn==NULL bound.
12647 // ========================================================================
12648
12649 /// Build a full (non-delta) BIN with `n` slots, the first `dirty` of them
12650 /// marked dirty, and a non-NULL last_full_lsn (so a delta is permitted).
12651 fn bin_with_dirty(n: usize, dirty: usize) -> BinStub {
12652 let mut bin = make_bin_for_delta_tests(
12653 (0..n)
12654 .map(|i| {
12655 (
12656 format!("{:04}", i).into_bytes(),
12657 Lsn::new(1, i as u32 + 1),
12658 Some(vec![i as u8]),
12659 )
12660 })
12661 .collect(),
12662 );
12663 bin.last_full_lsn = Lsn::new(1, 1); // a prior full exists
12664 for e in bin.entries.iter_mut().take(dirty) {
12665 e.dirty = true;
12666 }
12667 bin
12668 }
12669
/// The delta decision compares a dirty-slot COUNT against a limit derived
/// from the CONFIGURABLE percent: with percent=10 and 100 slots the limit
/// is 100*10/100 = 10, so 10 dirty slots give a delta and 11 give a full.
///
/// This is the core T-17 reproduction: the OLD checkpointer decision used a
/// hardcoded `dirty/total <= 0.25`, under which 11/100 = 11% would have
/// (wrongly) logged a DELTA. Against percent=10 it must be a FULL BIN.
#[test]
fn should_log_delta_is_count_based_and_configurable() {
    // Exactly at the limit → delta.
    let at_limit = bin_with_dirty(100, 10).should_log_delta(10);
    assert!(
        at_limit,
        "numDeltas(10) <= limit(100*10/100=10) must be a delta"
    );

    // One over the limit → full BIN (FAILS on main: 11/100=11% <= 25%).
    let over_limit = bin_with_dirty(100, 11).should_log_delta(10);
    assert!(
        !over_limit,
        "numDeltas(11) > limit(10) must be a FULL BIN under percent=10"
    );

    // An identical BIN under the default percent=25 (limit 25) is a delta:
    // proves the percent is honoured, not hardcoded.
    let under_default = bin_with_dirty(100, 11).should_log_delta(25);
    assert!(
        under_default,
        "numDeltas(11) <= limit(25) must be a delta under percent=25"
    );

    // Integer (truncating) math, exactly as JE: 7 slots, percent=25 →
    // limit = 7*25/100 = 1. 1 dirty → delta, 2 dirty → full.
    assert!(bin_with_dirty(7, 1).should_log_delta(25));
    assert!(!bin_with_dirty(7, 2).should_log_delta(25));
}
12700
/// isBINDelta fast path: a BIN that is already in delta form re-logs as a
/// delta no matter how many slots are dirty
/// (JE: `if (isBINDelta()) return true;`).
#[test]
fn should_log_delta_bin_delta_fast_path() {
    // 90 of 100 slots dirty: far beyond the limit for percent=1.
    let mut already_delta = bin_with_dirty(100, 90);
    already_delta.is_delta = true;

    // The tiny percent would normally force a full BIN; the delta flag
    // must win.
    let decision = already_delta.should_log_delta(1);
    assert!(
        decision,
        "isBINDelta() must short-circuit to true regardless of percent"
    );
}
12714
/// numDeltas <= 0 guard: with no dirty slots the BIN is logged in full,
/// because an empty delta is invalid.
#[test]
fn should_log_delta_zero_dirty_is_full() {
    let clean = bin_with_dirty(100, 0);
    assert!(!clean.should_log_delta(25));
}
12721
/// isDeltaProhibited bound: a BIN that was never logged in full
/// (lastFullLsn == NULL) and a BIN with `prohibit_next_delta` set must both
/// be logged as a full BIN, even when the dirty count alone would allow a
/// delta.
#[test]
fn should_log_delta_prohibited_forces_full() {
    // Case 1: no prior full BIN. 5/100 dirty would otherwise be a delta.
    {
        let mut never_logged = bin_with_dirty(100, 5);
        never_logged.last_full_lsn = NULL_LSN;
        assert!(
            !never_logged.should_log_delta(25),
            "lastFullLsn==NULL must force a full BIN"
        );
    }

    // Case 2: prohibit_next_delta set (e.g. a dirty slot was removed by
    // compress).
    {
        let mut prohibited = bin_with_dirty(100, 5);
        prohibited.prohibit_next_delta = true;
        assert!(
            !prohibited.should_log_delta(25),
            "prohibit_next_delta must force a full BIN"
        );
    }
}
12742
/// Logging a full BIN clears the prohibit flag
/// (JE IN.afterLog: setProhibitNextDelta(false)), so the NEXT log may be a
/// delta again — this is the periodic-full chain bound.
#[test]
fn full_log_clears_prohibit_next_delta() {
    let mut bin = bin_with_dirty(100, 5);
    bin.prohibit_next_delta = true;
    let while_prohibited = bin.should_log_delta(25);
    assert!(!while_prohibited, "prohibited → full");

    bin.clear_dirty_after_full_log(Lsn::new(2, 5));
    assert!(
        !bin.prohibit_next_delta,
        "full log must clear prohibit_next_delta"
    );

    // Re-dirty a few slots; now a delta is allowed again.
    bin.entries
        .iter_mut()
        .take(5)
        .for_each(|slot| slot.dirty = true);
    let after_full = bin.should_log_delta(25);
    assert!(
        after_full,
        "after a full log, a small delta is allowed again"
    );
}
12765
12766 // ========================================================================
12767 // Tests: Task #82 — 8 new Tree methods
12768 // ========================================================================
12769
12770 // --- is_root_resident ---
12771
/// A freshly constructed tree reports no resident root.
#[test]
fn test_is_root_resident_empty_tree() {
    let tree = Tree::new(1, 128);
    let resident = tree.is_root_resident();
    assert!(!resident, "empty tree has no resident root");
}
12777
/// After one insert the tree reports a resident root.
#[test]
fn test_is_root_resident_after_insert() {
    let tree = Tree::new(1, 128);
    tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
    let resident = tree.is_root_resident();
    assert!(resident, "root must be resident after insert");
}
12784
12785 // --- get_resident_root_in ---
12786
/// `get_resident_root_in` returns `None` on an empty tree.
#[test]
fn test_get_resident_root_in_empty() {
    let tree = Tree::new(1, 128);
    let root = tree.get_resident_root_in();
    assert!(root.is_none());
}
12792
/// With one entry inserted, `get_resident_root_in` hands back the very same
/// `Arc` that `get_root` does.
#[test]
fn test_get_resident_root_in_single_entry() {
    let tree = Tree::new(1, 128);
    tree.insert(b"hello".to_vec(), b"world".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let resident = tree.get_resident_root_in();
    assert!(resident.is_some(), "root must be Some after insert");

    let canonical = tree.get_root().unwrap();
    let resident = resident.unwrap();
    assert!(
        Arc::ptr_eq(&canonical, &resident),
        "get_resident_root_in must return the same Arc as get_root"
    );
}
12806
/// `get_resident_root_in` still returns a root after 20 inserts into a
/// tree with max_entries=4.
#[test]
fn test_get_resident_root_in_multi_entry() {
    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        let key = format!("rr{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });
    let root = tree.get_resident_root_in();
    assert!(root.is_some());
}
12816
12817 // --- get_parent_bin_for_child_ln ---
12818
/// `get_parent_bin_for_child_ln` returns `None` on an empty tree.
#[test]
fn test_get_parent_bin_for_child_ln_empty_tree() {
    let tree = Tree::new(1, 128);
    let parent_bin = tree.get_parent_bin_for_child_ln(b"key");
    assert!(parent_bin.is_none());
}
12824
/// For a key that is present, `get_parent_bin_for_child_ln` returns a node
/// that reports itself as a BIN.
#[test]
fn test_get_parent_bin_for_child_ln_single_entry() {
    let tree = Tree::new(1, 128);
    tree.insert(b"alpha".to_vec(), b"val".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let parent_bin = tree.get_parent_bin_for_child_ln(b"alpha");
    assert!(parent_bin.is_some(), "must return Some for a present key");
    let node = parent_bin.unwrap();
    assert!(node.read().is_bin(), "returned node must be a BIN");
}
12834
/// Every one of five inserted keys resolves to a parent node that reports
/// itself as a BIN.
#[test]
fn test_get_parent_bin_for_child_ln_multi_key() {
    const KEYS: [&[u8]; 5] = [b"aa", b"bb", b"cc", b"dd", b"ee"];

    let tree = Tree::new(1, 8);
    for k in KEYS {
        tree.insert(k.to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
    }

    for k in KEYS {
        let parent_bin = tree.get_parent_bin_for_child_ln(k);
        assert!(parent_bin.is_some(), "must return Some for {:?}", k);
        let node = parent_bin.unwrap();
        assert!(node.read().is_bin());
    }
}
12848
12849 // --- find_bin_for_insert ---
12850
/// `find_bin_for_insert` returns `None` on an empty tree.
#[test]
fn test_find_bin_for_insert_empty_tree() {
    let tree = Tree::new(1, 128);
    let target = tree.find_bin_for_insert(b"newkey");
    assert!(target.is_none());
}
12856
/// On a non-empty tree, `find_bin_for_insert` returns a BIN even for a key
/// that has not been inserted yet.
#[test]
fn test_find_bin_for_insert_returns_bin() {
    let tree = Tree::new(1, 128);
    tree.insert(b"existing".to_vec(), b"data".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let target = tree.find_bin_for_insert(b"newkey");
    assert!(target.is_some());
    let node = target.unwrap();
    assert!(node.read().is_bin());
}
12866
/// For an existing key, `find_bin_for_insert` and
/// `get_parent_bin_for_child_ln` must return the same BIN `Arc`.
#[test]
fn test_find_bin_for_insert_same_as_parent_bin() {
    let tree = Tree::new(1, 128);
    tree.insert(b"foo".to_vec(), b"bar".to_vec(), Lsn::new(1, 1)).unwrap();

    let via_parent_lookup = tree.get_parent_bin_for_child_ln(b"foo").unwrap();
    let via_insert_lookup = tree.find_bin_for_insert(b"foo").unwrap();
    let same_bin = Arc::ptr_eq(&via_parent_lookup, &via_insert_lookup);
    assert!(
        same_bin,
        "find_bin_for_insert must return the same BIN as get_parent_bin_for_child_ln"
    );
}
12878
12879 // --- search_splits_allowed ---
12880
/// `search_splits_allowed` returns `None` on an empty tree.
#[test]
fn test_search_splits_allowed_empty_tree() {
    let tree = Tree::new(1, 128);
    let outcome = tree.search_splits_allowed(b"k");
    assert!(outcome.is_none());
}
12886
/// Every one of ten keys inserted into a tree created with
/// `Tree::new(1, 8)` must be reported by `search_splits_allowed` with
/// `exact_parent_found` set.
#[test]
fn test_search_splits_allowed_finds_existing_key() {
    // Single source of truth for the key format used by both loops.
    let key_for = |i: u32| format!("sa{:04}", i).into_bytes();

    let tree = Tree::new(1, 8);
    for i in 0u32..10 {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    for i in 0u32..10 {
        // `is_some_and` folds the presence check and the flag check into
        // one step instead of `is_some() && unwrap()`.
        let found = tree
            .search_splits_allowed(&key_for(i))
            .is_some_and(|r| r.exact_parent_found);
        assert!(found, "search_splits_allowed must find sa{:04}", i);
    }
}
12904
/// Searching for a key that was never inserted must either return `None`
/// or a result whose `exact_parent_found` flag is clear.
#[test]
fn test_search_splits_allowed_missing_key() {
    let tree = Tree::new(1, 8);
    let seeded =
        tree.insert(b"present".to_vec(), b"v".to_vec(), Lsn::new(1, 1));
    seeded.unwrap();

    let not_found = tree
        .search_splits_allowed(b"absent")
        .is_none_or(|r| !r.exact_parent_found);
    assert!(not_found, "search_splits_allowed must not find absent key");
}
12916
12917 // --- rebuild_in_list ---
12918
/// `rebuild_in_list` on a freshly created tree returns an empty list.
#[test]
fn test_rebuild_in_list_empty_tree() {
    let empty = Tree::new(1, 128);
    let nodes = empty.rebuild_in_list();
    assert!(nodes.is_empty());
}
12924
/// After one insert, `rebuild_in_list` must report exactly two nodes: one
/// that is a BIN and one that is not (the root IN above it).
#[test]
fn test_rebuild_in_list_single_entry() {
    let tree = Tree::new(1, 128);
    tree.insert(b"one".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();

    let nodes = tree.rebuild_in_list();
    // Expect root IN + BIN = 2 nodes.
    assert_eq!(
        nodes.len(),
        2,
        "single-entry tree must have exactly 2 nodes"
    );

    // Classify every node once; "some BIN" and "some non-BIN" then fall out
    // of the BIN count relative to the list length.
    let bin_count = nodes.iter().filter(|n| n.read().is_bin()).count();
    let has_bin = bin_count > 0;
    let has_in = bin_count < nodes.len();
    assert!(has_bin, "list must contain at least one BIN");
    assert!(has_in, "list must contain at least one upper IN");
}
12941
/// With twenty keys in a tree created with `Tree::new(1, 4)`, the list
/// returned by `rebuild_in_list` must be as long as the IN + BIN totals
/// reported by `collect_stats`.
#[test]
fn test_rebuild_in_list_multi_entry() {
    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        let key = format!("ri{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    let rebuilt = tree.rebuild_in_list();
    let stats = tree.collect_stats();
    let expected_nodes = (stats.n_ins + stats.n_bins) as usize;
    assert_eq!(
        rebuilt.len(),
        expected_nodes,
        "rebuild_in_list must return all {} nodes",
        expected_nodes
    );
}
12959
12960 // --- validate_in_list ---
12961
/// A freshly created tree passes `validate_in_list`.
#[test]
fn test_validate_in_list_empty_tree() {
    let empty = Tree::new(1, 128);
    let valid = empty.validate_in_list();
    assert!(valid, "empty tree must be valid");
}
12967
/// A tree holding a single entry passes `validate_in_list`.
#[test]
fn test_validate_in_list_single_entry() {
    let tree = Tree::new(1, 128);
    let seeded = tree.insert(b"v".to_vec(), b"data".to_vec(), Lsn::new(1, 1));
    seeded.unwrap();

    let valid = tree.validate_in_list();
    assert!(valid, "single-entry tree must be valid");
}
12974
/// A tree created with `Tree::new(1, 4)` and filled with twenty keys passes
/// `validate_in_list`.
#[test]
fn test_validate_in_list_multi_entry() {
    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        let key = format!("vl{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    let valid = tree.validate_in_list();
    assert!(valid, "multi-entry tree must be valid");
}
12984
/// A hand-installed root IN with zero entries must be rejected by
/// `validate_in_list`.
#[test]
fn test_validate_in_list_empty_in_fails() {
    // Manually build a root IN that has no entries — structurally invalid.
    let empty_root = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![],
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };

    let tree = Tree::new(1, 128);
    // Bypass the insert path and plant the bad root directly.
    *tree.root.write() =
        Some(Arc::new(RwLock::new(TreeNode::Internal(empty_root))));

    let valid = tree.validate_in_list();
    assert!(
        !valid,
        "a tree with an empty Internal node must fail validation"
    );
}
13005
13006 // --- get_parent_in_for_child_in ---
13007
/// `get_parent_in_for_child_in` on a freshly created tree yields `None`.
#[test]
fn test_get_parent_in_for_child_in_empty_tree() {
    let empty = Tree::new(1, 128);
    let parent = empty.get_parent_in_for_child_in(999);
    assert!(parent.is_none());
}
13013
/// In a single-insert tree (root IN → BIN) the root must be reported as the
/// parent of the BIN, at slot 0.
#[test]
fn test_get_parent_in_for_child_in_single_entry() {
    let tree = Tree::new(1, 128);
    tree.insert(b"p".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();

    let root_arc = Arc::clone(tree.get_root().as_ref().unwrap());

    // Read the BIN's node id out of the root's first child. Both guards are
    // scoped to this block so they are released before the lookup below.
    let bin_node_id = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root");
        };
        let child = root_in.child_ref(0).unwrap();
        let child_guard = child.read();
        let TreeNode::Bottom(bin) = &*child_guard else {
            panic!("expected BIN");
        };
        bin.node_id
    };

    let lookup = tree.get_parent_in_for_child_in(bin_node_id);
    assert!(lookup.is_some(), "must find parent of BIN");
    let (parent_arc, slot) = lookup.unwrap();
    assert!(Arc::ptr_eq(&parent_arc, &root_arc));
    assert_eq!(slot, 0);
}
13043
/// Looking up the parent of node id `u64::MAX` in a one-entry tree yields
/// `None`.
#[test]
fn test_get_parent_in_for_child_in_not_found() {
    let tree = Tree::new(1, 128);
    let seeded = tree.insert(b"x".to_vec(), b"y".to_vec(), Lsn::new(1, 1));
    seeded.unwrap();

    let parent = tree.get_parent_in_for_child_in(u64::MAX);
    assert!(parent.is_none());
}
13050
/// In a tree built from twenty keys with `Tree::new(1, 4)`, every BIN
/// listed by `rebuild_in_list` must have a parent, and that parent must not
/// itself be a BIN.
#[test]
fn test_get_parent_in_for_child_in_multi_level() {
    // Small fanout plus twenty keys is intended to give at least three
    // levels, so the lookup has to descend through more than one IN.
    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        let key = format!("ml{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    // Gather the node id of every BIN in the tree.
    let nodes = tree.rebuild_in_list();
    let mut bin_ids: Vec<u64> = Vec::new();
    for node in nodes.iter() {
        let guard = node.read();
        match &*guard {
            TreeNode::Bottom(bin) if guard.is_bin() => {
                bin_ids.push(bin.node_id);
            }
            _ => {}
        }
    }

    for bin_id in bin_ids {
        let lookup = tree.get_parent_in_for_child_in(bin_id);
        assert!(
            lookup.is_some(),
            "every BIN (id={}) must have a parent IN",
            bin_id
        );
        let (parent_arc, _slot) = lookup.unwrap();
        let parent_is_bin = parent_arc.read().is_bin();
        assert!(
            !parent_is_bin,
            "parent of a BIN must be an Internal node"
        );
    }
}
13089
/// H-9 regression: `BinStub::strip_lns` actually drops the slot data
/// (not just stats accounting).
///
/// Three scenarios are checked in sequence on one hand-built BIN:
/// 1. slots with valid LSNs are stripped whether or not they are dirty;
/// 2. a slot whose LSN is `NULL_LSN` keeps its data;
/// 3. nothing is stripped while `cursor_count` is non-zero.
#[test]
fn test_h9_strip_lns_actually_frees_data() {
    use crate::tree::{BinEntry, BinStub};
    use noxu_util::lsn::Lsn;

    let mut bin = BinStub {
        node_id: 1,
        level: 1,
        entries: Vec::new(),
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: Lsn::from_u64(0),
        last_delta_lsn: Lsn::from_u64(0),
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Three slots with embedded data, described as (payload length, dirty).
    // JE-faithful: a slot with a valid LSN is strippable regardless of the
    // dirty bit (its value is recoverable from the log); only a NULL-LSN
    // (never-logged / deferred-write) slot is preserved. The last slot is
    // dirty BUT logged -> still strippable (EVICTOR-RECLAIM-1).
    let slot_specs: [(usize, bool); 3] = [(64, false), (32, false), (16, true)];
    for (payload_len, dirty) in slot_specs {
        bin.entries.push(BinEntry {
            data: Some(vec![0u8; payload_len]),
            known_deleted: false,
            dirty,
            expiration_time: 0,
        });
    }

    // T-2: keep the key rep aligned with the pushed slots.
    let slot_keys: Vec<Vec<u8>> =
        vec![b"a".to_vec(), b"b".to_vec(), b"c".to_vec()];
    bin.keys = KeyRep::from_keys(slot_keys);

    // Give all three slots VALID (non-NULL) LSNs so they are recoverable
    // from the log and therefore strippable.
    for (slot, offset) in [100u32, 200, 300].into_iter().enumerate() {
        bin.set_lsn(slot, Lsn::new(1, offset));
    }

    // Scenario 1: every logged slot is stripped.
    let freed = bin.strip_lns();
    assert_eq!(
        freed,
        64 + 32 + 16,
        "all logged slots stripped regardless of dirty (JE evictLNs)"
    );
    assert!(bin.entries[0].data.is_none(), "logged slot data dropped");
    assert!(bin.entries[1].data.is_none(), "logged slot data dropped");
    assert!(
        bin.entries[2].data.is_none(),
        "dirty-but-logged slot data dropped (recoverable from log)"
    );

    // Scenario 2: a NULL-LSN slot (never logged) must be preserved — its
    // only copy is the in-memory value.
    bin.entries[0].data = Some(vec![0u8; 64]);
    bin.set_lsn(0, noxu_util::NULL_LSN);
    let freed_null = bin.strip_lns();
    assert_eq!(
        freed_null, 0,
        "NULL-LSN (unlogged) slot must NOT be stripped"
    );
    assert!(bin.entries[0].data.is_some(), "unlogged slot data preserved");

    // Scenario 3: a cursor pin prevents stripping even once the slot has a
    // valid LSN again.
    bin.set_lsn(0, Lsn::new(1, 100));
    bin.cursor_count = 1;
    let freed_with_cursor = bin.strip_lns();
    assert_eq!(
        freed_with_cursor, 0,
        "strip_lns must skip when cursor pinned"
    );
    assert!(
        bin.entries[0].data.is_some(),
        "data preserved while cursor pinned"
    );
}
13185
// St-H4: `upper_in_floor_index` must agree with a straightforward linear
// floor scan for every probe key — before all slot keys, after all of them,
// between two of them, and exactly on one.
#[test]
fn test_upper_in_floor_index_matches_linear_scan() {
    /// Reference floor scan (the pre-St-H4 algorithm): slot 0 stands for
    /// the virtual −∞ key and always matches; from slot 1 onward, advance
    /// while the slot key is ≤ `key` and stop at the first larger one.
    fn linear_floor(entries: &[InEntry], key: &[u8]) -> usize {
        entries
            .iter()
            .enumerate()
            .skip(1)
            .take_while(|(_, entry)| entry.key.as_slice() <= key)
            .last()
            .map_or(0, |(i, _)| i)
    }

    let tree = Tree::new(1, 256);

    // Try IN slot sets of every size from 1 to 39.
    for n_slots in 1usize..40 {
        // Slot 0 = virtual −∞ (empty key sorts first). The remaining slots
        // hold strictly ascending big-endian two-byte keys spaced 4 apart,
        // so probes can land on, between, before, and after them.
        let mut entries = vec![InEntry { key: vec![] }];
        entries.extend((1..n_slots).map(|i| InEntry {
            key: ((i as u16) * 4).to_be_bytes().to_vec(),
        }));

        let last_probe = n_slots as u16 * 4 + 4;
        for probe in 0u16..=last_probe {
            let key = probe.to_be_bytes().to_vec();
            let actual = tree.upper_in_floor_index(&entries, &key);
            let expected = linear_floor(&entries, &key);
            assert_eq!(
                actual, expected,
                "floor mismatch: n_slots={n_slots}, key={key:?}"
            );
        }
    }
}
13231}
13232
13233// ─────────────────────────────────────────────────────────────────────────
13234// St-H6: BIN split inherits expiration_in_hours from the splitting BIN.
13235// ─────────────────────────────────────────────────────────────────────────
13236
/// Unit test for the St-H6 fix: the right-half sibling created by
/// `split_child` inherits `expiration_in_hours` from the splitting BIN.
///
/// Before the fix, the sibling was always created with
/// `expiration_in_hours = false`, causing hours-granularity TTL entries
/// (small hours-since-epoch values) to be interpreted as seconds and
/// treated as expired.
///
/// This test:
/// 1. Hand-builds a root IN over one BIN holding 4 entries (bypassing
///    `update_key_expiration`) with a non-zero `expiration_time` and
///    `expiration_in_hours = true` on the BIN.
/// 2. Triggers a split.
/// 3. Asserts that the right-half sibling has `expiration_in_hours = true`
///    (inherited, not hardcoded false).
///
/// The expiration value is derived from the current wall clock
/// (now + 1000 h) instead of a fixed hours-since-epoch literal, so the
/// "not expired" assertion does not start failing once real time passes
/// a hard-coded date.
#[test]
fn test_split_child_sibling_inherits_expiration_in_hours() {
    use crate::tree::{BIN_LEVEL, BinEntry, BinStub, MAIN_LEVEL, TreeNode};
    use noxu_util::{Lsn, NULL_LSN};
    use parking_lot::RwLock;
    use std::sync::Arc;

    // Hours-since-epoch value 1000 h in the future.
    // NOTE(review): assumes `noxu_util::ttl::is_expired` compares against
    // the system wall clock — confirm.
    let now_hours = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("system clock must not be before the Unix epoch")
        .as_secs()
        / 3600;
    let exp_hours = u32::try_from(now_hours + 1_000)
        .expect("hours-since-epoch must fit in u32");

    // Manually build a tree with one BIN (4 entries, expiration_in_hours=true).
    let tree = Tree::new(99, 4);

    // Pre-populate the tree root for the test.
    let entries: Vec<BinEntry> = (0u8..4u8)
        .map(|k| BinEntry {
            data: Some(vec![k, k]),
            known_deleted: false,
            dirty: true,
            expiration_time: exp_hours,
        })
        .collect();
    let bin_keys: Vec<Vec<u8>> = (0u8..4u8).map(|k| vec![k]).collect();
    let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true, // hours-granularity entries
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(bin_keys),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let root = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![], // virtual key for slot 0 (-infinity)
        }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&bin))]),
        dirty: true,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    {
        // Wire the BIN's back-pointer to the root; the guard is dropped at
        // the end of this scope, before the split runs.
        let mut b = bin.write();
        b.set_parent(Some(Arc::downgrade(&root)));
    }
    *tree.root.write() = Some(Arc::clone(&root));

    // Trigger split_child on the root.
    Tree::split_child(
        &root,
        0,
        4,
        Lsn::new(1, 500),
        SplitHint::Normal,
        &[],
        None,
        false,
        None,
    )
    .expect("split_child should succeed");

    // After the split: root has two children — left BIN and right sibling.
    let root_guard = root.read();
    let TreeNode::Internal(ref in_node) = *root_guard else {
        panic!("root should be Internal after split");
    };
    assert_eq!(
        in_node.entries.len(),
        2,
        "root should have 2 entries (children) after split"
    );

    // Right-half sibling is at slot 1.
    let sibling_arc = in_node
        .get_child(1)
        .expect("right-half sibling should exist at slot 1");
    let sibling_guard = sibling_arc.read();
    let TreeNode::Bottom(ref sibling) = *sibling_guard else {
        panic!("right sibling should be a BIN");
    };

    assert!(
        sibling.expiration_in_hours,
        "St-H6: right-half sibling expiration_in_hours must be true \
         (inherited from splitting BIN); got false"
    );

    // Verify the sibling's entries have the expected expiration_time.
    for e in &sibling.entries {
        assert_eq!(
            e.expiration_time, exp_hours,
            "sibling entry expiration_time should be preserved: got {}",
            e.expiration_time
        );
        // With in_hours=true, is_expired should return false (future).
        assert!(
            !noxu_util::ttl::is_expired(
                e.expiration_time,
                sibling.expiration_in_hours
            ),
            "St-H6: sibling TTL entry ({}) should NOT appear expired \
             with expiration_in_hours={}",
            e.expiration_time,
            sibling.expiration_in_hours
        );
    }
}
13369
/// Regression confirmation: `is_expired` with wrong `in_hours = false`
/// would falsely expire hours-granularity values (hours since epoch).
///
/// The probe value is computed as "current hour + 1000 h" rather than a
/// fixed literal, so the `in_hours = true` assertion stays valid no matter
/// when the test runs.
#[test]
fn test_hours_value_is_expired_only_with_false_flag() {
    // Hours-since-epoch value for now + 1 000 h TTL.
    // NOTE(review): assumes `noxu_util::ttl::is_expired` compares against
    // the system wall clock — confirm.
    let now_hours = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("system clock must not be before the Unix epoch")
        .as_secs()
        / 3600;
    let exp_hours = u32::try_from(now_hours + 1_000)
        .expect("hours-since-epoch must fit in u32");

    // Correctly treated as hours: not expired.
    assert!(
        !noxu_util::ttl::is_expired(exp_hours, true),
        "exp_hours={exp_hours} should NOT be expired when in_hours=true"
    );
    // Incorrectly treated as seconds (pre-fix right sibling): the same
    // number read as seconds-since-epoch lies far in the past, so expired.
    assert!(
        noxu_util::ttl::is_expired(exp_hours, false),
        "exp_hours={exp_hours} should be expired when in_hours=false \
         (St-H6 demonstrates the wrong-flag scenario)"
    );
}
13388
13389// =============================================================================
13390// IN-redo unit tests (DRIFT-1 / Stage 1)
13391// =============================================================================
13392
#[cfg(test)]
mod in_redo_tests {
    use super::*;

    /// Build a `BinStub` with the given `node_id` and `n` entries, and
    /// return its `serialize_full` bytes.
    ///
    /// Entry `i` is inserted with key `[i as u8]`, LSN `(1, i + 1)` and
    /// `Some([i as u8])` as the third `insert_with_prefix` argument
    /// (presumably the slot data — confirm against `insert_with_prefix`).
    fn make_bin_bytes(node_id: u64, n: usize) -> Vec<u8> {
        let mut bin = BinStub {
            node_id,
            level: BIN_LEVEL,
            entries: Vec::new(),
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: noxu_util::NULL_LSN,
            last_delta_lsn: noxu_util::NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
            lsn_rep: LsnRep::Empty,
            keys: KeyRep::new(),
            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        };
        for i in 0..n {
            // T-2/T-3: route through insert so entries/keys/lsn_rep stay
            // aligned; the serialized bytes are identical.
            bin.insert_with_prefix(
                vec![i as u8],
                Lsn::new(1, (i + 1) as u32),
                Some(vec![i as u8]),
            );
        }
        bin.serialize_full()
    }

    /// Verify that recover_in_redo inserts a BIN as root when the tree is empty.
    ///
    /// JE RecoveryManager.recoverRootIN: `root == null` path.
    #[test]
    fn test_recover_in_redo_root_bin_inserted_into_empty_tree() {
        let tree = Tree::new(42, 128);
        assert!(tree.is_empty());
        let bytes = make_bin_bytes(1, 3);
        let log_lsn = Lsn::new(1, 100);
        let result = tree.recover_in_redo(
            log_lsn, /*is_root=*/ true, /*is_bin=*/ true, &bytes,
        );
        assert_eq!(result, InRedoResult::Inserted, "expected Inserted");
        // Tree should now have 3 entries.
        assert_eq!(tree.count_entries(), 3);
    }

    /// Verify that recover_in_redo replaces a root BIN when the logged version is newer.
    ///
    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) < 0` path.
    #[test]
    fn test_recover_in_redo_root_bin_replaced_when_log_newer() {
        let tree = Tree::new(42, 128);
        // Install an old root (2 entries, older LSN). Check the setup step
        // too, so a failure here is not misreported as a replace failure.
        let old_bytes = make_bin_bytes(1, 2);
        let old_lsn = Lsn::new(1, 50);
        let installed = tree.recover_in_redo(old_lsn, true, true, &old_bytes);
        assert_eq!(
            installed,
            InRedoResult::Inserted,
            "setup: old root must be installed into the empty tree"
        );
        assert_eq!(tree.count_entries(), 2);
        // Replay with newer LSN and 4 entries.
        let new_bytes = make_bin_bytes(1, 4);
        let new_lsn = Lsn::new(1, 100);
        let result = tree.recover_in_redo(new_lsn, true, true, &new_bytes);
        assert_eq!(result, InRedoResult::Replaced);
        assert_eq!(tree.count_entries(), 4);
    }

    /// Verify that an older logged BIN does NOT replace a newer in-memory root.
    ///
    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) >= 0` skip path.
    #[test]
    fn test_recover_in_redo_root_bin_skipped_when_tree_newer() {
        let tree = Tree::new(42, 128);
        // Install a newer root, verifying the setup step succeeded.
        let new_bytes = make_bin_bytes(1, 4);
        let new_lsn = Lsn::new(1, 200);
        let installed = tree.recover_in_redo(new_lsn, true, true, &new_bytes);
        assert_eq!(
            installed,
            InRedoResult::Inserted,
            "setup: newer root must be installed into the empty tree"
        );
        // Attempt to replay an older version.
        let old_bytes = make_bin_bytes(1, 2);
        let old_lsn = Lsn::new(1, 100);
        let result = tree.recover_in_redo(old_lsn, true, true, &old_bytes);
        assert_eq!(result, InRedoResult::Skipped);
        // Tree still holds the newer 4-entry version.
        assert_eq!(tree.count_entries(), 4);
    }

    /// deserialize_bin round-trips through serialize_full.
    #[test]
    fn test_deserialize_bin_round_trip() {
        let bytes = make_bin_bytes(99, 5);
        let bin = Tree::deserialize_bin(&bytes).expect("must deserialize");
        assert_eq!(bin.node_id, 99);
        assert_eq!(bin.entries.len(), 5);
        // Slot `i` was inserted with key `[i]`, so the full key must match.
        for i in 0..bin.entries.len() {
            assert_eq!(bin.get_full_key(i).unwrap(), vec![i as u8]);
        }
    }

    /// deserialize_upper_in round-trips through write_to_bytes (Internal).
    #[test]
    fn test_deserialize_upper_in_round_trip() {
        // Build an InNodeStub and serialize via write_to_bytes.
        let node = TreeNode::Internal(InNodeStub {
            node_id: 77,
            level: 0x10002,
            entries: vec![
                InEntry { key: vec![1, 2, 3] },
                InEntry { key: vec![4, 5, 6] },
            ],
            targets: TargetRep::None,
            dirty: false,
            generation: 0,
            parent: None,
            lsn_rep: LsnRep::Empty,
        });
        let bytes = node.write_to_bytes();
        let restored =
            Tree::deserialize_upper_in(&bytes).expect("must deserialize");
        assert_eq!(restored.node_id, 77);
        assert_eq!(restored.level, 0x10002);
        assert_eq!(restored.entries.len(), 2);
        assert_eq!(restored.entries[0].key, vec![1, 2, 3]);
        assert_eq!(restored.entries[1].key, vec![4, 5, 6]);
    }
}
13524
13525// --- Part 2 acceptance tests: key_prefixing flag (DRIFT-3) ---
13526//
13527// JE `IN.computeKeyPrefix` returns null when `databaseImpl.getKeyPrefixing()`
13528// is false, so no prefix compression is ever applied to those BINs. Noxu was
13529// always applying prefix compression. This checks that the flag is honoured.
13530//
13531// Ref: `IN.java computeKeyPrefix` ~line 2456,
13532// `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
#[cfg(test)]
mod key_prefixing_tests {
    use super::*;

    /// Helper: walk slot 0 downward from `node` until a BIN is reached and
    /// return that (leftmost) BIN.
    fn find_first_bin(node: &Arc<RwLock<TreeNode>>) -> Arc<RwLock<TreeNode>> {
        let mut current = Arc::clone(node);
        loop {
            // The read guard lives only inside this block, so it is released
            // before `current` is reassigned.
            let next = {
                let guard = current.read();
                match &*guard {
                    TreeNode::Internal(inner) => {
                        Some(Arc::clone(inner.child_ref(0).expect("child")))
                    }
                    TreeNode::Bottom(_) => None,
                }
            };
            match next {
                Some(child) => current = child,
                None => return current,
            }
        }
    }

    /// Helper: the 8-byte test key `"record:"` followed by `suffix`.
    fn record_key(suffix: u8) -> Vec<u8> {
        let mut key = b"record:".to_vec();
        key.push(suffix);
        key
    }

    /// Helper: insert the eight keys `record:\x00` .. `record:\x07`, which
    /// all share a long common prefix.
    fn insert_record_keys(tree: &Tree) {
        let lsn = noxu_util::Lsn::new(1, 10);
        for i in 0u8..8 {
            tree.insert(record_key(i), vec![i], lsn).expect("insert");
        }
    }

    /// Helper: run `inspect` against the leftmost BIN of `tree` while its
    /// read latch is held.
    fn with_first_bin<R>(
        tree: &Tree,
        inspect: impl FnOnce(&BinStub) -> R,
    ) -> R {
        let root = tree.get_root().expect("root");
        let bin_arc = find_first_bin(&root);
        let guard = bin_arc.read();
        let TreeNode::Bottom(ref bin) = *guard else {
            panic!("must be a BIN");
        };
        inspect(bin)
    }

    /// With `key_prefixing = false` (the default), keys must be stored without
    /// any prefix: the BIN's `key_prefix` must remain empty after inserts.
    #[test]
    fn test_key_prefixing_false_stores_full_keys() {
        // Default is key_prefixing = false.
        let tree = Tree::new(1, 16);
        assert!(!tree.key_prefixing, "default must be false");

        insert_record_keys(&tree);

        with_first_bin(&tree, |bin| {
            assert!(
                bin.key_prefix.is_empty(),
                "key_prefix must be empty when key_prefixing=false, got {:?}",
                bin.key_prefix
            );
            assert_eq!(bin.entries.len(), 8);
            // Keys must be stored as full keys.
            assert_eq!(bin.get_full_key(0).unwrap(), record_key(0));
        });
    }

    /// With `key_prefixing = true`, keys with a common prefix are compressed:
    /// the BIN's `key_prefix` must be non-empty.
    #[test]
    fn test_key_prefixing_true_compresses_keys() {
        let mut tree = Tree::new(1, 16);
        tree.set_key_prefixing(true);

        insert_record_keys(&tree);

        with_first_bin(&tree, |bin| {
            // Prefix compression must kick in: all keys share "record:".
            assert!(
                !bin.key_prefix.is_empty(),
                "key_prefix must be non-empty when key_prefixing=true"
            );
            assert_eq!(
                bin.key_prefix,
                b"record:".to_vec(),
                "prefix must be the common prefix of all inserted keys"
            );
        });
    }

    /// Custom-comparator databases (sorted-dup) always bypass prefix
    /// regardless of key_prefixing: `insert_cmp` does not touch key_prefix.
    #[test]
    fn test_key_prefixing_custom_comparator_no_prefix() {
        let cmp: KeyComparatorFn = Arc::new(|a: &[u8], b: &[u8]| a.cmp(b));
        let mut tree = Tree::new_with_comparator(1, 16, cmp);
        // Enable key_prefixing — should have no effect via insert_cmp path.
        tree.set_key_prefixing(true);

        insert_record_keys(&tree);

        with_first_bin(&tree, |bin| {
            // Custom-comparator path (insert_cmp) does not set key_prefix.
            assert!(
                bin.key_prefix.is_empty(),
                "custom-comparator path must not set key_prefix"
            );
        });
    }
}
13647
13648// --- Part 1 acceptance tests: splitSpecial heuristic (DRIFT-1) ---
13649//
13650// JE `IN.splitSpecial` / `Tree.forceSplit`: when all routing decisions during
13651// descent are leftmost (`AllLeft`) or rightmost (`AllRight`), the split index
13652// is forced to 1 or `n-1` respectively instead of `n/2`. This halves the
13653// number of splits for monotonically increasing / decreasing key workloads
13654// (sequential append / prepend) because each split leaves the BIN near-full.
13655//
13656// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
13657#[cfg(test)]
13658mod split_special_tests {
13659 use super::*;
13660
13661 /// Test helper: descend the tree to the BIN that holds (or would hold)
13662 /// `key`, returning its arc. Mirrors the read-path descent used by
13663 /// `Tree::search`; sufficient for unit tests that need to mutate a slot.
13664 fn find_bin_arc_for_key(
13665 node_arc: &Arc<RwLock<TreeNode>>,
13666 key: &[u8],
13667 ) -> Option<Arc<RwLock<TreeNode>>> {
13668 let mut current = node_arc.clone();
13669 loop {
13670 let next = {
13671 let g = current.read();
13672 match &*g {
13673 TreeNode::Bottom(_) => return Some(current.clone()),
13674 TreeNode::Internal(n) => {
13675 if n.entries.is_empty() {
13676 return None;
13677 }
13678 let mut idx = 0usize;
13679 for (i, e) in n.entries.iter().enumerate() {
13680 if i == 0 || e.key.as_slice() <= key {
13681 idx = i;
13682 } else {
13683 break;
13684 }
13685 }
13686 n.get_child(idx)?
13687 }
13688 }
13689 };
13690 current = next;
13691 }
13692 }
13693
13694 /// Count total leaf (BIN) nodes in the tree by DFS.
13695 fn count_bins(node: &Arc<RwLock<TreeNode>>) -> usize {
13696 let g = node.read();
13697 match &*g {
13698 TreeNode::Bottom(_) => 1,
13699 TreeNode::Internal(n) => {
13700 n.resident_children().iter().map(count_bins).sum()
13701 }
13702 }
13703 }
13704
13705 /// Return total key count across all BINs.
13706 fn count_keys(node: &Arc<RwLock<TreeNode>>) -> usize {
13707 let g = node.read();
13708 match &*g {
13709 TreeNode::Bottom(b) => b.entries.len(),
13710 TreeNode::Internal(n) => {
13711 n.resident_children().iter().map(count_keys).sum()
13712 }
13713 }
13714 }
13715
13716 /// Returns the number of entries in the leftmost BIN.
13717 fn leftmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
13718 let g = node.read();
13719 match &*g {
13720 TreeNode::Bottom(b) => b.entries.len(),
13721 TreeNode::Internal(n) => {
13722 let first_child = n.child_ref(0).expect("child");
13723 leftmost_bin_size(first_child)
13724 }
13725 }
13726 }
13727
13728 /// Returns the number of entries in the rightmost BIN.
13729 fn rightmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
13730 let g = node.read();
13731 match &*g {
13732 TreeNode::Bottom(b) => b.entries.len(),
13733 TreeNode::Internal(n) => {
13734 let last_child = n
13735 .child_ref(n.entries.len().saturating_sub(1))
13736 .expect("child");
13737 rightmost_bin_size(last_child)
13738 }
13739 }
13740 }
13741
/// `splitSpecial`, ascending inserts: every right-edge split is expected to
/// leave the left BIN near-full, so the tree should need fewer BINs (and
/// hold more keys per BIN on average) than pure midpoint splitting would.
///
/// JE criterion: `allRightSideDescent` → `splitIndex = nEntries - 1`.
/// Only one entry moves to the new right sibling, which then absorbs the
/// following inserts and fills normally.
#[test]
fn test_split_special_ascending_fewer_bins_than_midpoint() {
    let max_entries = 8usize;
    let n_keys = 200usize;

    // Ascending big-endian keys are meant to trigger AllRight on descent.
    let tree = Tree::new(1, max_entries);
    let lsn = noxu_util::Lsn::new(1, 100);
    (0u32..n_keys as u32)
        .map(|i| i.to_be_bytes().to_vec())
        .for_each(|key| {
            tree.insert(key, vec![0u8], lsn).expect("insert");
        });

    let root = tree.get_root().expect("root must exist");
    let bins_special = count_bins(&root);
    let stored_keys = count_keys(&root);

    // All keys must be present.
    assert_eq!(stored_keys, n_keys, "all keys must be stored");

    // Midpoint splitting leaves about max_entries / 2 keys per BIN, giving
    // an upper bound of ceil(n_keys / (max_entries / 2)) BINs. The actual
    // BIN count must come in below that bound.
    let half_fanout = max_entries / 2;
    let midpoint_upper_bound = n_keys.div_ceil(half_fanout);
    assert!(
        bins_special < midpoint_upper_bound,
        "splitSpecial should produce fewer BINs than midpoint split: \
         got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
    );

    // The rightmost BIN may be only partly filled, so individual BIN sizes
    // are not checked; only overall key density against the midpoint
    // baseline is verified.
    let avg_fill = stored_keys as f64 / bins_special as f64;
    let midpoint_fill = half_fanout as f64;
    assert!(
        avg_fill > midpoint_fill,
        "average fill per BIN with splitSpecial ({avg_fill:.1}) should \
         exceed midpoint baseline ({midpoint_fill})"
    );
}
13795
/// `splitSpecial`, descending inserts: every routing decision lands on
/// slot 0 (`AllLeft`), so the split is forced to `split_index = 1` — the
/// left node keeps a single entry and the right sibling takes the rest.
///
/// JE criterion: `allLeftSideDescent` → `splitIndex = 1`.
#[test]
fn test_split_special_descending_fewer_bins_than_midpoint() {
    let max_entries = 8usize;
    let n_keys = 200usize;

    // Insert big-endian keys from the highest down to 0.
    let tree = Tree::new(1, max_entries);
    let lsn = noxu_util::Lsn::new(1, 100);
    (0u32..n_keys as u32)
        .rev()
        .map(|i| i.to_be_bytes().to_vec())
        .for_each(|key| {
            tree.insert(key, vec![0u8], lsn).expect("insert");
        });

    let root = tree.get_root().expect("root must exist");
    let bins_special = count_bins(&root);
    let stored_keys = count_keys(&root);

    assert_eq!(stored_keys, n_keys, "all keys must be stored");

    // Same bound as the ascending case: midpoint splitting would need up to
    // ceil(n_keys / (max_entries / 2)) BINs.
    let half_fanout = max_entries / 2;
    let midpoint_upper_bound = n_keys.div_ceil(half_fanout);
    assert!(
        bins_special < midpoint_upper_bound,
        "splitSpecial descending should produce fewer BINs: \
         got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
    );
}
13826
/// Random-order inserts must NOT be disturbed by splitSpecial: a random
/// descent is rarely all-left or all-right, so the split index falls back
/// to the midpoint.
///
/// What this test actually asserts is correctness under a shuffled insert
/// order: the stored key count matches the number inserted, and every
/// inserted key is an exact match on lookup.
#[test]
fn test_split_special_random_inserts_stay_balanced() {
    use std::collections::BTreeSet;

    const MAX_ENTRIES: usize = 8;

    // Fixed-seed linear congruential generator: the permutation is the same
    // on every run, keeping the test deterministic.
    let mut state: u64 = 0xdeadbeef_cafebabe;
    let mut next_below = |bound: usize| -> usize {
        state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
        (state >> 33) as usize % bound
    };

    // Knuth (Fisher–Yates) shuffle of 0..200, walking from the back.
    let mut keys: Vec<u32> = (0u32..200).collect();
    for i in (1..keys.len()).rev() {
        let j = next_below(i + 1);
        keys.swap(i, j);
    }

    let tree = Tree::new(1, MAX_ENTRIES);
    let lsn = noxu_util::Lsn::new(1, 100);
    let mut inserted = BTreeSet::new();
    for &k in &keys {
        tree.insert(k.to_be_bytes().to_vec(), vec![0u8], lsn)
            .expect("insert");
        inserted.insert(k);
    }

    let root = tree.get_root().expect("root");
    assert_eq!(
        count_keys(&root),
        inserted.len(),
        "all random keys must be stored"
    );

    // Every inserted key must come back as an exact match.
    for k in &inserted {
        let key = k.to_be_bytes().to_vec();
        let is_hit = tree
            .search(&key)
            .map(|r| r.is_exact_match())
            .unwrap_or(false);
        assert!(is_hit, "random key {k} must be findable after insert");
    }
}
13872
/// TREE-F1: a BIN slot flagged `known_deleted` must look ABSENT to an exact
/// lookup and must be SKIPPED by scans, matching JE.
///
/// JE contract:
/// * `IN.findEntry` (IN.java:3197): an exact match landing on a
///   known-deleted slot returns -1 (ABSENT).
/// * `CursorImpl.lockAndGetCurrent` (CursorImpl.java:2062-2064): a step
///   landing on `isEntryKnownDeleted(index)` returns null, so the `getNext`
///   loop moves past it (the slot is skipped).
///
/// KD slots legitimately exist in live BINs during BIN-delta reconstitution
/// (`mutate_to_full_bin` applies delta KD slots) until the compressor
/// reclaims them. The test reaches that state directly by setting the flag
/// on a slot through the BIN arc, then checks that the user-facing
/// read/scan paths never surface the slot as live.
#[test]
fn test_tree_f1_known_deleted_slot_is_absent_and_skipped() {
    let tree = Tree::new(1, 8);
    // Six keys: enough for one BIN to hold several live slots.
    for i in 0..6u32 {
        tree.insert(
            format!("kd{i:04}").into_bytes(),
            vec![i as u8],
            Lsn::new(1, i),
        )
        .unwrap();
    }

    // Flag a middle key's slot as known_deleted directly in the BIN,
    // modelling a delta-applied tombstone the compressor has not reclaimed.
    let kd_key = b"kd0003".to_vec();
    {
        let root = tree.get_root().expect("root");
        let bin_arc = find_bin_arc_for_key(&root, &kd_key).expect("bin");
        let mut guard = bin_arc.write();
        let TreeNode::Bottom(bin) = &mut *guard else {
            panic!("expected BIN");
        };
        // The range starts at 0, so the position found is the slot index.
        let slot = (0..bin.entries.len())
            .position(|i| {
                bin.get_full_key(i).as_deref() == Some(kd_key.as_slice())
            })
            .expect("kd key slot");
        bin.entries[slot].known_deleted = true;
    }

    // (a) exact lookup through Tree::search must say NOT found.
    let search_hit = tree
        .search(&kd_key)
        .map(|r| r.is_exact_match())
        .unwrap_or(false);
    assert!(
        !search_hit,
        "TREE-F1: Tree::search must report a known_deleted slot as absent \
         (IN.findEntry IN.java:3197)"
    );

    // (a) exact lookup through Tree::search_with_data must say NOT found.
    let fetched = tree.search_with_data(&kd_key).expect("slot fetch");
    assert!(
        !fetched.found,
        "TREE-F1: Tree::search_with_data must report a known_deleted slot \
         as absent (IN.findEntry IN.java:3197)"
    );

    // The live keys on either side must be unaffected.
    for neighbour in [b"kd0002".to_vec(), b"kd0004".to_vec()] {
        let still_there = tree
            .search(&neighbour)
            .map(|r| r.is_exact_match())
            .unwrap_or(false);
        assert!(still_there, "live neighbour must remain findable");
    }

    // (b) the scan-facing BIN dumps (descend_to_edge_bin / get_next_bin /
    // get_prev_bin) hand back slots verbatim WITH the known_deleted flag
    // set, so the cursor can skip them (CursorImpl.java:2062-2064). The
    // contract checked: any slot carrying the KD key is flagged, i.e. it is
    // never reported as a LIVE entry.
    let root = tree.get_root().expect("root");
    let edge = Tree::descend_to_edge_bin(&root, true).expect("edge bin");
    assert!(
        edge.iter().all(|(e, _, k)| k != &kd_key || e.known_deleted),
        "TREE-F1: scan must not surface a known_deleted slot as live \
         (CursorImpl.java:2062-2064)"
    );
    for anchor in [b"kd0000".to_vec(), b"kd0005".to_vec()] {
        let adjacent = [tree.get_next_bin(&anchor), tree.get_prev_bin(&anchor)];
        for entries in adjacent.into_iter().flatten() {
            assert!(
                entries
                    .iter()
                    .all(|(e, _, k)| k != &kd_key || e.known_deleted),
                "TREE-F1: get_next_bin/get_prev_bin must not surface a \
                 known_deleted slot as live"
            );
        }
    }

    // first_entry_at_or_after must step over a KD slot at the boundary.
    if let Some((boundary_key, _, _)) = tree.first_entry_at_or_after(&kd_key) {
        assert_ne!(
            boundary_key, kd_key,
            "TREE-F1: first_entry_at_or_after must skip a known_deleted \
             slot (CursorImpl.java:2062-2064)"
        );
    }

    // The compressor's KD-iteration path must STILL see the slot: only the
    // user-facing read predicate hides it, not the maintenance iteration
    // whose job is to reclaim KD slots.
    let kd_bins = tree.collect_bins_with_known_deleted();
    assert!(
        !kd_bins.is_empty(),
        "TREE-F1: collect_bins_with_known_deleted must still observe the \
         KD slot so the compressor can reclaim it"
    );
}
13987}