Skip to main content

noxu_tree/
tree.rs

1//! B+tree implementation.
2//!
3//!
4//! Tree implements the B+tree. It provides search, insert, and delete
5//! operations on the tree structure. The tree uses latch-coupling for
6//! concurrent access: when traversing down the tree, the parent latch
7//! is released after the child latch is acquired.
8//!
9//! # Architecture
10//!
11//! The tree has a hierarchical structure:
12//! - Internal Nodes (IN) at levels 2 and above
13//! - Bottom Internal Nodes (BIN) at level 1
14//! - Leaf Nodes (LN) containing actual data
15//!
16//! # Locking Strategy
17//!
18//! - Root latch protects the root pointer itself
19//! - Each node has its own latch for concurrent access
20//! - Search uses latch-coupling: acquire child, release parent
21//! - Modifications may require exclusive latches
22
23use crate::error::TreeError;
24use crate::key::{create_key_prefix, get_key_prefix_length};
25use crate::search_result::SearchResult;
26use noxu_latch::{LatchContext, SharedLatch};
27use noxu_util::{Lsn, NULL_LSN};
28use parking_lot::RwLock;
29use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
30use std::sync::{Arc, Weak};
31
/// Observer that mirrors JE's `INList` feeding the evictor's `LRUList`s.
///
/// The tree owns no eviction policy of its own; instead it notifies a
/// registered listener whenever an IN/BIN node enters the resident cache, is
/// accessed, or is removed.  The `Evictor` (in `noxu-evictor`) implements this
/// trait, but the dependency is one-way (`noxu-evictor` → `noxu-tree`), so the
/// tree refers to the listener only through this trait object — avoiding a
/// circular crate dependency.
///
/// The `Send + Sync` supertraits let a single listener be held as an
/// `Arc<dyn InListListener>` (see `Tree::in_list_listener`).  Every callback
/// takes `&self`, so an implementation that keeps state needs interior
/// mutability.
///
/// JE reference: `IN.fetchTarget` / split / `rebuildINList` call
/// `Evictor.addBack`; node access calls `Evictor.moveBack`; node removal
/// calls `Evictor.remove`.
pub trait InListListener: Send + Sync {
    /// A node has just become resident in the cache (JE `Evictor.addBack`).
    ///
    /// `node_id` identifies the node that was added.
    fn note_ins_added(&self, node_id: u64);
    /// A resident node was accessed (JE `Evictor.moveBack` — LRU touch).
    ///
    /// `node_id` identifies the node that was touched.
    fn note_ins_accessed(&self, node_id: u64);
    /// A node was removed from the cache (JE `Evictor.remove`).
    ///
    /// `node_id` identifies the node that was removed.
    fn note_ins_removed(&self, node_id: u64);
}
52
// Level and flag constants re-exported here for tree-internal use.

/// Flag bit `0x20000`, above the bits selected by [`LEVEL_MASK`].
/// NOTE(review): presumably marks levels of the DB-mapping tree — confirm.
pub const DBMAP_LEVEL: i32 = 0x20000;
/// Flag bit `0x10000`, above the bits selected by [`LEVEL_MASK`]; it is OR-ed
/// into [`BIN_LEVEL`].
/// NOTE(review): presumably marks levels of a main (non-mapping) tree — confirm.
pub const MAIN_LEVEL: i32 = 0x10000;
/// Mask selecting the low 16 bits of a level value.
pub const LEVEL_MASK: i32 = 0x0ffff;
/// Sentinel level value `-1`.
/// NOTE(review): its exact meaning is not visible in this part of the file.
pub const MIN_LEVEL: i32 = -1;
/// Level of a BIN: level number 1 with the [`MAIN_LEVEL`] bit set.
pub const BIN_LEVEL: i32 = MAIN_LEVEL | 1;
/// Flag bit 16 (`0x10000`).
/// NOTE(review): numerically equal to [`MAIN_LEVEL`]; presumably applied to
/// search-result words rather than to level values — confirm the two are
/// never combined in one integer.
pub const EXACT_MATCH: i32 = 1 << 16;
/// Flag bit 17 (`0x20000`).
/// NOTE(review): numerically equal to [`DBMAP_LEVEL`]; same caveat as
/// [`EXACT_MATCH`].
pub const INSERT_SUCCESS: i32 = 1 << 17;
61
/// Per-slot fixed memory overhead for a BIN entry, in bytes (DBI-23).
///
/// Computed as `size_of::<BinEntry>()` plus `LsnRep::BYTES_PER_LSN_ENTRY`.
///
/// The first term is the heap footprint of one `BinEntry` *struct* as it lives
/// inside the BIN's `Vec<BinEntry>` buffer — NOT counting the variable-length
/// key and data bytes, which are separate heap allocations counted on top of
/// this.
///
/// Faithful to JE `IN.getEntryInMemorySize` + the per-slot `entryStates` /
/// LSN-array overhead folded into `IN.computeMemorySize` (IN.java ~4632):
/// JE measures the slot's fixed cost with `Sizeof` on the JVM; Rust has a
/// fixed struct layout so `size_of::<BinEntry>()` is exact.
///
/// T-2/T-3: the per-slot `key` (`Vec<u8>` header) and `lsn` (`u64`) were
/// hoisted out of `BinEntry` into the node-level `KeyRep`/`LsnRep`.  The
/// `size_of::<BinEntry>()` therefore shrank; the second term adds back the
/// packed per-slot LSN-rep cost (`LsnRep::BYTES_PER_LSN_ENTRY`, 4 bytes) so
/// the incremental live counter still approximates the walked heap (the key
/// bytes are charged separately as `key.len()` at the call site, matching the
/// compact key rep).
///
/// Derived (not hard-coded) so a layout change to `BinEntry` is tracked
/// automatically — see `bin_stub_conformance` for the drift guard.
pub const BIN_ENTRY_OVERHEAD: usize =
    std::mem::size_of::<BinEntry>() + LsnRep::BYTES_PER_LSN_ENTRY;
84
/// Per-slot fixed memory overhead for an IN entry, in bytes (DBI-23).
///
/// Equal to `size_of::<InEntry>()`: the heap footprint of one `InEntry`
/// struct inside the IN's `Vec<InEntry>` buffer (key bytes counted
/// separately).  Unlike [`BIN_ENTRY_OVERHEAD`], no per-slot LSN-rep term is
/// added here.
///
/// JE counterpart: `IN.getEntryInMemorySize` for an upper IN plus the
/// per-slot state/LSN/target overhead from `IN.computeMemorySize`.
pub const IN_ENTRY_OVERHEAD: usize = std::mem::size_of::<InEntry>();
92
/// Type alias for the key comparator used by sorted-duplicate databases.
///
/// The comparator takes two full (uncompressed) keys and returns their
/// relative ordering.  For sorted-dup databases this is `DupKeyData::compare`,
/// which splits each key into primary + data parts and applies separate
/// comparators to each.  For normal databases the tree's `key_comparator`
/// field is `None` and lexicographic byte comparison is used.
///
/// The closure is reference-counted (`Arc`) and bounded by `Send + Sync`, so
/// one comparator can be cloned cheaply and used from several threads.
///
/// JE counterpart: `DatabaseImpl.btreeComparator` / `DatabaseImpl.dupComparator`.
pub type KeyComparatorFn =
    Arc<dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering + Send + Sync>;
104
/// Combined search result carrying slot data and the BIN arc, returned by
/// [`Tree::search_with_data`].
///
/// Avoids the double-descent pattern where `Tree::search` checked key
/// existence and a second call re-descended to fetch the actual slot bytes.
/// One descent now serves both purposes (Wave-11-I optimisation).
pub struct SlotFetch {
    /// `true` if an exact key match was found and is not expired.
    pub found: bool,
    /// Data bytes for the slot (`None` when `found` is `false`).
    pub data: Option<Vec<u8>>,
    /// Raw slot LSN as `u64`; zero when `found` is `false`.
    pub lsn: u64,
    /// Slot index within the BIN.  Set to the actual BIN slot index when
    /// `found` is `true`; `0` otherwise.
    ///
    /// Used by `CursorImpl` to set `current_index` correctly so that
    /// `retrieve_next` advances to the right slot after a search.
    pub slot_index: usize,
    /// Arc to the BIN that the descent reached.  The field is not an
    /// `Option`: it is populated whether or not `found` is `true`, so a
    /// `SlotFetch` can only be built once a BIN has been reached.
    pub bin_arc: Arc<RwLock<TreeNode>>,
}
128
/// The B+tree.
///
/// This is the main tree structure that manages the B+tree nodes and
/// provides operations for search, insert, delete, and tree maintenance.
pub struct Tree {
    /// Database ID this tree belongs to.
    database_id: u64,

    /// Maximum entries per node (from config).
    max_entries_per_node: usize,

    /// Root of the tree. None if tree is empty.
    ///
    /// Wrapped in `RwLock` so that `insert`, `delete`, and other mutating
    /// operations can take `&self` (interior mutability), enabling concurrent
    /// access to different BIN nodes without requiring a global `&mut Tree`
    /// borrow.  The root pointer itself is only written during root splits
    /// and initial creation; all other access is read-only.
    ///
    /// JE counterpart: `Tree.root`, protected by the root latch.
    root: RwLock<Option<Arc<RwLock<TreeNode>>>>,

    /// Latch protecting the root reference itself.
    /// Must be held when changing the root pointer.
    root_latch: SharedLatch,

    /// LSN at which the current root IN/BIN was last logged.
    ///
    /// Used by the IN-redo currency check (`recover_root_bin` /
    /// `recover_root_upper_in`) to decide whether a logged root replaces the
    /// in-memory one.  Updated whenever a new root is installed via
    /// `set_root_with_lsn` or the IN-redo recover-root path.
    ///
    /// JE `RootUpdater.originalLsn` / `ChildReference.getLsn()` for the root.
    root_log_lsn: RwLock<noxu_util::Lsn>,

    /// Statistics: number of times the root has been split.
    root_splits: AtomicU64,

    /// Statistics: number of latch upgrades from shared to exclusive.
    relatches_required: AtomicU64,

    /// Optional custom key comparator for sorted-duplicate databases.
    ///
    /// When `Some`, all key comparisons in tree traversal (upper IN routing
    /// and BIN entry search/insert/delete) use this comparator instead of
    /// lexicographic byte comparison.
    ///
    /// JE counterpart: `DatabaseImpl.btreeComparator` / `dupComparator`,
    /// stored on the database and consulted at every `IN.findEntry()` call.
    pub key_comparator: Option<KeyComparatorFn>,

    /// Shared memory counter for the evictor / MemoryBudget.
    ///
    /// Updated on every BIN entry insert (+key+data+overhead) and delete
    /// (-key+overhead) so the evictor sees real cache pressure.
    ///
    /// JE counterpart: the `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
    /// call in `IN.updateMemorySize()`.  In Noxu the counter is an
    /// `Arc<AtomicI64>` shared with the `Arbiter` (and later `MemoryBudget`)
    /// to avoid a circular crate dependency (`noxu-tree` → `noxu-dbi`).
    pub memory_counter: Option<Arc<AtomicI64>>,

    /// Optional listener fed on node add/access/remove, mirroring JE's
    /// `INList` feeding the evictor's `LRUList`s.
    ///
    /// When `None` (the default — used by unit tests with no environment),
    /// the notifications are no-ops.  `EnvironmentImpl` installs the
    /// `Evictor` here so production inserts/accesses populate the LRU lists
    /// the evictor drains.
    ///
    /// JE reference: `IN.fetchTarget`/split/`rebuildINList` → `addBack`,
    /// access → `moveBack`, removal → `remove`.
    pub in_list_listener: Option<Arc<dyn InListListener>>,

    /// Optional log manager so an evicted root IN can be re-materialized from
    /// its persisted `root_log_lsn` on the next access (EV-14, piece B).
    ///
    /// JE's `Tree` reaches the log via `database.getEnv().getLogManager()`;
    /// `Tree.getRootINRootAlreadyLatched` calls `root.fetchTarget(...)` which
    /// reads the root IN back from its `ChildReference` LSN when the in-memory
    /// target is null (Tree.java:477-516, ChildReference.fetchTarget).  Noxu
    /// has no env back-reference here, so the log manager is installed
    /// directly (the same one-way wiring as `in_list_listener`).  When `None`
    /// (unit tests with no environment), an evicted root cannot be re-fetched
    /// — but `evict_root` refuses to evict without a log manager, so the root
    /// is never made non-resident in that configuration.
    pub log_manager: Option<Arc<noxu_log::LogManager>>,

    /// Capacity hint for the recovery redo path.
    ///
    /// When non-zero, the first BIN created by `redo_insert` (the first-key
    /// path) pre-allocates its `entries` Vec with this capacity so that
    /// redo insertions proceed without Vec-resize doublings.  The value is
    /// clamped to `max_entries_per_node` at use.
    ///
    /// Set by `hint_redo_capacity` before the redo loop.
    /// Wave 11-K optimisation (Fix 3).
    redo_capacity_hint: usize,

    /// Whether key-prefix compression is enabled for this tree's BINs.
    ///
    /// JE `DatabaseImpl.getKeyPrefixing()` / `DatabaseConfig.setKeyPrefixing()`.
    /// When `false`, `IN.computeKeyPrefix` returns `null` in JE — no prefix
    /// is ever set. Noxu mirrors this: `insert_with_prefix` is skipped in
    /// favour of `insert_raw`, and `recompute_key_prefix` is not called on
    /// BIN halves after a split.
    ///
    /// Default: `false` (matches JE's `DatabaseConfig.KEY_PREFIXING_DEFAULT`).
    ///
    /// Ref: `IN.java computeKeyPrefix` ~line 2456.
    pub key_prefixing: bool,

    /// T-5: maximum post-prefix key length (bytes) for the compact key rep
    /// (`INKeyRep.MaxKeySize`).  A node packs all its keys into one fixed-width
    /// byte array when every post-prefix key is `<=` this length; a longer key
    /// inflates the node to the `Default` rep.  `<= 0` disables the compact
    /// rep entirely.
    ///
    /// Default 16 (`TREE_COMPACT_MAX_KEY_LENGTH` /
    /// `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH`).  Wired from
    /// `EnvironmentConfig` via `Tree::set_compact_max_key_length`
    /// (`IN.getCompactMaxKeyLength`, IN.java:4929).
    pub compact_max_key_length: i32,
}
255
/// A node in the tree.
///
/// `TreeNode` wraps either an upper IN or a BIN.  Each variant carries a
/// lightweight stub whose fields mirror the persistent IN/BIN structure.  The
/// stubs will be replaced with full InNode/Bin types as the implementation
/// matures; the API surface here is intentionally minimal.
///
/// Resident nodes are shared as `Arc<RwLock<TreeNode>>` (see [`ChildArc`]).
#[derive(Debug)]
pub enum TreeNode {
    /// Internal Node (IN) - non-leaf node in the tree.
    Internal(InNodeStub),

    /// Bottom Internal Node (BIN) - leaf-level internal node.
    Bottom(BinStub),
}
270
/// Type alias for a resident child pointer: a shared, lock-protected
/// [`TreeNode`].  Cloning a `ChildArc` clones the `Arc` handle, not the node.
pub type ChildArc = Arc<RwLock<TreeNode>>;
273
/// T-4: per-node representation of the resident-child-pointer array.
///
/// Faithful to JE `INTargetRep` (`INTargetRep.java`), the abstract array of
/// target pointers to an IN's cached children.  These arrays are usually
/// sparse — most upper INs have NO resident children — so JE never stores a
/// full per-slot `Node[]` until many children are actually cached:
///
///   * `None`   — `INTargetRep.None`: a shared singleton, 0 child-pointer
///     bytes, used when no children are cached (the common case for upper
///     INs).  `get` returns null for every slot.
///   * `Sparse` — `INTargetRep.Sparse`: a small parallel `(index, target)[]`
///     for 1..=`MAX_ENTRIES` cached children (JE caps at 4).  `get(j)` is a
///     linear scan of the index array.
///   * `Default`— `INTargetRep.Default`: the full `Vec<Option<Arc>>`, one
///     slot per entry, used once more than `MAX_ENTRIES` children are
///     resident.
///
/// A node starts `None` and grows `None → Sparse → Default`.  JE does not
/// shrink back when entries are nulled (it only compacts on IN-stripping) to
/// avoid transitionary rep churn; we follow the same policy — `set` only
/// inflates, and `compact()` (called on eviction/stripping) collapses an
/// empty/small `Default`/`Sparse` back toward `None`.
#[derive(Debug)]
pub enum TargetRep {
    /// `INTargetRep.None` — no children cached (shared-singleton semantics).
    None,
    /// `INTargetRep.Sparse` — a few cached children, `(slot_index, child)`.
    /// Invariant: `len() <= SPARSE_MAX_ENTRIES`.  Slot indices are stored as
    /// `u16`, so this variant can only describe slots `0..=u16::MAX`.
    Sparse(Vec<(u16, ChildArc)>),
    /// `INTargetRep.Default` — full parallel array, one slot per entry.
    /// The vector may be shorter than the node's entry count; slots past its
    /// end read as "no child" (see `TargetRep::get`).
    Default(Vec<Option<ChildArc>>),
}
306
307impl TargetRep {
308    /// `INTargetRep.Sparse.MAX_ENTRIES` (INTargetRep.java) — the maximum
309    /// number of cached children the `Sparse` rep holds before inflating to
310    /// `Default`.
311    pub const SPARSE_MAX_ENTRIES: usize = 4;
312
313    /// `INTargetRep.get(idx)` — the cached child for slot `idx`, or `None`.
314    #[inline]
315    pub fn get(&self, idx: usize) -> Option<&ChildArc> {
316        match self {
317            TargetRep::None => None,
318            TargetRep::Sparse(v) => {
319                v.iter().find(|(i, _)| *i as usize == idx).map(|(_, c)| c)
320            }
321            TargetRep::Default(v) => v.get(idx).and_then(|o| o.as_ref()),
322        }
323    }
324
325    /// `INTargetRep.set(idx, node, parent)` — set (or clear, when `node` is
326    /// `None`) the cached child for slot `idx`, mutating the representation
327    /// upward (`None → Sparse → Default`) as needed.
328    pub fn set(&mut self, idx: usize, node: Option<ChildArc>) {
329        match self {
330            TargetRep::None => {
331                // INTargetRep.None.set: clearing stays None; setting mutates
332                // to a Sparse rep and sets there.
333                if let Some(child) = node {
334                    *self = TargetRep::Sparse(vec![(idx as u16, child)]);
335                }
336            }
337            TargetRep::Sparse(v) => {
338                // Update existing slot in place.
339                if let Some(pos) =
340                    v.iter().position(|(i, _)| *i as usize == idx)
341                {
342                    match node {
343                        Some(child) => v[pos].1 = child,
344                        None => {
345                            v.swap_remove(pos);
346                        }
347                    }
348                    return;
349                }
350                // New child: clearing a non-present slot is a no-op.
351                let Some(child) = node else { return };
352                if v.len() < Self::SPARSE_MAX_ENTRIES {
353                    v.push((idx as u16, child));
354                    return;
355                }
356                // Full — INTargetRep.Sparse.set mutates to Default.
357                let cap = v.iter().map(|(i, _)| *i as usize).max().unwrap_or(0);
358                let cap = cap.max(idx) + 1;
359                let mut def: Vec<Option<ChildArc>> = vec![None; cap];
360                for (i, c) in v.drain(..) {
361                    def[i as usize] = Some(c);
362                }
363                def[idx] = Some(child);
364                *self = TargetRep::Default(def);
365            }
366            TargetRep::Default(v) => {
367                if idx >= v.len() {
368                    if node.is_none() {
369                        return;
370                    }
371                    v.resize_with(idx + 1, || None);
372                }
373                v[idx] = node;
374            }
375        }
376    }
377
378    /// `INTargetRep.None`-aware take: remove and return the cached child for
379    /// slot `idx`, leaving the slot empty (JE `IN.setTarget(idx, null)` plus
380    /// returning the old target).
381    pub fn take(&mut self, idx: usize) -> Option<ChildArc> {
382        match self {
383            TargetRep::None => None,
384            TargetRep::Sparse(v) => v
385                .iter()
386                .position(|(i, _)| *i as usize == idx)
387                .map(|pos| v.swap_remove(pos).1),
388            TargetRep::Default(v) => v.get_mut(idx).and_then(|o| o.take()),
389        }
390    }
391
392    /// JE `INArrayRep.copy(from, to, n, parent)` adapted to slice ops: shift
393    /// the child mapping when an entry is INSERTED at `idx` (all children at
394    /// slots `>= idx` move up by one).  Mirrors how `Vec::insert` shifts the
395    /// parallel `entries` array.
396    pub fn insert_shift(&mut self, idx: usize) {
397        match self {
398            TargetRep::None => {}
399            TargetRep::Sparse(v) => {
400                for (i, _) in v.iter_mut() {
401                    if (*i as usize) >= idx {
402                        *i += 1;
403                    }
404                }
405            }
406            TargetRep::Default(v) => {
407                if idx <= v.len() {
408                    v.insert(idx, None);
409                }
410            }
411        }
412    }
413
414    /// JE `INArrayRep.copy` adapted: shift the child mapping when the entry at
415    /// `idx` is REMOVED (all children at slots `> idx` move down by one; the
416    /// child at `idx` itself is dropped).  Mirrors `Vec::remove`.
417    pub fn remove_shift(&mut self, idx: usize) {
418        match self {
419            TargetRep::None => {}
420            TargetRep::Sparse(v) => {
421                v.retain(|(i, _)| *i as usize != idx);
422                for (i, _) in v.iter_mut() {
423                    if (*i as usize) > idx {
424                        *i -= 1;
425                    }
426                }
427            }
428            TargetRep::Default(v) => {
429                if idx < v.len() {
430                    v.remove(idx);
431                }
432            }
433        }
434    }
435
436    /// `INTargetRep.compact(parent)` — collapse toward the most compact rep:
437    /// an empty rep becomes `None`; a `Default` with `<= MAX_ENTRIES` children
438    /// becomes `Sparse` (or `None`).  Called when an IN is stripped/evicted.
439    pub fn compact(&mut self) {
440        let count = self.resident_count();
441        if count == 0 {
442            *self = TargetRep::None;
443            return;
444        }
445        if count <= Self::SPARSE_MAX_ENTRIES
446            && let TargetRep::Default(v) = self
447        {
448            let sparse: Vec<(u16, ChildArc)> = v
449                .iter()
450                .enumerate()
451                .filter_map(|(i, o)| o.as_ref().map(|c| (i as u16, c.clone())))
452                .collect();
453            *self = TargetRep::Sparse(sparse);
454        }
455    }
456
457    /// Number of resident (non-null) children.
458    pub fn resident_count(&self) -> usize {
459        match self {
460            TargetRep::None => 0,
461            TargetRep::Sparse(v) => v.len(),
462            TargetRep::Default(v) => v.iter().filter(|o| o.is_some()).count(),
463        }
464    }
465
466    /// True if no children are cached (`INTargetRep.None` or empty).
467    pub fn is_empty(&self) -> bool {
468        self.resident_count() == 0
469    }
470
471    /// Iterate every resident child (in unspecified order).
472    pub fn iter_children(&self) -> Box<dyn Iterator<Item = ChildArc> + '_> {
473        match self {
474            TargetRep::None => Box::new(std::iter::empty()),
475            TargetRep::Sparse(v) => Box::new(v.iter().map(|(_, c)| c.clone())),
476            TargetRep::Default(v) => {
477                Box::new(v.iter().filter_map(|o| o.clone()))
478            }
479        }
480    }
481
482    /// `INTargetRep.calculateMemorySize()` — heap bytes of the rep itself
483    /// (excluding the children it points at).  `None` is 0 (shared singleton),
484    /// matching `INTargetRep.None.calculateMemorySize() == 0`.
485    pub fn memory_size(&self) -> usize {
486        use std::mem::size_of;
487        match self {
488            TargetRep::None => 0,
489            TargetRep::Sparse(v) => v.capacity() * size_of::<(u16, ChildArc)>(),
490            TargetRep::Default(v) => {
491                v.capacity() * size_of::<Option<ChildArc>>()
492            }
493        }
494    }
495}
496
/// T-3: node-level packed LSN array — `IN.entryLsnByteArray` /
/// `IN.entryLsnLongArray` (IN.java:251-289, getLsn/setLsnInternal
/// IN.java:1752-1935).
///
/// JE stores one LSN per slot.  A naive `Lsn` (u64) costs 8 bytes/slot even
/// though most LSNs in a node share a file number and have a file offset that
/// fits in 3 bytes.  JE's compact rep is a single `byte[]` with
/// `BYTES_PER_LSN_ENTRY == 4` bytes per slot:
///
///   * `base_file_number` is the lowest file number of any non-NULL LSN in the
///     node;
///   * byte 0 of each slot = `file_number - base_file_number` (0..=127,
///     `Byte.MAX_VALUE`);
///   * bytes 1..4 = the 3-byte little-endian file offset (max
///     `MAX_FILE_OFFSET == 0xff_fffe`).
///
/// The NULL_LSN blocker (Noxu `NULL_LSN == u64::MAX`) is solved EXACTLY as JE
/// does it: NULL is NOT stored as the raw u64; the slot's 3 file-offset bytes
/// are set to `0xff_ffff` (`THREE_BYTE_NEGATIVE_ONE`), a value `MAX_FILE_OFFSET`
/// can never reach, and `get` maps it back to `NULL_LSN`.
///
/// If a file-number difference exceeds 127 or a file offset exceeds
/// `MAX_FILE_OFFSET`, the rep mutates to `Long` (one `u64` per slot), matching
/// JE's `mutateToLongArray` (IN.java:1924).  An all-NULL node uses `Empty`
/// (0 bytes), matching the EMPTY_REP/initial-capacity-free state.
///
/// In every variant a slot index past the end of the stored array reads as
/// `NULL_LSN` (see `LsnRep::get`).
#[derive(Debug)]
pub enum LsnRep {
    /// All slots NULL — 0 heap bytes (the `byteArray == null` initial state).
    Empty,
    /// `IN.entryLsnByteArray` — 4 bytes/slot, `base_file_number`-relative.
    Compact { base_file_number: u32, bytes: Vec<u8> },
    /// `IN.entryLsnLongArray` — 8 bytes/slot fallback after `mutateToLongArray`.
    Long(Vec<Lsn>),
}
531
532impl LsnRep {
    /// `IN.BYTES_PER_LSN_ENTRY` (IN.java:151) — bytes per slot in the
    /// `Compact` rep: one file-number-offset byte followed by three
    /// file-offset bytes.
    pub const BYTES_PER_LSN_ENTRY: usize = 4;
    /// `IN.MAX_FILE_OFFSET` (IN.java:152) — max file offset the 3-byte form holds.
    /// One less than `THREE_BYTE_NEGATIVE_ONE`, which is reserved for NULL.
    const MAX_FILE_OFFSET: u32 = 0x00ff_fffe;
    /// `IN.THREE_BYTE_NEGATIVE_ONE` (IN.java:153) — the NULL sentinel in the
    /// 3 file-offset bytes.
    const THREE_BYTE_NEGATIVE_ONE: u32 = 0x00ff_ffff;
    /// `Byte.MAX_VALUE` — max file-number difference the 1-byte offset holds.
    const MAX_FILE_NUMBER_OFFSET: u32 = 127;
542
543    /// A rep sized for `n` slots, all NULL.  Returns `Empty` (0 bytes); the
544    /// Compact byte array is lazily allocated by the first non-NULL `set_lsn`
545    /// — `base_file_number` is unknown until then (IN.java:1820, the
546    /// `baseFileNumber == -1` first-entry case).
547    #[inline]
548    pub fn new(_n: usize) -> Self {
549        LsnRep::Empty
550    }
551
552    /// Build a rep from a per-slot `Lsn` slice (used by node construction and
553    /// split, where slots arrive together).  Equivalent to `new(lsns.len())`
554    /// followed by `set(i, lsns[i])` for each slot.
555    pub fn from_lsns(lsns: &[Lsn]) -> Self {
556        let mut rep = LsnRep::Empty;
557        let n = lsns.len();
558        for (i, &lsn) in lsns.iter().enumerate() {
559            rep.set(i, lsn, n);
560        }
561        rep
562    }
563
564    /// `IN.getLsn(idx)` (IN.java:1752).
565    pub fn get(&self, idx: usize) -> Lsn {
566        match self {
567            LsnRep::Empty => NULL_LSN,
568            LsnRep::Long(v) => v.get(idx).copied().unwrap_or(NULL_LSN),
569            LsnRep::Compact { base_file_number, bytes } => {
570                let off = idx * Self::BYTES_PER_LSN_ENTRY;
571                if off + Self::BYTES_PER_LSN_ENTRY > bytes.len() {
572                    return NULL_LSN;
573                }
574                let file_offset = Self::get_3byte(bytes, off + 1);
575                if file_offset == Self::THREE_BYTE_NEGATIVE_ONE {
576                    NULL_LSN
577                } else {
578                    let file_number = base_file_number + bytes[off] as u32;
579                    Lsn::new(file_number, file_offset)
580                }
581            }
582        }
583    }
584
    /// `IN.setLsnInternal(idx, value)` (IN.java:1801) — set the LSN of slot
    /// `idx`, mutating Empty→Compact→Long as necessary.
    ///
    /// * `idx` — slot to write; the backing array grows if `idx` is past its
    ///   end, and slots added by that growth read NULL.
    /// * `lsn` — value to store; `NULL_LSN` clears the slot.
    /// * `n` — the node's slot count.  Only used to size a freshly-allocated
    ///   Compact array (at least `idx + 1` slots are always allocated).
    ///
    /// The rep falls back to `Long` when the file offset exceeds
    /// `MAX_FILE_OFFSET`, or when the file number cannot be expressed as a
    /// one-byte offset from the base (after trying to re-base downward).
    /// A `Long` rep never converts back here.
    pub fn set(&mut self, idx: usize, lsn: Lsn, n: usize) {
        // Empty: first non-NULL value allocates the Compact array; a NULL set
        // on an Empty rep is a no-op (all slots already read NULL).
        if let LsnRep::Empty = self {
            if lsn.is_null() {
                return;
            }
            let cap = n.max(idx + 1);
            *self = LsnRep::Compact {
                base_file_number: lsn.file_number(),
                bytes: vec![0u8; cap * Self::BYTES_PER_LSN_ENTRY],
            };
            // Mark every other slot NULL (3-byte offset = 0xffffff).
            if let LsnRep::Compact { bytes, .. } = self {
                for s in 0..cap {
                    if s != idx {
                        Self::put_3byte(
                            bytes,
                            s * Self::BYTES_PER_LSN_ENTRY + 1,
                            Self::THREE_BYTE_NEGATIVE_ONE,
                        );
                    }
                }
            }
            // Re-enter now that the rep is Compact; the Compact path below
            // stores the value (or mutates to Long if it does not fit).
            self.set(idx, lsn, n);
            return;
        }

        // Long: plain per-slot storage, growing with NULLs as needed.
        if let LsnRep::Long(v) = self {
            if idx >= v.len() {
                v.resize(idx + 1, NULL_LSN);
            }
            v[idx] = lsn;
            return;
        }

        // Compact path.
        let LsnRep::Compact { base_file_number, bytes } = self else {
            unreachable!()
        };
        // Grow the byte array to cover `idx`; new slots must read NULL, so
        // their file-offset bytes are set to the sentinel.
        let need = (idx + 1) * Self::BYTES_PER_LSN_ENTRY;
        if need > bytes.len() {
            let old = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
            bytes.resize(need, 0);
            for s in old..(idx + 1) {
                Self::put_3byte(
                    bytes,
                    s * Self::BYTES_PER_LSN_ENTRY + 1,
                    Self::THREE_BYTE_NEGATIVE_ONE,
                );
            }
        }
        let off = idx * Self::BYTES_PER_LSN_ENTRY;

        if lsn.is_null() {
            // IN.java:1812 — file-number offset 0, file offset -1 (0xffffff).
            bytes[off] = 0;
            Self::put_3byte(bytes, off + 1, Self::THREE_BYTE_NEGATIVE_ONE);
            return;
        }

        let this_file_number = lsn.file_number();
        let this_file_offset = lsn.file_offset();

        // Whether to fall back to the Long rep.  The `||` short-circuits, so
        // no re-basing is attempted when the file offset alone forces Long.
        let mutate = this_file_offset > Self::MAX_FILE_OFFSET || {
            if this_file_number < *base_file_number {
                // IN.java:1827 — try to re-base downward; bail if any existing
                // slot would then exceed the 1-byte file-number offset.
                !Self::adjust_file_numbers(
                    bytes,
                    *base_file_number,
                    this_file_number,
                )
            } else {
                this_file_number - *base_file_number
                    > Self::MAX_FILE_NUMBER_OFFSET
            }
        };

        if mutate {
            // IN.java:1924 mutateToLongArray.
            // `bytes` is still encoded against the old base here: either the
            // re-base was never attempted or it failed without modifying it.
            let nelts = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
            let mut longs = vec![NULL_LSN; nelts.max(idx + 1)];
            for (s, slot) in longs.iter_mut().enumerate().take(nelts) {
                *slot = self_get_compact(*base_file_number, bytes, s);
            }
            longs[idx] = lsn;
            *self = LsnRep::Long(longs);
            return;
        }

        // A successful downward re-base already rewrote the existing slots;
        // record the new base before encoding this slot against it.
        if this_file_number < *base_file_number {
            *base_file_number = this_file_number;
        }
        bytes[off] = (this_file_number - *base_file_number) as u8;
        Self::put_3byte(bytes, off + 1, this_file_offset);
    }
686
687    /// `IN.adjustFileNumbers` (IN.java:1855) — re-base to a lower file number,
688    /// rewriting every existing slot's 1-byte offset.  Returns false (and
689    /// leaves `bytes` unchanged) if any slot would overflow the 1-byte offset.
690    fn adjust_file_numbers(
691        bytes: &mut [u8],
692        old_base: u32,
693        new_base: u32,
694    ) -> bool {
695        let stride = Self::BYTES_PER_LSN_ENTRY;
696        // First pass: verify none overflow.
697        let mut i = 0;
698        while i < bytes.len() {
699            if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
700                let cur_fn = old_base + bytes[i] as u32;
701                if cur_fn - new_base > Self::MAX_FILE_NUMBER_OFFSET {
702                    return false;
703                }
704            }
705            i += stride;
706        }
707        // Second pass: apply.
708        let mut i = 0;
709        while i < bytes.len() {
710            if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
711                let cur_fn = old_base + bytes[i] as u32;
712                bytes[i] = (cur_fn - new_base) as u8;
713            }
714            i += stride;
715        }
716        true
717    }
718
719    /// `INArrayRep.copy` analogue: shift LSNs when an entry is inserted at
720    /// `idx` (slots `>= idx` move up one).  Mirrors `targets.insert_shift`.
721    pub fn insert_shift(&mut self, idx: usize, n: usize) {
722        match self {
723            LsnRep::Empty => {}
724            LsnRep::Long(v) => {
725                if idx <= v.len() {
726                    v.insert(idx, NULL_LSN);
727                }
728            }
729            LsnRep::Compact { bytes, .. } => {
730                let stride = Self::BYTES_PER_LSN_ENTRY;
731                let cap = (n.max((bytes.len() / stride) + 1)) * stride;
732                bytes.resize(cap, 0);
733                let at = idx * stride;
734                // Shift the tail up by one slot.
735                bytes.copy_within(at..cap - stride, at + stride);
736                // The new slot reads NULL.
737                Self::put_3byte(bytes, at + 1, Self::THREE_BYTE_NEGATIVE_ONE);
738            }
739        }
740    }
741
742    /// `INArrayRep.copy` analogue: shift LSNs when entry `idx` is removed
743    /// (slots `> idx` move down one).  Mirrors `targets.remove_shift`.
744    pub fn remove_shift(&mut self, idx: usize) {
745        match self {
746            LsnRep::Empty => {}
747            LsnRep::Long(v) => {
748                if idx < v.len() {
749                    v.remove(idx);
750                }
751            }
752            LsnRep::Compact { bytes, .. } => {
753                let stride = Self::BYTES_PER_LSN_ENTRY;
754                let at = idx * stride;
755                if at + stride <= bytes.len() {
756                    bytes.copy_within(at + stride.., at);
757                    let newlen = bytes.len() - stride;
758                    bytes.truncate(newlen);
759                }
760            }
761        }
762    }
763
764    /// `IN.computeLsnOverhead` analogue: heap bytes of the rep itself.
765    pub fn memory_size(&self) -> usize {
766        use std::mem::size_of;
767        match self {
768            LsnRep::Empty => 0,
769            LsnRep::Compact { bytes, .. } => bytes.capacity(),
770            LsnRep::Long(v) => v.capacity() * size_of::<Lsn>(),
771        }
772    }
773
774    fn put_3byte(bytes: &mut [u8], offset: usize, value: u32) {
775        bytes[offset] = (value & 0xFF) as u8;
776        bytes[offset + 1] = ((value >> 8) & 0xFF) as u8;
777        bytes[offset + 2] = ((value >> 16) & 0xFF) as u8;
778    }
779
780    fn get_3byte(bytes: &[u8], offset: usize) -> u32 {
781        (bytes[offset] as u32)
782            | ((bytes[offset + 1] as u32) << 8)
783            | ((bytes[offset + 2] as u32) << 16)
784    }
785}
786
787/// Helper used by `LsnRep::set` during `mutateToLongArray` to read an existing
788/// Compact slot without borrowing `self` (which is mid-mutation).
789fn self_get_compact(base_file_number: u32, bytes: &[u8], idx: usize) -> Lsn {
790    let off = idx * LsnRep::BYTES_PER_LSN_ENTRY;
791    let file_offset = LsnRep::get_3byte(bytes, off + 1);
792    if file_offset == LsnRep::THREE_BYTE_NEGATIVE_ONE {
793        NULL_LSN
794    } else {
795        Lsn::new(base_file_number + bytes[off] as u32, file_offset)
796    }
797}
798
/// `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH` (INKeyRep.java) and the
/// `TREE_COMPACT_MAX_KEY_LENGTH` config default.
///
/// This is the longest (post-prefix) key, in bytes, that may still be packed
/// into the fixed-width `KeyRep::Compact` representation.  `KeyRep::compact`
/// treats a threshold `<= 0` as "compaction disabled".
// The identifier keeps the JE class name `INKeyRep` as its prefix, so it is
// not SCREAMING_SNAKE_CASE; the lint is silenced for that reason.
#[allow(non_upper_case_globals)]
pub const INKeyRep_DEFAULT_MAX_KEY_LENGTH: i32 = 16;
803
/// T-2: node-level key array — `INKeyRep.{Default,MaxKeySize}` (INKeyRep.java).
///
/// The per-slot key that used to live in `BinEntry`/`InEntry` as a `Vec<u8>`
/// (24-byte header + a separate heap allocation per key) is hoisted here as a
/// node-level rep.  When every (post-prefix) key in the node is `<=`
/// `TREE_COMPACT_MAX_KEY_LENGTH` (default 16) the keys pack into ONE
/// fixed-width byte buffer (`MaxKeySize`): `slot_width` bytes per slot, with a
/// parallel `lengths` vector tracking the actual length of each key.  A key
/// longer than the slot width inflates the whole node to the `Default` rep
/// (one `Vec<u8>` per slot), matching JE's `Default.compact` /
/// `MaxKeySize.expandToDefaultRep`.
///
/// Per-slot lengths are stored as `u16`, so a key longer than `u16::MAX`
/// bytes is never held in the `Compact` rep, whatever threshold is
/// configured.
///
/// As in JE, this stores the UNPREFIXED suffix (key prefixing strips the
/// common prefix first), so the compact rep is the smaller post-prefix bytes.
#[derive(Debug, Clone)]
pub enum KeyRep {
    /// `INKeyRep.Default` — one owned key per slot (any length).
    Default(Vec<Vec<u8>>),
    /// `INKeyRep.MaxKeySize` — all keys packed into one fixed-width buffer.
    /// `buf.len() == slot_width * lengths.len()`; slot `i` occupies
    /// `buf[i*slot_width .. i*slot_width + lengths[i]]`.
    Compact { buf: Vec<u8>, slot_width: usize, lengths: Vec<u16> },
}

impl KeyRep {
    /// Longest key whose length is representable in the `u16` entries of a
    /// `Compact` rep's `lengths` vector.
    const MAX_COMPACT_KEY_LEN: usize = u16::MAX as usize;

    /// True when a key of `key_len` bytes can be stored in a Compact slot of
    /// `slot_width` bytes without overflowing the slot or its `u16` length.
    #[inline]
    fn fits_slot(key_len: usize, slot_width: usize) -> bool {
        key_len <= slot_width && key_len <= Self::MAX_COMPACT_KEY_LEN
    }

    /// An empty `Default` rep.
    #[inline]
    pub fn new() -> Self {
        KeyRep::Default(Vec::new())
    }

    /// Build a `Default` rep from owned keys (callers may later `compact`).
    #[inline]
    pub fn from_keys(keys: Vec<Vec<u8>>) -> Self {
        KeyRep::Default(keys)
    }

    /// Number of slots.
    #[inline]
    pub fn len(&self) -> usize {
        match self {
            KeyRep::Default(v) => v.len(),
            KeyRep::Compact { lengths, .. } => lengths.len(),
        }
    }

    /// True when the rep holds no slots.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// `INKeyRep.get(idx)` / `getKey` — borrow the (post-prefix) key at slot
    /// `idx` without allocating.
    ///
    /// # Panics
    /// Panics if `idx >= self.len()`.
    #[inline]
    pub fn get(&self, idx: usize) -> &[u8] {
        match self {
            KeyRep::Default(v) => v[idx].as_slice(),
            KeyRep::Compact { buf, slot_width, lengths } => {
                let off = idx * slot_width;
                &buf[off..off + lengths[idx] as usize]
            }
        }
    }

    /// Set the key at slot `idx`.  A key that does not fit a Compact rep's
    /// slot inflates the rep to `Default` first
    /// (`MaxKeySize.expandToDefaultRep`).
    ///
    /// # Panics
    /// Panics if `idx >= self.len()`.
    pub fn set(&mut self, idx: usize, key: Vec<u8>) {
        match self {
            KeyRep::Default(v) => v[idx] = key,
            KeyRep::Compact { slot_width, .. }
                if !Self::fits_slot(key.len(), *slot_width) =>
            {
                self.inflate_to_default();
                self.set(idx, key);
            }
            KeyRep::Compact { buf, slot_width, lengths } => {
                let off = idx * *slot_width;
                // Bytes past `key.len()` in the slot may be stale; `lengths`
                // is what delimits the key on read.
                buf[off..off + key.len()].copy_from_slice(&key);
                // Cannot truncate: the guard arm above bounds the length.
                lengths[idx] = key.len() as u16;
            }
        }
    }

    /// Insert a key at slot `idx`, shifting later slots up (mirrors
    /// `Vec::insert` + `INArrayRep.copy`).  A key that does not fit a Compact
    /// rep's slot inflates the rep to `Default` first.
    ///
    /// # Panics
    /// Panics if `idx > self.len()`.
    pub fn insert(&mut self, idx: usize, key: Vec<u8>) {
        match self {
            KeyRep::Default(v) => v.insert(idx, key),
            KeyRep::Compact { slot_width, .. }
                if !Self::fits_slot(key.len(), *slot_width) =>
            {
                self.inflate_to_default();
                self.insert(idx, key);
            }
            KeyRep::Compact { buf, slot_width, lengths } => {
                let sw = *slot_width;
                let at = idx * sw;
                let key_len = key.len();
                // Zero-pad the owned key to a full slot and splice it in as
                // one unit; `key_len <= sw`, so `resize` never truncates.
                let mut slot = key;
                slot.resize(sw, 0);
                buf.splice(at..at, slot);
                // Cannot truncate: the guard arm above bounds the length.
                lengths.insert(idx, key_len as u16);
            }
        }
    }

    /// Remove the key at slot `idx`, shifting later slots down, and return
    /// the removed key.
    ///
    /// # Panics
    /// Panics if `idx >= self.len()`.
    pub fn remove(&mut self, idx: usize) -> Vec<u8> {
        match self {
            KeyRep::Default(v) => v.remove(idx),
            KeyRep::Compact { buf, slot_width, lengths } => {
                let sw = *slot_width;
                let len = lengths[idx] as usize;
                let at = idx * sw;
                let out = buf[at..at + len].to_vec();
                buf.drain(at..at + sw);
                lengths.remove(idx);
                out
            }
        }
    }

    /// `INKeyRep.MaxKeySize.expandToDefaultRep` — mutate a Compact rep to a
    /// Default rep (one owned `Vec<u8>` per slot).  No-op when already
    /// Default.
    fn inflate_to_default(&mut self) {
        if let KeyRep::Compact { .. } = self {
            let keys: Vec<Vec<u8>> =
                (0..self.len()).map(|i| self.get(i).to_vec()).collect();
            *self = KeyRep::Default(keys);
        }
    }

    /// `INKeyRep.Default.compact(parent)` (INKeyRep.java) — if every key in a
    /// `Default` rep fits `compact_max_key_length`, pack them into a
    /// `MaxKeySize` (`Compact`) rep.  `compact_max_key_length <= 0` disables
    /// compaction.  No-op when already Compact or when the rep is empty.
    ///
    /// The effective threshold is capped at `u16::MAX`: slot lengths are
    /// stored as `u16`, so packing a longer key would silently truncate its
    /// recorded length and corrupt every later read of that slot.
    pub fn compact(&mut self, compact_max_key_length: i32) {
        // Negative thresholds fail the conversion; zero is checked below.
        let Ok(threshold) = usize::try_from(compact_max_key_length) else {
            return;
        };
        if threshold == 0 {
            return;
        }
        let limit = threshold.min(Self::MAX_COMPACT_KEY_LEN);

        let KeyRep::Default(keys) = self else {
            return; // already Compact
        };
        if keys.is_empty() {
            return;
        }
        let max_len = keys.iter().map(|k| k.len()).max().unwrap_or(0);
        if max_len > limit {
            return; // a key exceeds the threshold — stay Default
        }
        // A zero-width slot would make slot offsets degenerate, so all-empty
        // keys still get one byte per slot.
        let slot_width = max_len.max(1);
        let mut buf = vec![0u8; slot_width * keys.len()];
        let mut lengths = Vec::with_capacity(keys.len());
        for (i, k) in keys.iter().enumerate() {
            let off = i * slot_width;
            buf[off..off + k.len()].copy_from_slice(k);
            // Cannot truncate: `k.len() <= max_len <= limit <= u16::MAX`.
            lengths.push(k.len() as u16);
        }
        *self = KeyRep::Compact { buf, slot_width, lengths };
    }

    /// True when key-byte memory is accounted for inside this rep (Compact),
    /// vs per-slot `Vec` allocations (Default).
    /// `INKeyRep.accountsForKeyByteMemUsage`.
    #[inline]
    pub fn is_compact(&self) -> bool {
        matches!(self, KeyRep::Compact { .. })
    }

    /// Heap bytes of the rep itself (`INKeyRep.calculateMemorySize` +
    /// key-byte accounting), measured by allocated capacity.  For Default
    /// this is the outer vector's slot storage plus each key's heap
    /// allocation; for Compact it is the single buffer plus the lengths
    /// vector.
    pub fn memory_size(&self) -> usize {
        use std::mem::size_of;
        match self {
            KeyRep::Default(v) => {
                v.capacity() * size_of::<Vec<u8>>()
                    + v.iter().map(|k| k.capacity()).sum::<usize>()
            }
            KeyRep::Compact { buf, lengths, .. } => {
                buf.capacity() + lengths.capacity() * size_of::<u16>()
            }
        }
    }
}

impl Default for KeyRep {
    fn default() -> Self {
        KeyRep::new()
    }
}
991
/// Lightweight upper-IN representation used by the tree traversal layer.
///
/// Counterpart of JE's `IN`: carries the dirty flag (IN_DIRTY_BIT), the LRU
/// generation counter, and a weak back-pointer to the parent so that
/// dirty state can be propagated upward.  Per-slot keys live in `entries`;
/// per-slot LSNs and cached child pointers live in the node-level
/// `lsn_rep` and `targets` reps respectively.
#[derive(Debug)]
pub struct InNodeStub {
    /// Node ID.
    pub node_id: u64,
    /// Level in tree.
    pub level: i32,
    /// Child entries, one key per slot.  The slot's LSN is held in
    /// `lsn_rep` and its cached child (if any) in `targets`.
    pub entries: Vec<InEntry>,
    /// T-4: per-node resident-child-pointer representation.
    ///
    /// `IN.entryTargets` (`INTargetRep`).  The cached child pointer is no
    /// longer a per-`InEntry` `Option<Arc>` (which cost a pointer-sized slot
    /// even when no child was resident); it lives here as a compact
    /// node-level rep that starts `None` (0 child-pointer bytes — most upper
    /// INs have no resident children), grows to `Sparse` for a few cached
    /// children, and inflates to `Default` (the full parallel array) once
    /// many children are resident.  See `INTargetRep.{None,Sparse,Default}`.
    pub targets: TargetRep,
    /// Dirty flag — set whenever this node is modified.
    /// `IN.dirty` (IN_DIRTY_BIT).
    pub dirty: bool,
    /// LRU generation counter for the evictor.
    /// `IN.generation`.
    pub generation: u64,
    /// Weak back-pointer to parent IN.
    /// Enables dirty-propagation and latch-coupling validation.
    /// `IN.parent` reference used during splits and logging.
    pub parent: Option<Weak<RwLock<TreeNode>>>,
    /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`).  The per-slot
    /// `lsn` (8 bytes) that used to live in `InEntry` is hoisted here as a
    /// `base_file_number`-relative 4-byte-per-slot rep, falling back to a
    /// `u64`-per-slot `Long` rep only when a node's LSN range exceeds the
    /// compact form.  Access via `get_lsn(slot)` / `set_lsn(slot, lsn)`.
    pub lsn_rep: LsnRep,
}
1032
/// Entry in an IN node.
///
/// After the T-3/T-4 hoists described below, the key is the only per-slot
/// datum still stored in the entry itself.
///
/// T-4: the resident-child pointer that used to live here (`Option<Arc>`) was
/// hoisted to the node-level `InNodeStub.targets` (`INTargetRep`); access the
/// child for slot `i` via `InNodeStub::get_child(i)` / `set_child` / etc.
///
/// T-3: the per-slot `lsn` (8 bytes) that used to live here was hoisted to the
/// node-level `InNodeStub.lsn_rep` (`IN.entryLsnByteArray`); access the LSN for
/// slot `i` via `InNodeStub::get_lsn(i)` / `set_lsn(i, lsn)`.
#[derive(Debug, Clone)]
pub struct InEntry {
    /// Key for this entry.
    pub key: Vec<u8>,
}
1047
/// Lightweight BIN representation used by the tree traversal layer.
///
/// Counterpart of JE's `BIN` (which extends `IN`): carries the dirty flag,
/// LRU generation counter, and a weak back-pointer to the parent IN.
///
/// # Key Prefix Compression
///
/// BINs support key prefix compression.  When `key_prefix` is non-empty,
/// every slot of the node-level `keys` rep stores only the *suffix* — the
/// bytes after stripping the common leading bytes.  The full key is
/// reconstructed by prepending `key_prefix` to the stored suffix.
///
/// This is transparent to callers through the `get_full_key` / `find_entry`
/// helpers on `BinStub`.  `recompute_key_prefix` recalculates the prefix from
/// scratch and re-encodes the stored suffixes.
#[derive(Debug)]
pub struct BinStub {
    /// Node ID.
    pub node_id: u64,
    /// Level (always BIN_LEVEL).
    pub level: i32,
    /// Per-slot state (embedded data, known-deleted / dirty flags,
    /// expiration).  The slot's key is NOT stored here: it lives in `keys`
    /// (suffix-only when `key_prefix` is non-empty), and its LSN in
    /// `lsn_rep`.
    pub entries: Vec<BinEntry>,
    /// Common prefix shared by every key in this BIN.
    /// Empty slice means no prefix compression is active.
    /// `IN.keyPrefix`.
    pub key_prefix: Vec<u8>,
    /// Dirty flag — set whenever this BIN is modified.
    /// `IN.dirty` (IN_DIRTY_BIT).
    pub dirty: bool,
    /// BIN-delta flag — true when this BIN contains only dirty (delta) slots
    /// rather than a complete set of entries.
    /// `IN.IN_DELTA_BIT` (the IN_DELTA_BIT flag inside `flags`).
    pub is_delta: bool,
    /// LSN at which this BIN was last logged as a full (non-delta) BIN.
    ///
    /// Used by the checkpoint path to construct `BINDeltaLogEntry.prev_full_lsn`
    /// and to compare against `prev_delta_lsn` when deciding whether to write
    /// a delta or a full BIN.
    ///
    /// `BIN.lastFullLsn`.
    pub last_full_lsn: Lsn,
    /// LSN at which this BIN was last logged as a BIN-delta.
    ///
    /// Written as `prev_delta_lsn` into the next `BINDeltaLogEntry` so the
    /// cleaner's utilization tracker can mark the superseded delta obsolete.
    /// Reset to `NULL_LSN` whenever a full BIN is written.
    ///
    /// `BIN.lastDeltaVersion` / `BIN.getLastDeltaLsn()`.
    pub last_delta_lsn: Lsn,
    /// LRU generation counter for the evictor.
    /// `IN.generation`.
    pub generation: u64,
    /// Weak back-pointer to parent IN.
    /// Enables dirty-propagation and latch-coupling validation.
    pub parent: Option<Weak<RwLock<TreeNode>>>,
    /// If true, `BinEntry.expiration_time` values in this BIN are packed hours
    /// since epoch; if false, they are packed seconds since epoch.
    ///
    /// Default: `true` (hours, matching TTL resolution).
    ///
    /// `BIN.expirationInHours`.
    pub expiration_in_hours: bool,
    /// Number of cursors currently positioned on this BIN.
    ///
    /// The evictor skips BINs with a non-zero cursor count to avoid evicting
    /// a node that a cursor is actively traversing.  CursorImpl increments
    /// this when positioning on a BIN and decrements it on reposition/close.
    /// `strip_lns` likewise refuses to drop data while this is positive.
    ///
    /// `IN.cursorSet.size()` used by `Evictor.selectIN()`.
    pub cursor_count: i32,
    /// When true, the NEXT log of this BIN must be a full BIN, not a delta.
    ///
    /// Set after a dirty slot is removed (a delta would silently lose that
    /// removal) and cleared after a full BIN is written.  This is the
    /// delta-chain bound: it forces a periodic full BIN so a delta never
    /// references stale state.
    ///
    /// `IN.prohibitNextDelta` / `IN.setProhibitNextDelta` (IN.java:5013) /
    /// `IN.getProhibitNextDelta`.
    pub prohibit_next_delta: bool,
    /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`).  The per-slot
    /// `lsn` (8 bytes) that used to live in `BinEntry` is hoisted here as a
    /// `base_file_number`-relative 4-byte-per-slot rep.  Access via
    /// `get_lsn(slot)` / `set_lsn(slot, lsn)`.
    pub lsn_rep: LsnRep,
    /// T-2: per-node key array (`INKeyRep.{Default,MaxKeySize}`).  The per-slot
    /// `key` (`Vec<u8>`, 24-byte header + heap alloc) that used to live in
    /// `BinEntry` is hoisted here.  Stores the post-prefix SUFFIX (key
    /// prefixing strips the common prefix first).  Packs into one fixed-width
    /// buffer (`Compact`) when every suffix is `<= compact_max_key_length`,
    /// else one `Vec<u8>` per slot (`Default`).  `keys.len()` is kept in lock
    /// step with `entries.len()`.  Access via `get_key(slot)` /
    /// `get_full_key(slot)`.
    pub keys: KeyRep,
    /// T-5: the node's compact-key threshold (`IN.getCompactMaxKeyLength`),
    /// copied from the owning `Tree` at construction so `apply_new_prefix` can
    /// decide whether the suffixes now fit `MaxKeySize`.  Default 16.
    pub compact_max_key_length: i32,
}
1150
/// Entry in a BIN node.
///
/// Holds only the per-slot state that has not been hoisted to a node-level
/// rep: the optional data, the two slot flags, and the expiration time.
///
/// T-2: the per-slot `key` that used to live here was hoisted to the
/// node-level `BinStub.keys` (`INKeyRep`); access the key for slot `i` via
/// `BinStub::get_key(i)` / `get_full_key(i)`.
///
/// T-3: the per-slot `lsn` (8 bytes) that used to live here was hoisted to the
/// node-level `BinStub.lsn_rep` (`IN.entryLsnByteArray`); access the LSN for
/// slot `i` via `BinStub::get_lsn(i)` / `set_lsn(i, lsn)`.
#[derive(Debug, Clone)]
pub struct BinEntry {
    /// Optional embedded data (for small records) or cached LN.
    /// `None` after the data has been dropped by `BinStub::strip_lns`.
    pub data: Option<Vec<u8>>,
    /// True when this slot has been marked known-deleted (analogous to the
    /// KNOWN_DELETED_BIT in `IN.entryStates`).  The slot is eligible for
    /// removal by `compress_bin()`.
    pub known_deleted: bool,
    /// True when this slot has been modified since the last full BIN log write.
    ///
    /// `IN.entryStates[i] & IN_DIRTY_BIT`.  Used by the checkpoint
    /// path to decide whether to write a BIN-delta (few dirty slots) or a
    /// full BIN (many dirty slots).
    pub dirty: bool,
    /// Packed expiration time (0 = no expiration).
    ///
    /// When the owning `BinStub.expiration_in_hours` is true, this value is
    /// hours since Unix epoch; otherwise it is seconds since Unix epoch.
    ///
    /// `IN.entryExpiration`.
    pub expiration_time: u32,
}
1178
1179impl InNodeStub {
1180    /// `IN.getTarget(idx)` — the resident child cached for slot `idx`, cloned
1181    /// (a strong `Arc`), or `None` if the child is not cached.  Routes through
1182    /// the node-level `INTargetRep` (T-4).
1183    #[inline]
1184    pub fn get_child(&self, idx: usize) -> Option<ChildArc> {
1185        self.targets.get(idx).cloned()
1186    }
1187
1188    /// Borrow the resident child for slot `idx` without cloning.
1189    #[inline]
1190    pub fn child_ref(&self, idx: usize) -> Option<&ChildArc> {
1191        self.targets.get(idx)
1192    }
1193
1194    /// True if slot `idx` has no resident (cached) child.
1195    /// `IN.getTarget(idx) == null`.
1196    #[inline]
1197    pub fn child_is_none(&self, idx: usize) -> bool {
1198        self.targets.get(idx).is_none()
1199    }
1200
1201    /// `IN.setTarget(idx, node)` — set (or clear) the cached child for slot
1202    /// `idx`, mutating the `INTargetRep` upward as needed.
1203    #[inline]
1204    pub fn set_child(&mut self, idx: usize, node: Option<ChildArc>) {
1205        self.targets.set(idx, node);
1206    }
1207
1208    /// `IN.detachNode` helper — remove and return the cached child for slot
1209    /// `idx`, leaving the slot's key/LSN intact for re-fetch.
1210    #[inline]
1211    pub fn take_child(&mut self, idx: usize) -> Option<ChildArc> {
1212        self.targets.take(idx)
1213    }
1214
1215    /// `IN.getLsn(idx)` (IN.java:1752) — the LSN of slot `idx` via the
1216    /// node-level packed `LsnRep` (T-3).
1217    #[inline]
1218    pub fn get_lsn(&self, idx: usize) -> Lsn {
1219        self.lsn_rep.get(idx)
1220    }
1221
1222    /// `IN.setLsn(idx, lsn)` (IN.java:1773) — set the LSN of slot `idx` via
1223    /// the node-level packed `LsnRep` (T-3).
1224    #[inline]
1225    pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
1226        let n = self.entries.len();
1227        self.lsn_rep.set(idx, lsn, n);
1228    }
1229
1230    /// Insert an entry at `idx`, shifting the child mapping to stay aligned
1231    /// (`INArrayRep.copy`), then set the new slot's cached child.  Mirrors the
1232    /// old `entries.insert(idx, InEntry{ child: ..})` in one call.
1233    pub fn insert_entry(
1234        &mut self,
1235        idx: usize,
1236        key: Vec<u8>,
1237        lsn: Lsn,
1238        child: Option<ChildArc>,
1239    ) {
1240        self.entries.insert(idx, InEntry { key });
1241        let n = self.entries.len();
1242        self.lsn_rep.insert_shift(idx, n);
1243        self.lsn_rep.set(idx, lsn, n);
1244        self.targets.insert_shift(idx);
1245        if child.is_some() {
1246            self.targets.set(idx, child);
1247        }
1248    }
1249
1250    /// Remove the entry at `idx`, shifting the child mapping to stay aligned
1251    /// (`INArrayRep.copy`).  Returns the removed `InEntry` (key).
1252    pub fn remove_entry(&mut self, idx: usize) -> InEntry {
1253        let e = self.entries.remove(idx);
1254        self.lsn_rep.remove_shift(idx);
1255        self.targets.remove_shift(idx);
1256        e
1257    }
1258
1259    /// All resident children (cloned `Arc`s), in unspecified order.
1260    /// Replaces `entries.iter().filter_map(|e| e.child.clone())`.
1261    pub fn resident_children(&self) -> Vec<ChildArc> {
1262        self.targets.iter_children().collect()
1263    }
1264
1265    /// `(slot_index, child)` of the first resident child, if any.
1266    pub fn first_resident_child(&self) -> Option<(usize, ChildArc)> {
1267        (0..self.entries.len())
1268            .find_map(|i| self.targets.get(i).map(|c| (i, c.clone())))
1269    }
1270}
1271
1272impl BinStub {
    /// `IN.getLsn(idx)` (IN.java:1752) — the LSN of slot `idx` via the
    /// node-level packed `LsnRep` (T-3).
    ///
    /// Pure delegation to `LsnRep::get`; how an out-of-range `idx` is
    /// handled is decided there, not here.
    #[inline]
    pub fn get_lsn(&self, idx: usize) -> Lsn {
        self.lsn_rep.get(idx)
    }
1279
1280    /// `IN.setLsn(idx, lsn)` (IN.java:1773) — set the LSN of slot `idx` via
1281    /// the node-level packed `LsnRep` (T-3).
1282    #[inline]
1283    pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
1284        let n = self.entries.len();
1285        self.lsn_rep.set(idx, lsn, n);
1286    }
1287
1288    /// TREE-F1: the single user-facing liveness predicate for a BIN slot.
1289    ///
1290    /// A slot is LIVE for reads/scans iff it is neither `known_deleted` nor
1291    /// TTL-expired.  This mirrors the two ways JE makes a slot read as ABSENT:
1292    ///   * `IN.findEntry` (IN.java:3197) returns -1 for a `known_deleted`
1293    ///     exact match;
1294    ///   * `CursorImpl.isProbablyExpired` / `lockAndGetCurrent`
1295    ///     (CursorImpl.java:2062-2064) skip `isEntryKnownDeleted` (and
1296    ///     expired) slots while stepping.
1297    ///
1298    /// KD slots legitimately exist in live BINs during BIN-delta
1299    /// reconstitution until the compressor reclaims them; the maintenance
1300    /// paths (compressor / recovery undo) iterate them on purpose and do NOT
1301    /// use this predicate.
1302    #[inline]
1303    pub fn slot_is_live(&self, idx: usize) -> bool {
1304        match self.entries.get(idx) {
1305            Some(e) => {
1306                !(e.known_deleted
1307                    || (e.expiration_time != 0
1308                        && noxu_util::ttl::is_expired(
1309                            e.expiration_time,
1310                            self.expiration_in_hours,
1311                        )))
1312            }
1313            None => false,
1314        }
1315    }
1316
1317    // ========================================================================
1318    // Key prefix compression helpers
1319    // IN.computeKeyPrefix / IN.recalcSuffixes / IN.getKey
1320    // ========================================================================
1321
1322    /// Strips embedded LN data from non-dirty slots, freeing the heap
1323    /// allocations of the per-slot value bytes while keeping the slot keys
1324    /// and LSNs addressable.  Used by the evictor's PartialEvict path: a
1325    /// hot BIN is kept in cache so its descent path stays warm, but the LN
1326    /// data is dropped to make room for hotter content.  Subsequent reads
1327    /// re-fetch the data from the log via the slot LSN.
1328    ///
1329    /// Skips slots that are still dirty (their data has not been written
1330    /// to the log yet, so dropping the in-memory copy would lose the
1331    /// update).  Returns the number of bytes freed (sum of the lengths
1332    /// of the dropped `Vec<u8>` data fields).
1333    ///
1334    /// Returns 0 if the BIN has any open cursors (the cursor may be
1335    /// reading the data right now).
1336    pub fn strip_lns(&mut self) -> usize {
1337        if self.cursor_count > 0 {
1338            return 0;
1339        }
1340        let mut freed = 0usize;
1341        for idx in 0..self.entries.len() {
1342            // JE BIN.evictLNs / LN.isEvictable (LN.java:263 returns true): an
1343            // LN's in-memory value can be stripped whenever it is recoverable
1344            // from the log — i.e. the slot has a valid (logged) LSN — REGARDLESS
1345            // of the dirty bit.  The dirty bit governs whether the BIN's
1346            // *structure* needs re-logging at the next checkpoint (BIN-delta vs
1347            // full BIN), NOT whether the LN *value* is durable: a transactional
1348            // commit logs the LN, so the slot's LSN points at the durable copy
1349            // even while the slot is still dirty.  Gating the strip on `!dirty`
1350            // (the previous behaviour) meant a freshly-written, not-yet-
1351            // checkpointed record — the common case under a write/recently-read
1352            // workload — could never be stripped, so eviction reclaimed almost
1353            // nothing under pressure (EVICTOR-RECLAIM-1).  A slot with a NULL/
1354            // transient LSN (a deferred-write LN never logged) is NOT
1355            // strippable — its only copy is the in-memory value.
1356            if self.get_lsn(idx) == NULL_LSN {
1357                continue;
1358            }
1359            if let Some(data) = self.entries[idx].data.take() {
1360                freed = freed.saturating_add(data.len());
1361            }
1362        }
1363        freed
1364    }
1365
1366    /// Reconstruct the full key for slot `idx` by prepending the BIN's
1367    /// current prefix to the stored suffix.
1368    ///
1369    /// `IN.getKey(int idx)`.
1370    pub fn get_full_key(&self, idx: usize) -> Option<Vec<u8>> {
1371        if idx >= self.keys.len() {
1372            return None;
1373        }
1374        let suffix = self.keys.get(idx); // T-2
1375        if self.key_prefix.is_empty() {
1376            Some(suffix.to_vec())
1377        } else {
1378            let mut full =
1379                Vec::with_capacity(self.key_prefix.len() + suffix.len());
1380            full.extend_from_slice(&self.key_prefix);
1381            full.extend_from_slice(suffix);
1382            Some(full)
1383        }
1384    }
1385
    /// Borrow the stored (post-prefix) suffix at slot `idx` (`INKeyRep.get`).
    ///
    /// This is the raw stored form; use `get_full_key` when the BIN prefix
    /// must be prepended.
    ///
    /// # Panics
    /// Panics if `idx` is out of range, because `KeyRep::get` indexes
    /// directly.
    #[inline]
    pub fn get_key(&self, idx: usize) -> &[u8] {
        self.keys.get(idx)
    }
1391
1392    /// T-2: insert a new slot at `idx` keeping the parallel `entries`, `keys`,
1393    /// and `lsn_rep` arrays in lock step.  `suffix` is the post-prefix key.
1394    fn insert_slot(
1395        &mut self,
1396        idx: usize,
1397        suffix: Vec<u8>,
1398        lsn: Lsn,
1399        data: Option<Vec<u8>>,
1400    ) {
1401        self.entries.insert(
1402            idx,
1403            BinEntry {
1404                data,
1405                known_deleted: false,
1406                dirty: true,
1407                expiration_time: 0,
1408            },
1409        );
1410        self.keys.insert(idx, suffix); // T-2
1411        let n = self.entries.len();
1412        self.lsn_rep.insert_shift(idx, n); // T-3
1413        self.lsn_rep.set(idx, lsn, n);
1414    }
1415
1416    /// Decompress a stored suffix back to a full key.
1417    ///
1418    /// `IN.getKey` used from outside: prepend `key_prefix` to
1419    /// `suffix`.  If `key_prefix` is empty the suffix *is* the full key.
1420    pub fn decompress_key(&self, suffix: &[u8]) -> Vec<u8> {
1421        if self.key_prefix.is_empty() {
1422            suffix.to_vec()
1423        } else {
1424            let mut full =
1425                Vec::with_capacity(self.key_prefix.len() + suffix.len());
1426            full.extend_from_slice(&self.key_prefix);
1427            full.extend_from_slice(suffix);
1428            full
1429        }
1430    }
1431
1432    /// Strip the current prefix from a full key to obtain the stored suffix.
1433    ///
1434    /// `IN.computeKeySuffix(byte[] prefix, byte[] key)`.
1435    ///
1436    /// # Panics
1437    /// Panics (debug only) if `full_key` does not start with `key_prefix`.
1438    pub fn compress_key(&self, full_key: &[u8]) -> Vec<u8> {
1439        let plen = self.key_prefix.len();
1440        if plen == 0 {
1441            full_key.to_vec()
1442        } else {
1443            debug_assert!(
1444                full_key.starts_with(&self.key_prefix),
1445                "compress_key: key does not start with current prefix"
1446            );
1447            full_key[plen..].to_vec()
1448        }
1449    }
1450
1451    /// Compute the longest common prefix of all full keys currently in this
1452    /// BIN, optionally excluding the entry at `exclude_idx` (used during
1453    /// insertions to ignore the slot that is about to be replaced).
1454    ///
1455    /// Returns an empty `Vec` if the BIN has fewer than 2 entries or if the
1456    /// keys share no common leading bytes.
1457    ///
1458    /// `IN.computeKeyPrefix(int excludeIdx)`.
1459    pub fn compute_key_prefix(&self, exclude_idx: Option<usize>) -> Vec<u8> {
1460        // Need at least 2 entries to find a common prefix.
1461        let n = self.keys.len();
1462        if n < 2 {
1463            return Vec::new();
1464        }
1465
1466        // Pick the first non-excluded index as the seed.
1467        let first_idx = match exclude_idx {
1468            Some(0) => 1,
1469            _ => 0,
1470        };
1471
1472        // The current prefix_len is taken from the seed full key.
1473        let seed_full = match self.get_full_key(first_idx) {
1474            Some(k) => k,
1475            None => return Vec::new(),
1476        };
1477        let mut prefix_len = seed_full.len();
1478
1479        // Compare every other non-excluded entry against the running prefix.
1480        // Iterate all entries (byteOrdered disabled in too).
1481        for i in (first_idx + 1)..n {
1482            if let Some(ex) = exclude_idx
1483                && i == ex
1484            {
1485                continue;
1486            }
1487            let full_key = match self.get_full_key(i) {
1488                Some(k) => k,
1489                None => continue,
1490            };
1491            let new_len =
1492                get_key_prefix_length(&seed_full[..prefix_len], &full_key);
1493            if new_len < prefix_len {
1494                prefix_len = new_len;
1495            }
1496            if prefix_len == 0 {
1497                return Vec::new();
1498            }
1499        }
1500
1501        seed_full[..prefix_len].to_vec()
1502    }
1503
1504    /// Recompute the key prefix from scratch and re-encode every stored suffix.
1505    ///
1506    /// Call this after bulk inserts, splits, or merges.
1507    ///
1508    /// `IN.recalcKeyPrefix()` → `IN.recalcSuffixes(newPrefix, …)`.
1509    pub fn recompute_key_prefix(&mut self) {
1510        let new_prefix = self.compute_key_prefix(None);
1511        self.apply_new_prefix(new_prefix);
1512    }
1513
1514    /// Apply `new_prefix` as the BIN's key prefix, re-encoding all stored
1515    /// suffixes from the old prefix into the new one.
1516    ///
1517    /// This is the Rust.
1518    fn apply_new_prefix(&mut self, new_prefix: Vec<u8>) {
1519        // Reconstruct all full keys (using old prefix), then re-encode with
1520        // the new prefix.
1521        let full_keys: Vec<Vec<u8>> = (0..self.keys.len())
1522            .map(|i| self.get_full_key(i).unwrap_or_default())
1523            .collect();
1524
1525        self.key_prefix = new_prefix;
1526
1527        // T-2: re-encode every suffix into the key rep, then re-attempt
1528        // compaction (a smaller prefix may make all suffixes fit MaxKeySize).
1529        for (i, full_key) in full_keys.into_iter().enumerate() {
1530            let suffix = self.compress_key(&full_key);
1531            self.keys.set(i, suffix);
1532        }
1533        self.keys.compact(self.compact_max_key_length);
1534    }
1535
1536    /// Binary-search this BIN for `full_key` (a full, uncompressed key).
1537    ///
1538    /// The stored suffixes are compared after stripping the current prefix
1539    /// from `full_key`, so the search is done entirely in suffix-space — no
1540    /// heap allocation needed in the happy path.
1541    ///
1542    /// Returns `(idx, exact)` where:
1543    /// - `idx` is the slot index (or insertion point when `exact == false`).
1544    /// - `exact` is `true` when an exact match was found.
1545    ///
1546    /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
1547    pub fn find_entry_compressed(&self, full_key: &[u8]) -> (usize, bool) {
1548        let plen = self.key_prefix.len();
1549        // Check that the key shares the current prefix; if not it cannot be
1550        // present and we return the appropriate insertion point.
1551        if plen > 0
1552            && (full_key.len() < plen
1553                || &full_key[..plen] != self.key_prefix.as_slice())
1554        {
1555            // The key does not share the current prefix.
1556            // Determine insertion point using full-key comparison.
1557            let pos = self.key_partition_point(|s| {
1558                self.decompress_key(s).as_slice() < full_key
1559            });
1560            return (pos, false);
1561        }
1562        let suffix = &full_key[plen..];
1563        // T-2: binary search over the node-level key rep (suffix space).
1564        match self.key_binary_search(suffix) {
1565            Ok(idx) => (idx, true),
1566            Err(idx) => (idx, false),
1567        }
1568    }
1569
1570    /// Binary search the key rep for `suffix` (suffix space, unsigned bytes).
1571    /// Mirrors `Vec::binary_search_by(|e| e.key.cmp(suffix))` over the
1572    /// node-level `KeyRep` (T-2).
1573    #[inline]
1574    fn key_binary_search(&self, suffix: &[u8]) -> Result<usize, usize> {
1575        let mut lo = 0usize;
1576        let mut hi = self.keys.len();
1577        while lo < hi {
1578            let mid = lo + (hi - lo) / 2;
1579            match self.keys.get(mid).cmp(suffix) {
1580                std::cmp::Ordering::Less => lo = mid + 1,
1581                std::cmp::Ordering::Greater => hi = mid,
1582                std::cmp::Ordering::Equal => return Ok(mid),
1583            }
1584        }
1585        Err(lo)
1586    }
1587
1588    /// `slice::partition_point` over the node-level key rep suffixes (T-2):
1589    /// the index of the first slot for which `pred(suffix)` is false.
1590    #[inline]
1591    fn key_partition_point(
1592        &self,
1593        mut pred: impl FnMut(&[u8]) -> bool,
1594    ) -> usize {
1595        let mut lo = 0usize;
1596        let mut hi = self.keys.len();
1597        while lo < hi {
1598            let mid = lo + (hi - lo) / 2;
1599            if pred(self.keys.get(mid)) {
1600                lo = mid + 1;
1601            } else {
1602                hi = mid;
1603            }
1604        }
1605        lo
1606    }
1607
1608    /// Insert or update a full (uncompressed) key in this BIN.
1609    ///
1610    /// After insertion the key prefix is recomputed; if the prefix changes all
1611    /// stored suffixes are re-encoded.
1612    ///
1613    /// Returns `(slot_index, is_new_insert)`.
1614    ///
1615    /// `IN.setKey` / BIN insert path.
1616    pub fn insert_with_prefix(
1617        &mut self,
1618        full_key: Vec<u8>,
1619        lsn: Lsn,
1620        data: Option<Vec<u8>>,
1621    ) -> (usize, bool) {
1622        // Is the current prefix still compatible with this key?
1623        let plen = self.key_prefix.len();
1624        let new_len = if plen > 0 {
1625            get_key_prefix_length(&self.key_prefix, &full_key)
1626        } else {
1627            0
1628        };
1629
1630        // If the new key shrinks the prefix we must re-encode everything first.
1631        if plen > 0 && new_len < plen {
1632            // Compute new prefix considering the incoming key and
1633            // all existing full keys.  We pass `None` for exclude_idx because
1634            // the slot for this key does not yet exist.
1635            let mut candidate = self.compute_key_prefix(None);
1636            // Also constrain by the new key itself.
1637            if !candidate.is_empty() {
1638                let cl = get_key_prefix_length(&candidate, &full_key);
1639                candidate.truncate(cl);
1640            } else {
1641                // No existing prefix; try to build one from the new key
1642                // against the existing full keys.
1643                if !self.entries.is_empty()
1644                    && let Some(first_full) = self.get_full_key(0)
1645                {
1646                    candidate = create_key_prefix(&first_full, &full_key)
1647                        .unwrap_or_default();
1648                    for i in 1..self.entries.len() {
1649                        if candidate.is_empty() {
1650                            break;
1651                        }
1652                        if let Some(fk) = self.get_full_key(i) {
1653                            let l = get_key_prefix_length(&candidate, &fk);
1654                            candidate.truncate(l);
1655                        }
1656                    }
1657                }
1658            }
1659            self.apply_new_prefix(candidate);
1660        }
1661
1662        // Compress the new key under the (possibly updated) prefix.
1663        let suffix = self.compress_key(&full_key);
1664
1665        match self.key_binary_search(&suffix) {
1666            Ok(idx) => {
1667                // Key exists — update in place.
1668                self.set_lsn(idx, lsn); // T-3
1669                self.entries[idx].data = data;
1670                // Mark slot dirty: this slot changed since the last full BIN log.
1671                // `IN.setDirtyEntry(idx)`.
1672                self.entries[idx].dirty = true;
1673                (idx, false)
1674            }
1675            Err(idx) => {
1676                // New key — insert in sorted position.
1677                // New slots start dirty: they have never been logged in any BIN.
1678                // `IN.setDirtyEntry(idx)` called after `insertEntry`.
1679                self.insert_slot(idx, suffix, lsn, data);
1680                // After insertion, if there is no prefix yet, try to establish one.
1681                if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1682                    self.recompute_key_prefix();
1683                }
1684                (idx, true)
1685            }
1686        }
1687    }
1688
1689    /// Slice-based variant of [`BinStub::insert_with_prefix`] for the recovery redo path.
1690    ///
1691    /// Accepts `key` and `data` as `&[u8]` slices instead of owned `Vec<u8>`,
1692    /// eliminating the intermediate `Vec<u8>` that `redo_ln` would otherwise
1693    /// allocate before crossing the BIN boundary.  The compressed suffix and
1694    /// the data bytes are each copied into the `BinEntry` exactly once.
1695    ///
1696    /// Semantics are identical to `insert_with_prefix`:
1697    /// - Updates the slot in place when the key already exists.
1698    /// - Inserts a new sorted entry when absent, recomputing the key prefix.
1699    ///
1700    /// Wave 11-K optimisation (Fix 1).
1701    pub fn insert_with_prefix_slice(
1702        &mut self,
1703        full_key: &[u8],
1704        lsn: Lsn,
1705        data: Option<&[u8]>,
1706    ) -> (usize, bool) {
1707        let plen = self.key_prefix.len();
1708        let new_len = if plen > 0 {
1709            get_key_prefix_length(&self.key_prefix, full_key)
1710        } else {
1711            0
1712        };
1713
1714        if plen > 0 && new_len < plen {
1715            let mut candidate = self.compute_key_prefix(None);
1716            if !candidate.is_empty() {
1717                let cl = get_key_prefix_length(&candidate, full_key);
1718                candidate.truncate(cl);
1719            } else {
1720                if !self.entries.is_empty()
1721                    && let Some(first_full) = self.get_full_key(0)
1722                {
1723                    candidate = create_key_prefix(&first_full, full_key)
1724                        .unwrap_or_default();
1725                    for i in 1..self.entries.len() {
1726                        if candidate.is_empty() {
1727                            break;
1728                        }
1729                        if let Some(fk) = self.get_full_key(i) {
1730                            let l = get_key_prefix_length(&candidate, &fk);
1731                            candidate.truncate(l);
1732                        }
1733                    }
1734                }
1735            }
1736            self.apply_new_prefix(candidate);
1737        }
1738
1739        let suffix = self.compress_key(full_key);
1740
1741        match self.key_binary_search(&suffix) {
1742            Ok(idx) => {
1743                self.set_lsn(idx, lsn); // T-3
1744                self.entries[idx].data = data.map(|d| d.to_vec());
1745                self.entries[idx].dirty = true;
1746                (idx, false)
1747            }
1748            Err(idx) => {
1749                self.insert_slot(idx, suffix, lsn, data.map(|d| d.to_vec()));
1750                if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1751                    self.recompute_key_prefix();
1752                }
1753                (idx, true)
1754            }
1755        }
1756    }
1757
1758    /// Returns the number of slots that are marked dirty.
1759    ///
1760    /// `BIN.getNumDirtyEntries()`.
1761    pub fn dirty_count(&self) -> usize {
1762        self.entries.iter().filter(|e| e.dirty).count()
1763    }
1764
1765    /// Decide whether to log this BIN as a delta (true) or a full BIN (false).
1766    ///
1767    /// Faithful port of JE `BIN.shouldLogDelta()` (BIN.java:1892).  The
1768    /// decision is COUNT-based (number of would-be delta slots vs a percent of
1769    /// `nEntries`), NOT a dirty-fraction-vs-hardcoded-0.25 heuristic:
1770    ///
1771    /// ```text
1772    /// if (isBINDelta()) { return true; }          // already a delta
1773    /// if (isDeltaProhibited()) return false;       // prohibit / no prior full
1774    /// numDeltas = getNDeltas();
1775    /// if (numDeltas <= 0) return false;            // empty delta is invalid
1776    /// deltaLimit = (getNEntries() * binDeltaPercent) / 100;  // INTEGER math
1777    /// return numDeltas <= deltaLimit;
1778    /// ```
1779    ///
1780    /// `numDeltas` (JE `getNDeltas`) is the count of slots that would appear in
1781    /// the delta — i.e. the dirty slots since the last full BIN — which here is
1782    /// `dirty_count()`.  `binDeltaPercent` is the CONFIGURABLE `TREE_BIN_DELTA`
1783    /// param (JE `DatabaseImpl.getBinDeltaPercent()`, default 25), threaded in
1784    /// by the checkpointer — NOT a hardcoded constant.
1785    ///
1786    /// `isDeltaProhibited()` (BIN.java:1867) is
1787    /// `getProhibitNextDelta() || isDeferredWriteMode() || lastFullLsn == NULL`.
1788    /// Deferred-write mode is not modelled in the runtime stub; the other two
1789    /// terms are.
1790    ///
1791    /// JE ref: `BIN.shouldLogDelta` (BIN.java:1892), `BIN.isDeltaProhibited`
1792    /// (BIN.java:1867).
1793    pub fn should_log_delta(&self, bin_delta_percent: i32) -> bool {
1794        // Already a delta: re-log as a delta.  JE asserts !prohibitNextDelta
1795        // and lastFullLsn != NULL here.
1796        if self.is_delta {
1797            return self.last_full_lsn != NULL_LSN && !self.prohibit_next_delta;
1798        }
1799
1800        // isDeltaProhibited(): cheapest checks first.
1801        if self.prohibit_next_delta || self.last_full_lsn == NULL_LSN {
1802            return false;
1803        }
1804
1805        // numDeltas = getNDeltas(): the dirty slots that would be in the delta.
1806        let num_deltas = self.dirty_count() as i32;
1807
1808        // A delta with zero items is not valid.
1809        if num_deltas <= 0 {
1810            return false;
1811        }
1812
1813        // Configured BinDeltaPercent limit — INTEGER math, exactly as JE.
1814        let delta_limit = (self.entries.len() as i32 * bin_delta_percent) / 100;
1815        num_deltas <= delta_limit
1816    }
1817
1818    /// Comparator-aware binary search: finds `full_key` using `cmp`.
1819    ///
1820    /// Unlike `find_entry_compressed` (which uses suffix-based lexicographic
1821    /// comparison), this decompresses each entry's key to its full form and
1822    /// applies the provided comparator — required for sorted-dup databases
1823    /// where lexicographic suffix comparison would give wrong results when
1824    /// different-length primary keys are in the same BIN.
1825    ///
1826    /// Returns `(idx, exact)`.  Does NOT do prefix compression.
1827    ///
1828    /// `IN.findEntry` with btreeComparator active.
1829    pub fn find_entry_cmp(
1830        &self,
1831        full_key: &[u8],
1832        cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1833    ) -> (usize, bool) {
1834        // Hot path: avoid per-comparison Vec<u8> allocation.
1835        // When key_prefix is empty the stored suffix IS the full key, so we
1836        // pass the suffix slice directly.  When prefix is non-empty we build a
1837        // temporary concatenation only once per comparison using a small
1838        // stack-local Vec that is dropped immediately after the call — this
1839        // still allocates but is limited to O(key_len) bytes per call and
1840        // avoids retaining any heap state between comparisons.
1841        if self.key_prefix.is_empty() {
1842            match self.key_binary_search_by(|s| cmp(s, full_key)) {
1843                Ok(idx) => (idx, true),
1844                Err(idx) => (idx, false),
1845            }
1846        } else {
1847            let prefix = self.key_prefix.as_slice();
1848            match self.key_binary_search_by(|s| {
1849                let mut fk = Vec::with_capacity(prefix.len() + s.len());
1850                fk.extend_from_slice(prefix);
1851                fk.extend_from_slice(s);
1852                cmp(&fk, full_key)
1853            }) {
1854                Ok(idx) => (idx, true),
1855                Err(idx) => (idx, false),
1856            }
1857        }
1858    }
1859
1860    /// Comparator-driven binary search over the node-level key rep (T-2).
1861    /// `cmp(stored_suffix)` returns how the stored slot compares to the
1862    /// search key.
1863    #[inline]
1864    fn key_binary_search_by(
1865        &self,
1866        mut cmp: impl FnMut(&[u8]) -> std::cmp::Ordering,
1867    ) -> Result<usize, usize> {
1868        let mut lo = 0usize;
1869        let mut hi = self.keys.len();
1870        while lo < hi {
1871            let mid = lo + (hi - lo) / 2;
1872            match cmp(self.keys.get(mid)) {
1873                std::cmp::Ordering::Less => lo = mid + 1,
1874                std::cmp::Ordering::Greater => hi = mid,
1875                std::cmp::Ordering::Equal => return Ok(mid),
1876            }
1877        }
1878        Err(lo)
1879    }
1880
1881    /// Returns the LSN of the slot matching `full_key`, if one exists.
1882    ///
1883    /// Used by the recovery LN-redo apply to enforce JE's currency check
1884    /// (`RecoveryManager.redo()` line ~2512): a logged LN is applied only
1885    /// when `logrecLsn > treeLsn`.  Returns `None` when the key is absent
1886    /// (always apply).  Uses the same lookup variant the matching insert
1887    /// path uses so the comparison is over the right slot.
1888    pub fn redo_slot_lsn(
1889        &self,
1890        full_key: &[u8],
1891        cmp: Option<&dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering>,
1892        key_prefixing: bool,
1893    ) -> Option<Lsn> {
1894        let (idx, found) = match cmp {
1895            Some(c) => self.find_entry_cmp(full_key, c),
1896            None if key_prefixing => self.find_entry_compressed(full_key),
1897            None => {
1898                // insert_raw path: full keys stored verbatim.
1899                match self.key_binary_search(full_key) {
1900                    Ok(idx) => (idx, true),
1901                    Err(idx) => (idx, false),
1902                }
1903            }
1904        };
1905        if found { Some(self.get_lsn(idx)) } else { None }
1906    }
1907
1908    /// Raw insert (no prefix compression) for databases with
1909    /// `key_prefixing = false`.
1910    ///
1911    /// JE `IN.computeKeyPrefix` returns `null` when
1912    /// `databaseImpl.getKeyPrefixing()` is `false`, so no prefix is ever
1913    /// set on those BINs.  Noxu was previously ignoring the flag and always
1914    /// calling `insert_with_prefix`; this method provides the faithful path.
1915    ///
1916    /// The key is stored verbatim (no suffix stripping). An existing
1917    /// `key_prefix` on the BIN is left untouched; callers must ensure it is
1918    /// empty (split_child already guarantees this for new BINs when
1919    /// `key_prefixing = false`).
1920    ///
1921    /// Returns `(slot_index, is_new_insert)`.
1922    ///
1923    /// Ref: `IN.java computeKeyPrefix` ~line 2456,
1924    ///      `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
1925    pub fn insert_raw(
1926        &mut self,
1927        full_key: Vec<u8>,
1928        lsn: Lsn,
1929        data: Option<Vec<u8>>,
1930    ) -> (usize, bool) {
1931        // Binary search on the stored (full) keys.
1932        // When key_prefix is empty entries store full keys directly; for
1933        // key_prefixing=false DBs the prefix is always empty.
1934        match self.key_binary_search(full_key.as_slice()) {
1935            Ok(idx) => {
1936                self.set_lsn(idx, lsn); // T-3
1937                self.entries[idx].data = data;
1938                self.entries[idx].dirty = true;
1939                (idx, false)
1940            }
1941            Err(idx) => {
1942                self.insert_slot(idx, full_key, lsn, data);
1943                (idx, true)
1944            }
1945        }
1946    }
1947
1948    /// Comparator-aware insert: inserts `full_key` into the BIN using `cmp`.
1949    ///
1950    /// Prefix compression is DISABLED: the key is stored as-is.  This is
1951    /// intentional for sorted-dup databases where the custom comparator
1952    /// requires full-key access at every comparison.
1953    ///
1954    /// Returns `(slot_index, is_new_insert)`.
1955    ///
1956    pub fn insert_cmp(
1957        &mut self,
1958        full_key: Vec<u8>,
1959        lsn: Lsn,
1960        data: Option<Vec<u8>>,
1961        cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1962    ) -> (usize, bool) {
1963        if self.key_prefix.is_empty() {
1964            match self.key_binary_search_by(|s| cmp(s, &full_key)) {
1965                Ok(idx) => {
1966                    self.set_lsn(idx, lsn); // T-3
1967                    self.entries[idx].data = data;
1968                    self.entries[idx].dirty = true;
1969                    (idx, false)
1970                }
1971                Err(idx) => {
1972                    self.insert_slot(idx, full_key, lsn, data);
1973                    (idx, true)
1974                }
1975            }
1976        } else {
1977            let prefix = self.key_prefix.clone();
1978            match self.key_binary_search_by(|s| {
1979                let mut fk = Vec::with_capacity(prefix.len() + s.len());
1980                fk.extend_from_slice(&prefix);
1981                fk.extend_from_slice(s);
1982                cmp(&fk, &full_key)
1983            }) {
1984                Ok(idx) => {
1985                    // Key exists — update in place.
1986                    self.set_lsn(idx, lsn); // T-3
1987                    self.entries[idx].data = data;
1988                    self.entries[idx].dirty = true;
1989                    (idx, false)
1990                }
1991                Err(idx) => {
1992                    // New key — insert at sorted position (no prefix compression).
1993                    self.insert_slot(idx, full_key, lsn, data);
1994                    (idx, true)
1995                }
1996            }
1997        }
1998    }
1999
2000    /// Comparator-aware delete: removes `full_key` from the BIN using `cmp`.
2001    ///
2002    /// Returns `true` if the entry was found and removed.
2003    pub fn delete_cmp(
2004        &mut self,
2005        full_key: &[u8],
2006        cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
2007    ) -> bool {
2008        let result = if self.key_prefix.is_empty() {
2009            self.key_binary_search_by(|s| cmp(s, full_key))
2010        } else {
2011            let prefix = self.key_prefix.clone();
2012            self.key_binary_search_by(|s| {
2013                let mut fk = Vec::with_capacity(prefix.len() + s.len());
2014                fk.extend_from_slice(&prefix);
2015                fk.extend_from_slice(s);
2016                cmp(&fk, full_key)
2017            })
2018        };
2019        match result {
2020            Ok(idx) => {
2021                self.entries.remove(idx);
2022                self.keys.remove(idx); // T-2
2023                self.lsn_rep.remove_shift(idx); // T-3
2024                self.dirty = true;
2025                true
2026            }
2027            Err(_) => false,
2028        }
2029    }
2030
2031    /// Serialise ALL entries (full BIN write).
2032    ///
2033    /// Format (per slot): key_len(u32BE) | key | lsn(u64BE) |
2034    ///   has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2035    ///
2036    /// Prepended by: node_id(u64BE) | num_entries(u32BE).
2037    ///
2038    /// `BIN.writeToLog()` (non-delta path).
2039    pub fn serialize_full(&self) -> Vec<u8> {
2040        let mut buf = Vec::new();
2041        buf.extend_from_slice(&self.node_id.to_be_bytes());
2042        buf.extend_from_slice(&(self.entries.len() as u32).to_be_bytes());
2043        for i in 0..self.entries.len() {
2044            let full_key = self.get_full_key(i).unwrap_or_default();
2045            buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2046            buf.extend_from_slice(&full_key);
2047            let lsn = self.get_lsn(i); // T-3
2048            let e = &self.entries[i];
2049            buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2050            if let Some(d) = &e.data {
2051                buf.push(1u8);
2052                buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2053                buf.extend_from_slice(d);
2054            } else {
2055                buf.push(0u8);
2056            }
2057            buf.push(e.known_deleted as u8);
2058        }
2059        buf
2060    }
2061
2062    /// Serialise only dirty slots (BIN-delta write).
2063    ///
2064    /// Format (per dirty slot): slot_idx(u32BE) | key_len(u32BE) | key |
2065    ///   lsn(u64BE) | has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2066    ///
2067    /// Prepended by: node_id(u64BE) | num_dirty(u32BE).
2068    ///
2069    /// `BIN.writeToLog()` (delta path).
2070    pub fn serialize_delta(&self) -> Vec<u8> {
2071        let dirty: Vec<usize> = (0..self.entries.len())
2072            .filter(|&i| self.entries[i].dirty)
2073            .collect();
2074        let mut buf = Vec::new();
2075        buf.extend_from_slice(&self.node_id.to_be_bytes());
2076        buf.extend_from_slice(&(dirty.len() as u32).to_be_bytes());
2077        for idx in dirty {
2078            buf.extend_from_slice(&(idx as u32).to_be_bytes());
2079            let full_key = self.get_full_key(idx).unwrap_or_default();
2080            buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2081            buf.extend_from_slice(&full_key);
2082            let lsn = self.get_lsn(idx); // T-3
2083            let e = &self.entries[idx];
2084            buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2085            if let Some(d) = &e.data {
2086                buf.push(1u8);
2087                buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2088                buf.extend_from_slice(d);
2089            } else {
2090                buf.push(0u8);
2091            }
2092            buf.push(e.known_deleted as u8);
2093        }
2094        buf
2095    }
2096
2097    /// Deserialise a full BIN from the bytes produced by `serialize_full()`.
2098    ///
2099    /// Returns a `BinStub` with all entries populated and all slots marked
2100    /// clean (they are already on disk at `last_full_lsn`).  Returns `None`
2101    /// if the byte slice is malformed.
2102    ///
2103    /// `INLogEntry.readEntry()` / `IN.readFromLog()` (non-delta).
2104    pub fn deserialize_full(bytes: &[u8]) -> Option<BinStub> {
2105        if bytes.len() < 12 {
2106            return None;
2107        }
2108        let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
2109        let num_entries =
2110            u32::from_be_bytes(bytes[8..12].try_into().ok()?) as usize;
2111        let mut pos = 12usize;
2112        let mut entries = Vec::with_capacity(num_entries);
2113        let mut lsns: Vec<Lsn> = Vec::with_capacity(num_entries);
2114        let mut keys: Vec<Vec<u8>> = Vec::with_capacity(num_entries); // T-2
2115        for _ in 0..num_entries {
2116            // key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len(u32BE) | data] | known_deleted(u8)
2117            if pos + 4 > bytes.len() {
2118                return None;
2119            }
2120            let key_len =
2121                u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2122                    as usize;
2123            pos += 4;
2124            if pos + key_len > bytes.len() {
2125                return None;
2126            }
2127            let key = bytes[pos..pos + key_len].to_vec();
2128            pos += key_len;
2129            if pos + 8 > bytes.len() {
2130                return None;
2131            }
2132            let lsn = Lsn::from_u64(u64::from_be_bytes(
2133                bytes[pos..pos + 8].try_into().ok()?,
2134            ));
2135            pos += 8;
2136            if pos + 1 > bytes.len() {
2137                return None;
2138            }
2139            let has_data = bytes[pos] != 0;
2140            pos += 1;
2141            let data = if has_data {
2142                if pos + 4 > bytes.len() {
2143                    return None;
2144                }
2145                let data_len =
2146                    u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2147                        as usize;
2148                pos += 4;
2149                if pos + data_len > bytes.len() {
2150                    return None;
2151                }
2152                let d = bytes[pos..pos + data_len].to_vec();
2153                pos += data_len;
2154                Some(d)
2155            } else {
2156                None
2157            };
2158            if pos + 1 > bytes.len() {
2159                return None;
2160            }
2161            let known_deleted = bytes[pos] != 0;
2162            pos += 1;
2163            entries.push(BinEntry {
2164                data,
2165                known_deleted,
2166                dirty: false, // freshly loaded from log — clean
2167                expiration_time: 0,
2168            });
2169            keys.push(key); // T-2 (full keys; recompute_key_prefix compresses)
2170            lsns.push(lsn); // T-3
2171        }
2172        // Keys stored in the serialized format are full (uncompressed) keys.
2173        // Re-establish the key prefix after loading so that memory use and
2174        // search performance match an in-memory BIN.
2175        // `IN.readFromLog()` → key prefix is part of the wire
2176        // format in the; in Noxu we store full keys and recompute on load.
2177        let mut bin = BinStub {
2178            node_id,
2179            level: BIN_LEVEL,
2180            entries,
2181            key_prefix: Vec::new(),
2182            dirty: false,
2183            is_delta: false,
2184            last_full_lsn: NULL_LSN, // caller sets this to the logged LSN
2185            last_delta_lsn: NULL_LSN,
2186            generation: 0,
2187            parent: None,
2188            expiration_in_hours: true,
2189            cursor_count: 0,
2190            prohibit_next_delta: false,
2191            lsn_rep: LsnRep::from_lsns(&lsns), // T-3
2192            keys: KeyRep::from_keys(keys),     // T-2 (full keys, no prefix yet)
2193            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
2194        };
2195        // Recompute key prefix from the full keys just loaded.
2196        // `IN.recalcKeyPrefix()` called after materializing from log.
2197        if bin.entries.len() >= 2 {
2198            bin.recompute_key_prefix();
2199        } else {
2200            // Even a single-slot BIN should attempt compaction.
2201            bin.keys.compact(bin.compact_max_key_length);
2202        }
2203        Some(bin)
2204    }
2205
2206    /// Deserialise a BIN delta from the bytes produced by `serialize_delta()`.
2207    ///
2208    /// **DO NOT USE for BIN reconstruction.** This helper writes full
2209    /// (uncompressed) keys directly into slots without recomputing the BIN
2210    /// key prefix, so on a prefix-compressed BIN it corrupts the slot keys and
2211    /// breaks the sorted-suffix invariant. It is NOT wired into any live path.
2212    /// The correct delta-reconstruction path is
2213    /// `mutate_to_full_bin` → `apply_delta_to_bin` → `insert_with_prefix`,
2214    /// which recomputes the prefix. This function is retained only for the
2215    /// raw byte-format round-trip and must not be used to reconstitute a BIN.
2216    /// Tracked for removal — see the v3.x review synthesis (storage C-2).
2217    ///
2218    /// Returns `None` if `delta_bytes` is malformed.
2219    pub fn apply_delta(base: &mut BinStub, delta_bytes: &[u8]) -> Option<()> {
2220        if delta_bytes.len() < 12 {
2221            return None;
2222        }
2223        // node_id(u64BE) — must match base
2224        let _node_id = u64::from_be_bytes(delta_bytes[0..8].try_into().ok()?);
2225        let num_dirty =
2226            u32::from_be_bytes(delta_bytes[8..12].try_into().ok()?) as usize;
2227        let mut pos = 12usize;
2228        for _ in 0..num_dirty {
2229            // slot_idx(u32BE) | key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len | data] | known_deleted(u8)
2230            if pos + 4 > delta_bytes.len() {
2231                return None;
2232            }
2233            let slot_idx =
2234                u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2235                    as usize;
2236            pos += 4;
2237            if pos + 4 > delta_bytes.len() {
2238                return None;
2239            }
2240            let key_len =
2241                u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2242                    as usize;
2243            pos += 4;
2244            if pos + key_len > delta_bytes.len() {
2245                return None;
2246            }
2247            let key = delta_bytes[pos..pos + key_len].to_vec();
2248            pos += key_len;
2249            if pos + 8 > delta_bytes.len() {
2250                return None;
2251            }
2252            let lsn = Lsn::from_u64(u64::from_be_bytes(
2253                delta_bytes[pos..pos + 8].try_into().ok()?,
2254            ));
2255            pos += 8;
2256            if pos + 1 > delta_bytes.len() {
2257                return None;
2258            }
2259            let has_data = delta_bytes[pos] != 0;
2260            pos += 1;
2261            let data = if has_data {
2262                if pos + 4 > delta_bytes.len() {
2263                    return None;
2264                }
2265                let data_len = u32::from_be_bytes(
2266                    delta_bytes[pos..pos + 4].try_into().ok()?,
2267                ) as usize;
2268                pos += 4;
2269                if pos + data_len > delta_bytes.len() {
2270                    return None;
2271                }
2272                let d = delta_bytes[pos..pos + data_len].to_vec();
2273                pos += data_len;
2274                Some(d)
2275            } else {
2276                None
2277            };
2278            if pos + 1 > delta_bytes.len() {
2279                return None;
2280            }
2281            let known_deleted = delta_bytes[pos] != 0;
2282            pos += 1;
2283
2284            // Apply to base: update existing slot or insert new one.
2285            if slot_idx < base.entries.len() {
2286                base.keys.set(slot_idx, key); // T-2
2287                base.set_lsn(slot_idx, lsn); // T-3
2288                base.entries[slot_idx].data = data;
2289                base.entries[slot_idx].known_deleted = known_deleted;
2290                base.entries[slot_idx].dirty = false;
2291            } else {
2292                // Slot index beyond current length — append.
2293                base.entries.push(BinEntry {
2294                    data,
2295                    known_deleted,
2296                    dirty: false,
2297                    expiration_time: 0,
2298                });
2299                let n = base.entries.len();
2300                base.keys.insert(n - 1, key); // T-2
2301                base.lsn_rep.set(n - 1, lsn, n); // T-3
2302            }
2303        }
2304        Some(())
2305    }
2306
2307    /// Clear per-slot dirty flags and record `logged_at` as the LSN at which
2308    /// this BIN was last fully logged.
2309    ///
2310    /// Called by the checkpoint path after a successful full-BIN log write.
2311    /// `BIN.afterLog()` / `BIN.setLastFullLsn()`.
2312    pub fn clear_dirty_after_full_log(&mut self, logged_at: Lsn) {
2313        for e in &mut self.entries {
2314            e.dirty = false;
2315        }
2316        self.last_full_lsn = logged_at;
2317        self.dirty = false;
2318        // A full BIN captures all current state, so the delta-chain bound is
2319        // cleared: the next log may once again be a delta.
2320        // JE `IN.afterLog` clears the prohibit flag after a full log
2321        // (IN.java:5557 `bin.setProhibitNextDelta(false)`).
2322        self.prohibit_next_delta = false;
2323    }
2324
2325    /// Clear per-slot dirty flags after a successful delta log write.
2326    ///
2327    /// `last_full_lsn` is NOT updated — the full LSN only changes after a
2328    /// full BIN write.
2329    /// `BIN.afterLog()` (delta path).
2330    pub fn clear_dirty_after_delta_log(&mut self) {
2331        for e in &mut self.entries {
2332            e.dirty = false;
2333        }
2334        self.dirty = false;
2335    }
2336}
2337
2338impl TreeNode {
2339    /// Returns true if this is a BIN (bottom internal node).
2340    pub fn is_bin(&self) -> bool {
2341        matches!(self, TreeNode::Bottom(_))
2342    }
2343
2344    /// Returns the level of this node.
2345    pub fn level(&self) -> i32 {
2346        match self {
2347            TreeNode::Internal(n) => n.level,
2348            TreeNode::Bottom(b) => b.level,
2349        }
2350    }
2351
2352    /// Returns the node id of this node.
2353    pub fn node_id(&self) -> u64 {
2354        match self {
2355            TreeNode::Internal(n) => n.node_id,
2356            TreeNode::Bottom(b) => b.node_id,
2357        }
2358    }
2359
2360    /// Faithful in-memory heap footprint of this node, in bytes.
2361    ///
2362    /// JE `IN.getBudgetedMemorySize()` (IN.java) returns the running
2363    /// `inMemorySize` that `MemoryBudget` tracks for the node: the fixed
2364    /// IN/BIN struct overhead plus, per slot, the fixed entry overhead and the
2365    /// variable key (and embedded-LN data for BINs) bytes.  This is the single
2366    /// source of truth for both the live tree accounting and the evictor's
2367    /// detach credit (EV-13) — keeping it on `TreeNode` avoids the formula
2368    /// drifting between `noxu-tree` and `noxu-evictor`.
2369    ///
2370    /// Rust has a fixed struct layout (unlike JE's `Sizeof`-measured JVM
2371    /// constants) so `size_of` is exact for the fixed overheads; the variable
2372    /// part mirrors JE's per-slot `entryKeys`/embedded-data accounting.
2373    pub fn budgeted_memory_size(&self) -> u64 {
2374        use std::mem::size_of;
2375        match self {
2376            TreeNode::Bottom(b) => {
2377                (size_of::<BinStub>()
2378                    + b.entries.len() * size_of::<BinEntry>()
2379                    + b.key_prefix.len()
2380                    + b.keys.memory_size() // T-2: node-level key rep bytes
2381                    + b.lsn_rep.memory_size() // T-3: node-level LSN rep bytes
2382                    + b.entries
2383                        .iter()
2384                        .map(|e| {
2385                            e.data.as_ref().map(|d| d.len()).unwrap_or(0)
2386                        })
2387                        .sum::<usize>()) as u64
2388            }
2389            TreeNode::Internal(n) => {
2390                (size_of::<InNodeStub>()
2391                    + n.entries.len() * size_of::<InEntry>()
2392                    + n.targets.memory_size()
2393                    + n.entries.iter().map(|e| e.key.len()).sum::<usize>())
2394                    as u64
2395            }
2396        }
2397    }
2398
2399    /// Binary search for a key in this node.
2400    ///
2401    /// For BIN nodes the search is prefix-aware: if the BIN has a key prefix,
2402    /// `key` (a full, uncompressed key) is compared against stored suffixes
2403    /// after stripping the prefix.
2404    /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
2405    ///
2406    /// Returns index with EXACT_MATCH flag set if exact match found.
2407    /// If exact is false, returns insertion point.
2408    pub fn find_entry(&self, key: &[u8], _indicator: bool, exact: bool) -> i32 {
2409        match self {
2410            TreeNode::Internal(n) => {
2411                let result = n
2412                    .entries
2413                    .binary_search_by(|entry| entry.key.as_slice().cmp(key));
2414                match result {
2415                    Ok(idx) => (idx as i32) | EXACT_MATCH,
2416                    Err(idx) => {
2417                        if exact {
2418                            -1
2419                        } else {
2420                            // Floor (not insertion point): the child slot to
2421                            // descend into is the largest entry ≤ key. Slot 0
2422                            // is the leftmost child, so a key below every
2423                            // separator floors to 0. (St-H5: previously
2424                            // returned the insertion point `idx`, which routes
2425                            // one child too far right.)
2426                            (idx as i32 - 1).max(0)
2427                        }
2428                    }
2429                }
2430            }
2431            TreeNode::Bottom(b) => {
2432                // Use prefix-aware search: the stored key is a suffix when
2433                // key_prefix is non-empty.
2434                let (idx, found) = b.find_entry_compressed(key);
2435                if found {
2436                    (idx as i32) | EXACT_MATCH
2437                } else if exact {
2438                    -1
2439                } else {
2440                    idx as i32
2441                }
2442            }
2443        }
2444    }
2445
2446    /// Gets the number of entries in this node.
2447    pub fn get_n_entries(&self) -> usize {
2448        match self {
2449            TreeNode::Internal(n) => n.entries.len(),
2450            TreeNode::Bottom(b) => b.entries.len(),
2451        }
2452    }
2453
2454    // ========================================================================
2455    // Dirty flag
2456    // ========================================================================
2457
2458    /// Returns true if this node has been modified since last checkpoint.
2459    ///
2460    /// `IN.getDirty()`.
2461    pub fn is_dirty(&self) -> bool {
2462        match self {
2463            TreeNode::Internal(n) => n.dirty,
2464            TreeNode::Bottom(b) => b.dirty,
2465        }
2466    }
2467
2468    /// Sets or clears the dirty flag on this node.
2469    ///
2470    /// `IN.setDirty(boolean dirty)`.
2471    pub fn set_dirty(&mut self, dirty: bool) {
2472        match self {
2473            TreeNode::Internal(n) => n.dirty = dirty,
2474            TreeNode::Bottom(b) => b.dirty = dirty,
2475        }
2476    }
2477
2478    // ========================================================================
2479    // LRU generation
2480    // ========================================================================
2481
2482    /// Returns the LRU generation counter.
2483    ///
2484    /// `IN.getGeneration()`.
2485    pub fn get_generation(&self) -> u64 {
2486        match self {
2487            TreeNode::Internal(n) => n.generation,
2488            TreeNode::Bottom(b) => b.generation,
2489        }
2490    }
2491
2492    /// Sets the LRU generation counter.
2493    ///
2494    /// `IN.setGeneration(long gen)`.
2495    pub fn set_generation(&mut self, r#gen: u64) {
2496        match self {
2497            TreeNode::Internal(n) => n.generation = r#gen,
2498            TreeNode::Bottom(b) => b.generation = r#gen,
2499        }
2500    }
2501
2502    // ========================================================================
2503    // Parent pointer
2504    // ========================================================================
2505
2506    /// Returns a clone of the weak parent pointer, if any.
2507    pub fn get_parent(&self) -> Option<Weak<RwLock<TreeNode>>> {
2508        match self {
2509            TreeNode::Internal(n) => n.parent.clone(),
2510            TreeNode::Bottom(b) => b.parent.clone(),
2511        }
2512    }
2513
2514    /// Sets the weak parent pointer on this node.
2515    pub fn set_parent(&mut self, parent: Option<Weak<RwLock<TreeNode>>>) {
2516        match self {
2517            TreeNode::Internal(n) => n.parent = parent,
2518            TreeNode::Bottom(b) => b.parent = parent,
2519        }
2520    }
2521
2522    // ========================================================================
2523    // Log serialization
2524    // ========================================================================
2525
2526    /// Estimates the serialized byte size of this node for log/checkpoint use.
2527    ///
2528    /// `IN.getLogSize()` — Noxu-native serialization format.
2529    ///
2530    /// Format (big-endian):
2531    /// - node_id     : 8 bytes
2532    /// - level       : 4 bytes
2533    /// - n_entries   : 4 bytes
2534    /// - dirty       : 1 byte
2535    /// - For each entry:
2536    ///   - key_len   : 2 bytes
2537    ///   - key       : key_len bytes
2538    ///   - lsn       : 8 bytes
2539    pub fn log_size(&self) -> usize {
2540        // Fixed header: node_id(8) + level(4) + n_entries(4) + dirty(1)
2541        let mut size: usize = 8 + 4 + 4 + 1;
2542        match self {
2543            TreeNode::Internal(n) => {
2544                for entry in &n.entries {
2545                    size += 2 + entry.key.len() + 8; // key_len + key + lsn
2546                }
2547            }
2548            TreeNode::Bottom(b) => {
2549                for i in 0..b.entries.len() {
2550                    size += 2 + b.get_key(i).len() + 8; // key_len + key + lsn
2551                }
2552            }
2553        }
2554        size
2555    }
2556
2557    /// Serializes this node to bytes for log writing.
2558    ///
2559    /// `IN.writeToLog(ByteBuffer logBuffer)` — Noxu-native
2560    /// format matching `log_size()`.
2561    pub fn write_to_bytes(&self) -> Vec<u8> {
2562        let mut buf = Vec::with_capacity(self.log_size());
2563        match self {
2564            TreeNode::Internal(n) => {
2565                buf.extend_from_slice(&n.node_id.to_be_bytes());
2566                buf.extend_from_slice(&n.level.to_be_bytes());
2567                buf.extend_from_slice(&(n.entries.len() as u32).to_be_bytes());
2568                buf.push(n.dirty as u8);
2569                for (i, entry) in n.entries.iter().enumerate() {
2570                    buf.extend_from_slice(
2571                        &(entry.key.len() as u16).to_be_bytes(),
2572                    );
2573                    buf.extend_from_slice(&entry.key);
2574                    buf.extend_from_slice(&n.get_lsn(i).as_u64().to_be_bytes());
2575                }
2576            }
2577            TreeNode::Bottom(b) => {
2578                buf.extend_from_slice(&b.node_id.to_be_bytes());
2579                buf.extend_from_slice(&b.level.to_be_bytes());
2580                buf.extend_from_slice(&(b.entries.len() as u32).to_be_bytes());
2581                buf.push(b.dirty as u8);
2582                for i in 0..b.entries.len() {
2583                    let key = b.get_key(i);
2584                    buf.extend_from_slice(&(key.len() as u16).to_be_bytes());
2585                    buf.extend_from_slice(key);
2586                    buf.extend_from_slice(&b.get_lsn(i).as_u64().to_be_bytes());
2587                }
2588            }
2589        }
2590        buf
2591    }
2592}
2593
2594/// Internal helper used during splits to carry entries of either node kind.
2595///
2596/// `BinStub` and `InNodeStub` store different entry types, so we need a
2597/// common wrapper to pass split slices around without code duplication.
2598enum SplitEntries {
2599    /// Upper-IN entries plus the parallel resident-child pointers (one per
2600    /// entry; `None` when the child is not cached) and the parallel per-slot
2601    /// LSNs (T-3: LSNs travel with their slots on a split, just like JE
2602    /// `IN.split` copies `entryLsnByteArray`/`entryLsnLongArray`).
2603    Internal(Vec<InEntry>, Vec<Option<ChildArc>>, Vec<Lsn>),
2604    /// BIN entries (metadata only) plus the parallel per-slot LSNs and the
2605    /// parallel FULL keys (T-2: keys live in the node-level `KeyRep`, not in
2606    /// `BinEntry`, so they travel as a separate `Vec<Vec<u8>>` of full keys
2607    /// through the split — the new BINs recompute their prefix from these).
2608    Bottom(Vec<BinEntry>, Vec<Lsn>, Vec<Vec<u8>>),
2609}
2610
2611impl SplitEntries {
2612    /// Returns the number of entries.
2613    fn len(&self) -> usize {
2614        match self {
2615            SplitEntries::Internal(v, _, _) => v.len(),
2616            SplitEntries::Bottom(v, _, _) => v.len(),
2617        }
2618    }
2619
2620    /// Returns the key at `index` as a slice.
2621    fn get_key(&self, index: usize) -> &[u8] {
2622        match self {
2623            SplitEntries::Internal(v, _, _) => v[index].key.as_slice(),
2624            SplitEntries::Bottom(_, _, k) => k[index].as_slice(),
2625        }
2626    }
2627
2628    /// Returns a sub-range `[lo, hi)` as a new `SplitEntries`.
2629    fn slice(&self, lo: usize, hi: usize) -> Self {
2630        match self {
2631            SplitEntries::Internal(v, c, l) => SplitEntries::Internal(
2632                v[lo..hi].to_vec(),
2633                c[lo..hi].to_vec(),
2634                l[lo..hi].to_vec(),
2635            ),
2636            SplitEntries::Bottom(v, l, k) => SplitEntries::Bottom(
2637                v[lo..hi].to_vec(),
2638                l[lo..hi].to_vec(),
2639                k[lo..hi].to_vec(),
2640            ),
2641        }
2642    }
2643}
2644
2645/// Tri-state outcome from one attempt at
2646/// `Tree::get_adjacent_bin_attempt`.
2647///
2648/// Distinguishes "the tree genuinely has no BIN in the requested
2649/// direction" (→ propagate as end-of-iteration) from "the path we
2650/// captured was invalidated by a concurrent split" (→ caller
2651/// retries from root). This split is necessary because the cursor
2652/// translates a `None` from `get_adjacent_bin` into
2653/// `OperationStatus::NotFound`, which is indistinguishable from a
2654/// real end-of-tree.
2655#[derive(Debug)]
2656enum AdjacentBinOutcome {
2657    /// A BIN was found in the requested direction.  T-3: each slot carries its
2658    /// `Lsn` alongside the `BinEntry` (the LSN lives in the node's packed
2659    /// `LsnRep`, not in `BinEntry`, so the scan snapshot pairs them).
2660    Found(Vec<(BinEntry, Lsn, Vec<u8>)>),
2661    /// The tree genuinely has no BIN in the requested direction.
2662    NoAdjacent,
2663    /// A concurrent split invalidated our captured path; the
2664    /// caller should retry from root.
2665    SplitRaceRetry,
2666}
2667
/// Split hint for the `splitSpecial` heuristic: tells the split code whether
/// to split at the midpoint or at one edge of the node.
///
/// JE `Tree.forceSplit` tracks `allLeftSideDescent` / `allRightSideDescent`
/// (true if **every** routing decision during the top-down descent followed
/// the leftmost / rightmost child). At split time, when one of those flags
/// is set, `IN.splitSpecial` forces the split index to 1 (left side) or
/// `nEntries - 1` (right side) instead of `nEntries / 2`.
///
/// Effect: for sequential-append workloads the left BIN stays near-full
/// after every split (only one entry migrates to the new sibling), cutting
/// the split count roughly in half and reducing write amplification.
///
/// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SplitHint {
    /// Normal midpoint split (`n_entries / 2`).
    Normal,
    /// Key was at position 0 on every level of descent.
    /// → `split_index = 1` so left node keeps all but the first entry.
    AllLeft,
    /// Key was at the rightmost position on every level of descent.
    /// → `split_index = n_entries - 1` so left node keeps almost everything.
    AllRight,
}
2692
2693impl Tree {
2694    /// Creates a new empty tree.
2695    ///
2696    /// Constructor.
2697    pub fn new(database_id: u64, max_entries_per_node: usize) -> Self {
2698        Tree {
2699            database_id,
2700            max_entries_per_node,
2701            root: RwLock::new(None),
2702            root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2703            root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2704            root_splits: AtomicU64::new(0),
2705            relatches_required: AtomicU64::new(0),
2706            key_comparator: None,
2707            memory_counter: None,
2708            in_list_listener: None,
2709            log_manager: None,
2710            redo_capacity_hint: 0,
2711            key_prefixing: false, // JE default: KEY_PREFIXING_DEFAULT = false
2712            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2713        }
2714    }
2715
2716    /// Installs a shared memory counter for evictor / MemoryBudget feedback.
2717    ///
2718    /// → `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
2719    ///.  The counter is updated on every BIN entry insert/delete.
2720    pub fn set_memory_counter(&mut self, counter: Arc<AtomicI64>) {
2721        self.memory_counter = Some(counter);
2722    }
2723
2724    /// Installs the [`InListListener`] (the evictor) so node add/access/remove
2725    /// feed the LRU lists.  JE: `INList` registration that feeds
2726    /// `Evictor.addBack`/`moveBack`/`remove`.
2727    pub fn set_in_list_listener(&mut self, listener: Arc<dyn InListListener>) {
2728        self.in_list_listener = Some(listener);
2729    }
2730
2731    /// Installs the [`noxu_log::LogManager`] so an evicted root IN can be
2732    /// re-materialized from its persisted LSN on the next access (EV-14).
2733    ///
2734    /// JE: the tree reaches the log through `database.getEnv().getLogManager()`
2735    /// for `ChildReference.fetchTarget`.  Noxu installs it directly.
2736    pub fn set_log_manager(&mut self, lm: Arc<noxu_log::LogManager>) {
2737        self.log_manager = Some(lm);
2738    }
2739
2740    /// Drops this tree's `Arc<LogManager>` reference (EV-14 teardown).
2741    ///
2742    /// The env's `Drop` calls this on every tree it owns so the
2743    /// `Tree -> Arc<LogManager> -> Arc<FileManager>` chain cannot keep the
2744    /// FileManager (and its on-disk exclusive lock) alive past environment
2745    /// close.  After this the tree can no longer re-fetch an evicted root
2746    /// from the log — which is correct, because the environment is shutting
2747    /// down and the tree is about to be dropped.
2748    pub fn clear_log_manager(&mut self) {
2749        self.log_manager = None;
2750    }
2751
    /// T-5: set the compact-key threshold (`TREE_COMPACT_MAX_KEY_LENGTH` /
    /// `IN.getCompactMaxKeyLength`).  New BINs created by this tree inherit it;
    /// `<= 0` disables the compact key rep.  Default 16.
    ///
    /// Only the tree-level setting is stored here; this call does not touch
    /// any BIN directly.
    pub fn set_compact_max_key_length(&mut self, len: i32) {
        self.compact_max_key_length = len;
    }
2758
2759    /// Notify the listener that a node became resident (JE `Evictor.addBack`).
2760    #[inline]
2761    fn note_added(&self, node_id: u64) {
2762        if let Some(l) = &self.in_list_listener {
2763            l.note_ins_added(node_id);
2764        }
2765    }
2766
2767    /// Notify the listener that a resident node was accessed
2768    /// (JE `Evictor.moveBack` — LRU touch).
2769    #[inline]
2770    fn note_accessed(&self, node_id: u64) {
2771        if let Some(l) = &self.in_list_listener {
2772            l.note_ins_accessed(node_id);
2773        }
2774    }
2775
2776    /// Notify the listener that a node was removed (JE `Evictor.remove`).
2777    #[inline]
2778    fn note_removed(&self, node_id: u64) {
2779        if let Some(l) = &self.in_list_listener {
2780            l.note_ins_removed(node_id);
2781        }
2782    }
2783
2784    /// Creates a new empty tree with a custom key comparator.
2785    ///
2786    /// Used for sorted-duplicate databases where keys are two-part
2787    /// composite keys that require a custom ordering function.
2788    ///
2789    /// Constructor with `btreeComparator` parameter.
2790    pub fn new_with_comparator(
2791        database_id: u64,
2792        max_entries_per_node: usize,
2793        comparator: KeyComparatorFn,
2794    ) -> Self {
2795        Tree {
2796            database_id,
2797            max_entries_per_node,
2798            root: RwLock::new(None),
2799            root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2800            root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2801            root_splits: AtomicU64::new(0),
2802            relatches_required: AtomicU64::new(0),
2803            key_comparator: Some(comparator),
2804            memory_counter: None,
2805            in_list_listener: None,
2806            log_manager: None,
2807            redo_capacity_hint: 0,
2808            key_prefixing: false,
2809            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2810        }
2811    }
2812
    /// Sets the tree-level key-prefixing flag.
    ///
    /// When `true`, BIN key-prefix compression is enabled: shared leading
    /// bytes are factored out of each slot's key.  When `false` (the
    /// default), keys are stored verbatim — matching JE
    /// `DatabaseConfig.setKeyPrefixing(false)` / `IN.computeKeyPrefix`
    /// returning `null`.
    ///
    /// This call only records the flag; it does not itself recompute the
    /// prefix of any BIN.
    ///
    /// Ref: `IN.java computeKeyPrefix` ~line 2456.
    pub fn set_key_prefixing(&mut self, enabled: bool) {
        self.key_prefixing = enabled;
    }
2825
2826    /// Sets the key comparator, replacing any existing one.
2827    pub fn set_comparator(&mut self, comparator: KeyComparatorFn) {
2828        self.key_comparator = Some(comparator);
2829    }
2830
    /// Store a capacity hint used by `redo_insert` when it creates the first
    /// BIN for this tree (the first-key path).
    ///
    /// The first BIN's `entries` Vec is pre-allocated with
    /// `capacity.min(max_entries_per_node)` slots, eliminating the
    /// Vec-resize doubling cycle (1 → 2 → 4 → … → cap) that would
    /// otherwise occur during the redo loop.
    ///
    /// Call once before the redo loop.  Has no effect on `insert` (the
    /// normal, non-recovery path).  A value of 0 means "no hint" (see
    /// `get_redo_capacity_hint`).
    ///
    /// Wave 11-K optimisation (Fix 3).
    pub fn hint_redo_capacity(&mut self, capacity: usize) {
        // Only recorded here; the pre-allocation happens where it is read.
        self.redo_capacity_hint = capacity;
    }
2846
    /// Returns the current redo capacity hint as last stored by
    /// `hint_redo_capacity` (0 = no hint set).
    pub fn get_redo_capacity_hint(&self) -> usize {
        self.redo_capacity_hint
    }
2851
2852    /// Takes the key comparator out of this tree (leaving None).
2853    pub fn take_comparator(&mut self) -> Option<KeyComparatorFn> {
2854        self.key_comparator.take()
2855    }
2856
2857    /// Returns a reference to the key comparator, if configured.
2858    ///
2859    /// Used by `CursorImpl::find_bin_for_key` (R4 fix) so the cursor's own
2860    /// IN-level descent uses the same comparator-aware floor slot as the
2861    /// tree's own search paths. Mirrors JE `DatabaseImpl.getKeyComparator()`.
2862    pub fn get_comparator(&self) -> Option<&KeyComparatorFn> {
2863        self.key_comparator.as_ref()
2864    }
2865
2866    /// Returns the key comparator if set, or performs lexicographic comparison.
2867    #[inline]
2868    fn key_cmp(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering {
2869        match &self.key_comparator {
2870            Some(cmp) => cmp(a, b),
2871            None => a.cmp(b),
2872        }
2873    }
2874
2875    /// Floor child slot index for descending an internal node: the largest
2876    /// slot whose key is ≤ `key`. Slot 0 carries a virtual −∞ key (always
2877    /// qualifies); `entries[1..]` are sorted ascending, so this binary-searches
2878    /// the partition point instead of an O(n) linear walk (St-H4). Uses
2879    /// `key_cmp` so a configured custom comparator is honoured on every descent
2880    /// path. Returns 0 for an empty/single-slot node.
2881    fn upper_in_floor_index(&self, entries: &[InEntry], key: &[u8]) -> usize {
2882        if entries.len() <= 1 {
2883            return 0;
2884        }
2885        entries[1..].partition_point(|e| {
2886            self.key_cmp(e.key.as_slice(), key) != std::cmp::Ordering::Greater
2887        })
2888    }
2889
2890    /// Returns true if the tree has no root (is empty).
2891    pub fn is_empty(&self) -> bool {
2892        self.root.read().is_none()
2893    }
2894
2895    /// Sets the root of the tree.
2896    ///
2897    /// Must hold root_latch exclusively before calling.
2898    pub fn set_root(&self, node: TreeNode) {
2899        *self.root.write() = Some(Arc::new(RwLock::new(node)));
2900    }
2901
2902    /// Returns the root Arc, if any.
2903    ///
2904    /// Returns a cloned `Arc` rather than a reference so the caller does not
2905    /// hold the inner `RwLock` guard.
2906    ///
2907    /// EV-14: when the in-memory root has been evicted (`evict_root`) but a
2908    /// persisted version exists (`root_log_lsn` set), this re-materializes it
2909    /// from the log before returning — the faithful equivalent of JE
2910    /// `Tree.getRootIN` always calling `root.fetchTarget(...)`.  Returns
2911    /// `None` only for a genuinely empty tree (no resident root and no
2912    /// persisted root LSN).
2913    pub fn get_root(&self) -> Option<Arc<RwLock<TreeNode>>> {
2914        if let Some(r) = self.root.read().clone() {
2915            return Some(r);
2916        }
2917        // Root not resident: re-fetch it from `root_log_lsn` if one exists
2918        // (a no-op returning None when the tree was never populated).
2919        self.fetch_root_from_log()
2920    }
2921
    /// Returns the database ID this tree was constructed with.
    pub fn get_database_id(&self) -> u64 {
        self.database_id
    }
2926
2927    /// Count the total number of live (non-deleted) entries across all BINs.
2928    ///
2929    /// Used by `DatabaseImpl::set_recovered_tree()` to initialise the
2930    /// per-database `entry_count` AtomicU64 after recovery replays the log.
2931    pub fn count_entries(&self) -> u64 {
2932        let mut total = 0u64;
2933        if let Some(root) = self.get_root() {
2934            Self::count_entries_recursive(&root, &mut total);
2935        }
2936        total
2937    }
2938
2939    /// DBI-14: collect every live `(full_key, data, lsn)` triple in physical
2940    /// (left-to-right) order.  Used by `resort_under_comparator` to rebuild a
2941    /// tree whose slots were laid out in byte order (e.g. by recovery redo,
2942    /// which has no access to the application comparator) under the real
2943    /// configured comparator.
2944    fn collect_all_entries(&self) -> Vec<(Vec<u8>, Vec<u8>, Lsn)> {
2945        let mut out = Vec::new();
2946        if let Some(root) = self.get_root() {
2947            Self::collect_all_entries_recursive(&root, &mut out);
2948        }
2949        out
2950    }
2951
2952    fn collect_all_entries_recursive(
2953        node_arc: &Arc<RwLock<TreeNode>>,
2954        out: &mut Vec<(Vec<u8>, Vec<u8>, Lsn)>,
2955    ) {
2956        let guard = node_arc.read();
2957        match &*guard {
2958            TreeNode::Bottom(b) => {
2959                for i in 0..b.entries.len() {
2960                    if b.entries[i].known_deleted {
2961                        continue;
2962                    }
2963                    if let Some(fk) = b.get_full_key(i) {
2964                        let data =
2965                            b.entries[i].data.clone().unwrap_or_default();
2966                        out.push((fk, data, b.get_lsn(i)));
2967                    }
2968                }
2969            }
2970            TreeNode::Internal(n) => {
2971                let children: Vec<Arc<RwLock<TreeNode>>> =
2972                    n.resident_children();
2973                drop(guard);
2974                for child in &children {
2975                    Self::collect_all_entries_recursive(child, out);
2976                }
2977            }
2978        }
2979    }
2980
2981    /// DBI-14: rebuild this tree so that its on-disk byte-ordered slot layout
2982    /// is re-sorted under the currently-configured key comparator.
2983    ///
2984    /// Recovery redo (`redo_insert`) has no access to the application's
2985    /// comparator function — only the persisted identity — so it lays keys
2986    /// out in unsigned-byte order.  After `set_recovered_tree` attaches the
2987    /// real comparator, the slots must be re-sorted, or comparator-driven
2988    /// searches would binary-search a tree ordered by the wrong relation.
2989    ///
2990    /// No-op when no comparator is configured (byte order already matches the
2991    /// recovered layout) or when the tree is empty.  Mirrors the effect of
2992    /// JE reconstructing the comparator at open and the tree always having
2993    /// been built under it.
2994    pub fn resort_under_comparator(&self) {
2995        if self.key_comparator.is_none() {
2996            return;
2997        }
2998        let entries = self.collect_all_entries();
2999        if entries.is_empty() {
3000            return;
3001        }
3002        // Drop the current root; re-insert every entry through the normal
3003        // comparator-aware insert path so the new layout obeys the comparator.
3004        *self.root.write() = None;
3005        *self.root_log_lsn.write() = noxu_util::NULL_LSN;
3006        for (key, data, lsn) in entries {
3007            // Best-effort: a failed re-insert would be a tree-structure bug;
3008            // surface it loudly in debug builds.
3009            let r = self.insert(key, data, lsn);
3010            debug_assert!(
3011                r.is_ok(),
3012                "resort_under_comparator: re-insert failed: {r:?}"
3013            );
3014        }
3015    }
3016
3017    fn count_entries_recursive(
3018        node_arc: &Arc<RwLock<TreeNode>>,
3019        total: &mut u64,
3020    ) {
3021        let guard = node_arc.read();
3022        match &*guard {
3023            TreeNode::Bottom(b) => {
3024                // Count only live (non-known_deleted) entries.
3025                *total += b.entries.iter().filter(|e| !e.known_deleted).count()
3026                    as u64;
3027            }
3028            TreeNode::Internal(n) => {
3029                let children: Vec<Arc<RwLock<TreeNode>>> =
3030                    n.resident_children();
3031                drop(guard);
3032                for child in children {
3033                    Self::count_entries_recursive(&child, total);
3034                }
3035            }
3036        }
3037    }
3038
3039    /// Sum the real in-memory heap footprint of every resident node in the
3040    /// tree (DBI-23 oracle / reconciliation), in bytes.
3041    ///
3042    /// Walks all resident IN/BIN nodes and adds each node's
3043    /// `budgeted_memory_size` (JE `IN.getBudgetedMemorySize`).  This is the
3044    /// authoritative "real heap" figure the incrementally-maintained
3045    /// `memory_counter` is meant to approximate; an engine can call it to
3046    /// reconcile counter drift, and the DBI-23 test uses it as the oracle the
3047    /// live counter must stay within tolerance of.
3048    pub fn total_budgeted_memory(&self) -> u64 {
3049        let mut total = 0u64;
3050        if let Some(root) = self.get_root() {
3051            Self::total_budgeted_memory_recursive(&root, &mut total);
3052        }
3053        total
3054    }
3055
3056    fn total_budgeted_memory_recursive(
3057        node_arc: &Arc<RwLock<TreeNode>>,
3058        total: &mut u64,
3059    ) {
3060        let guard = node_arc.read();
3061        *total += guard.budgeted_memory_size();
3062        if let TreeNode::Internal(n) = &*guard {
3063            let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
3064            drop(guard);
3065            for child in children {
3066                Self::total_budgeted_memory_recursive(&child, total);
3067            }
3068        }
3069    }
3070
3071    /// Search for a BIN that should contain the given key.
3072    ///
3073    /// This is the core tree traversal operation. It walks from root to BIN
3074    /// using latch-coupling (acquire child latch, then release parent latch).
3075    ///
3076    /// . Descends the tree until a BIN is
3077    /// reached, following the child pointer at the slot whose key is the
3078    /// largest key <= the search key (the "LTE" rule).  Slot 0 in every upper
3079    /// IN carries a virtual key (-infinity) so any search key routes through
3080    /// it when all real keys are larger.
3081    ///
3082    /// Returns a SearchResult indicating where the key is or should be.
3083    /// Returns None if tree is empty.
3084    pub fn search(&self, key: &[u8]) -> Option<SearchResult> {
3085        let root = self.get_root()?;
3086
3087        // Hand-over-hand latch coupling for the descent. At each level we
3088        // hold a `parking_lot::ArcRwLockReadGuard` on the current node;
3089        // before dropping it, we acquire the child's read guard via
3090        // `Arc::read_arc`. This keeps a continuous chain of read locks
3091        // along the descent path so that no concurrent `split_child(parent,
3092        // …)` can run on a node we are about to enter — `split_child` takes
3093        // `parent.write()` to install the new sibling, and that write
3094        // blocks while we hold `parent.read()`. Without this, the prior
3095        // pattern (capture child Arc, drop parent guard, then take child
3096        // read lock) left a window in which a split could relocate the
3097        // child entries: a search for a key that should have ended up in
3098        // the new sibling would instead reach the (now left-half) child
3099        // and return a false `NotFound`.
3100        //
3101        // `read_arc()` returns `ArcRwLockReadGuard<RawRwLock, TreeNode>`
3102        // — a guard that owns its own Arc reference, so it has no
3103        // borrow lifetime and can be held across loop iterations and
3104        // assignment.
3105        let mut guard: parking_lot::ArcRwLockReadGuard<
3106            parking_lot::RawRwLock,
3107            TreeNode,
3108        > = root.read_arc();
3109
3110        loop {
3111            if guard.is_bin() {
3112                // JE: IN.fetchTarget / CursorImpl access moves the reached
3113                // BIN toward the hot end of the evictor's LRU list
3114                // (Evictor.moveBack).  A freshly split BIN that has not yet
3115                // been registered is added here (moveBack is add-if-absent).
3116                if let TreeNode::Bottom(bin) = &*guard {
3117                    self.note_accessed(bin.node_id);
3118                }
3119                // Reached a BIN: final key lookup within the same guard.
3120                // Use indicate_if_duplicate=true so an exact match sets
3121                // EXACT_MATCH in the return value.  Guard against -1 (not
3122                // found): -1i32 has all bits set, so the naive
3123                // `index & EXACT_MATCH != 0` check would incorrectly report
3124                // an exact match for a missing key.
3125                let (found, raw_idx) = match &*guard {
3126                    TreeNode::Bottom(bin) => match &self.key_comparator {
3127                        Some(cmp) => {
3128                            let (idx, exact) =
3129                                bin.find_entry_cmp(key, cmp.as_ref());
3130                            (exact, idx as i32)
3131                        }
3132                        None => {
3133                            let index = guard.find_entry(key, true, true);
3134                            let exact =
3135                                index >= 0 && (index & EXACT_MATCH != 0);
3136                            (exact, index & 0xFFFF)
3137                        }
3138                    },
3139                    _ => {
3140                        let index = guard.find_entry(key, true, true);
3141                        let exact = index >= 0 && (index & EXACT_MATCH != 0);
3142                        (exact, index & 0xFFFF)
3143                    }
3144                };
3145                // CursorImpl.isProbablyExpired(): if an exact match
3146                // was found, check whether the entry's TTL has already elapsed.
3147                // If it has, treat the slot as not found so callers skip it.
3148                //
3149                // TREE-F1: also treat a known_deleted slot as ABSENT on an
3150                // exact lookup, mirroring the tail of IN.findEntry
3151                // (IN.java:3197): `if (ret >= 0 && exact &&
3152                // isEntryKnownDeleted(ret & 0xffff)) return -1;`.  KD slots
3153                // legitimately exist in live BINs during BIN-delta
3154                // reconstitution until the compressor reclaims them.
3155                let found = if found {
3156                    if let TreeNode::Bottom(bin) = &*guard {
3157                        let idx = (raw_idx & 0x7FFF) as usize;
3158                        bin.slot_is_live(idx)
3159                    } else {
3160                        found
3161                    }
3162                } else {
3163                    found
3164                };
3165                return Some(SearchResult::with_values(found, raw_idx, false));
3166            }
3167
3168            // Upper IN: find the child slot with the largest key <= search
3169            // key, and capture the child Arc WHILE HOLDING the guard.
3170            // Slot 0 has a virtual key that compares as -infinity.
3171            let parent_arc =
3172                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3173            let next_arc = match &*guard {
3174                TreeNode::Internal(n) => {
3175                    if n.entries.is_empty() {
3176                        return None;
3177                    }
3178                    // Walk forward as long as entry.key <= key, starting
3179                    // from slot 0 (which always qualifies because its key
3180                    // is the virtual -infinity key).
3181                    let idx = self.upper_in_floor_index(&n.entries, key);
3182                    match n.get_child(idx) {
3183                        // Resident child: keep the hand-over-hand fast path.
3184                        Some(c) => {
3185                            let next_guard = c.read_arc();
3186                            drop(guard);
3187                            guard = next_guard;
3188                            continue;
3189                        }
3190                        // EV-14/EV-13: child evicted — re-fetch it from its
3191                        // slot LSN (JE ChildReference.fetchTarget).  Must
3192                        // drop the parent read guard to upgrade to a write
3193                        // latch inside child_at_or_fetch.
3194                        None => idx,
3195                    }
3196                }
3197                TreeNode::Bottom(_) => {
3198                    unreachable!("is_bin() returned false above")
3199                }
3200            };
3201            drop(guard);
3202            let child = self.child_at_or_fetch(&parent_arc, next_arc)?;
3203            guard = child.read_arc();
3204        }
3205    }
3206
3207    /// Combined search-and-fetch: descend once to the BIN and return the
3208    /// slot's data together with a reference to the BIN arc.
3209    ///
3210    /// Replaces the previous three-descent sequence on the `Database::get`
3211    /// hot path:
3212    ///   1. `Tree::search` — existence check only.
3213    ///   2. `CursorImpl::get_data_from_tree` — re-descended to fetch data.
3214    ///   3. `CursorImpl::find_bin_for_key` — re-descended for BIN pinning.
3215    ///
3216    /// One descent now does all three jobs.  At the BIN level it uses the
3217    /// existing binary-search helper `find_entry_compressed` instead of the
3218    /// O(n) `iter().find()` used by `get_data_from_tree`.
3219    ///
3220    /// Returns `None` only when the tree is empty.  Otherwise returns
3221    /// `Some(SlotFetch)` — callers must inspect `SlotFetch::found` to
3222    /// determine whether the key was present.  The BIN read-guard is released
3223    /// before this method returns so callers may safely call `lock_ln`
3224    /// (which may block) without holding any tree latch.
3225    ///
3226    /// Wave-11-I — see the 2026 review.
3227    pub fn search_with_data(&self, key: &[u8]) -> Option<SlotFetch> {
3228        let root = self.get_root()?;
3229        let mut guard: parking_lot::ArcRwLockReadGuard<
3230            parking_lot::RawRwLock,
3231            TreeNode,
3232        > = root.read_arc();
3233
3234        loop {
3235            if guard.is_bin() {
3236                // Capture the BIN Arc before inspecting entries.
3237                let bin_arc =
3238                    parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3239
3240                let (found, data, lsn, slot_index) = match &*guard {
3241                    TreeNode::Bottom(bin) => {
3242                        let (idx, exact) = match &self.key_comparator {
3243                            Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3244                            None => bin.find_entry_compressed(key),
3245                        };
3246                        if exact {
3247                            // TREE-F1: a slot is reported as found only when
3248                            // live (not known_deleted, not TTL-expired) — the
3249                            // same predicate used by Tree::search and the
3250                            // cursor scan.  Mirrors IN.findEntry (IN.java:3197)
3251                            // and CursorImpl.isProbablyExpired.
3252                            if bin.slot_is_live(idx) {
3253                                let lsn = bin.get_lsn(idx); // T-3
3254                                let e = &bin.entries[idx];
3255                                (true, e.data.clone(), lsn.as_u64(), idx)
3256                            } else {
3257                                (false, None, 0u64, 0)
3258                            }
3259                        } else {
3260                            (false, None, 0u64, 0)
3261                        }
3262                    }
3263                    _ => (false, None, 0u64, 0),
3264                };
3265                // Release the BIN read guard before returning so the caller
3266                // can call lock_ln (which may block) without holding a latch.
3267                drop(guard);
3268                return Some(SlotFetch {
3269                    found,
3270                    data,
3271                    lsn,
3272                    slot_index,
3273                    bin_arc,
3274                });
3275            }
3276
3277            // Upper IN: same hand-over-hand descent as `Tree::search`.
3278            let parent_arc =
3279                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3280            let next_idx = match &*guard {
3281                TreeNode::Internal(n) => {
3282                    if n.entries.is_empty() {
3283                        return None;
3284                    }
3285                    // Slot 0 = virtual −∞; walk forward while entry.key ≤ key.
3286                    let idx = self.upper_in_floor_index(&n.entries, key);
3287                    match n.get_child(idx) {
3288                        Some(c) => {
3289                            let next_guard = c.read_arc();
3290                            drop(guard);
3291                            guard = next_guard;
3292                            continue;
3293                        }
3294                        // EV-14/EV-13: re-fetch an evicted child from its LSN.
3295                        None => idx,
3296                    }
3297                }
3298                TreeNode::Bottom(_) => {
3299                    unreachable!("is_bin() returned false above")
3300                }
3301            };
3302            drop(guard);
3303            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3304            guard = child.read_arc();
3305        }
3306    }
3307
3308    /// Sets the expiration time (in absolute hours since Unix epoch) for an
3309    /// existing key's BIN slot.
3310    ///
3311    /// Returns `true` if the key was found and updated, `false` otherwise.
3312    ///
3313    /// Used by `Database::put_with_options()` to apply per-record TTL.
3314    /// `IN.entryExpiration` / `BIN.expirationInHours` path.
3315    pub fn update_key_expiration(
3316        &self,
3317        key: &[u8],
3318        expiration_hours: u32,
3319    ) -> bool {
3320        let root = match self.get_root() {
3321            Some(r) => r,
3322            None => return false,
3323        };
3324        // Hand-over-hand latch coupling for the descent. At the BIN we
3325        // need a write lock; we drop our read lock first and take the
3326        // write lock under the protection of the *outer* parent's read
3327        // lock (held by the previous loop iteration's guard). For the
3328        // first iteration there is no outer parent, but no `split_child`
3329        // can run on the root itself in that single-level case because
3330        // root splits go through `split_root_if_needed` which holds
3331        // `self.root.write()`. So the worst case is that the root is
3332        // promoted from a single BIN to a level-2 IN between our read
3333        // detect and our write — handled by the `is_bin` re-check
3334        // inside the write lock.
3335        //
3336        // We retry the descent up to a small bound to absorb the rare
3337        // case where a concurrent split moved this key into the new
3338        // sibling between the read-chain release and the write-lock
3339        // acquisition. Without the retry, the sole caller
3340        // (`Database::put_with_options`) would silently lose the TTL
3341        // for the affected key. Three attempts is generous: each
3342        // retry only races a single split and splits are infrequent.
3343        for _ in 0..3 {
3344            let mut guard: parking_lot::ArcRwLockReadGuard<
3345                parking_lot::RawRwLock,
3346                TreeNode,
3347            > = root.read_arc();
3348            let bin_arc;
3349            loop {
3350                if guard.is_bin() {
3351                    bin_arc =
3352                        parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3353                    drop(guard);
3354                    break;
3355                }
3356                let next_arc = match &*guard {
3357                    TreeNode::Internal(n) => {
3358                        if n.entries.is_empty() {
3359                            return false;
3360                        }
3361                        let idx = self.upper_in_floor_index(&n.entries, key);
3362                        match n.get_child(idx) {
3363                            Some(c) => c,
3364                            None => return false,
3365                        }
3366                    }
3367                    TreeNode::Bottom(_) => unreachable!(),
3368                };
3369                let next_guard = next_arc.read_arc();
3370                drop(guard);
3371                guard = next_guard;
3372            }
3373
3374            // Now take the write lock on the BIN we descended to.
3375            let mut wguard = bin_arc.write();
3376            if let TreeNode::Bottom(bin) = &mut *wguard {
3377                let slot = if let Some(cmp) = &self.key_comparator {
3378                    let (idx, exact) = bin.find_entry_cmp(key, cmp.as_ref());
3379                    if exact { Some(idx) } else { None }
3380                } else {
3381                    let (idx, exact) = bin.find_entry_compressed(key);
3382                    if exact { Some(idx) } else { None }
3383                };
3384                if let Some(slot_idx) = slot
3385                    && let Some(entry) = bin.entries.get_mut(slot_idx)
3386                {
3387                    entry.expiration_time = expiration_hours;
3388                    bin.expiration_in_hours = true;
3389                    bin.dirty = true;
3390                    return true;
3391                }
3392            }
3393            // Key not in this BIN — either it was never present or a
3394            // concurrent split moved it. Retry the descent; at most a
3395            // few iterations are needed to follow the key into its new
3396            // BIN.
3397        }
3398        false
3399    }
3400
3401    /// Returns the key and data of the first BIN entry at or after `key`.
3402    ///
3403    /// Descends with the tree's key comparator (same path as `search()`), then
3404    /// within the BIN finds the first slot whose stored key >= `key` using the
3405    /// comparator.  Returns `None` if every entry in the tree is < `key`.
3406    ///
3407    /// Used by sorted-duplicate cursor `search(Set)` to position at the first
3408    /// (key, data) pair whose two-part key >= `lower_bound(primary_key)`.
3409    ///
3410    /// → BIN scan path.
3411    pub fn first_entry_at_or_after(
3412        &self,
3413        key: &[u8],
3414    ) -> Option<(Vec<u8>, Vec<u8>, u64)> {
3415        // Hand-over-hand latch coupling — see Tree::search for the
3416        // detailed rationale on why this closes a reader-vs-splitter
3417        // race window.
3418        let mut guard: parking_lot::ArcRwLockReadGuard<
3419            parking_lot::RawRwLock,
3420            TreeNode,
3421        > = self.get_root()?.read_arc();
3422
3423        loop {
3424            if guard.is_bin() {
3425                let result = match &*guard {
3426                    TreeNode::Bottom(bin) => {
3427                        let (mut idx, _exact) = match &self.key_comparator {
3428                            Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3429                            None => bin.find_entry_compressed(key),
3430                        };
3431                        // TREE-F1: skip non-live slots (known_deleted /
3432                        // TTL-expired) at/after the floor index, mirroring the
3433                        // cursor getNext skip (CursorImpl.java:2062-2064).
3434                        while idx < bin.entries.len() && !bin.slot_is_live(idx)
3435                        {
3436                            idx += 1;
3437                        }
3438                        if idx < bin.entries.len() {
3439                            let full_key =
3440                                bin.get_full_key(idx).unwrap_or_default();
3441                            let data = bin.entries[idx]
3442                                .data
3443                                .clone()
3444                                .unwrap_or_default();
3445                            let lsn = bin.get_lsn(idx).as_u64(); // T-3
3446                            Some((full_key, data, lsn))
3447                        } else {
3448                            None
3449                        }
3450                    }
3451                    _ => None,
3452                };
3453                return result;
3454            }
3455
3456            // Upper IN: same descent as search().
3457            let parent_arc =
3458                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3459            let next_idx = match &*guard {
3460                TreeNode::Internal(n) => {
3461                    if n.entries.is_empty() {
3462                        return None;
3463                    }
3464                    let idx = self.upper_in_floor_index(&n.entries, key);
3465                    match n.get_child(idx) {
3466                        Some(c) => {
3467                            let next_guard = c.read_arc();
3468                            drop(guard);
3469                            guard = next_guard;
3470                            continue;
3471                        }
3472                        None => idx, // EV-14/EV-13: re-fetch below.
3473                    }
3474                }
3475                TreeNode::Bottom(_) => unreachable!(),
3476            };
3477            drop(guard);
3478            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3479            guard = child.read_arc();
3480        }
3481    }
3482
3483    /// Like [`Tree::first_entry_at_or_after`] but also returns the BIN node
3484    /// (so callers may pin it) and the entry's slot index inside that
3485    /// BIN.
3486    ///
3487    /// Wave 11-N (Bug 2): `CursorImpl::search_dup` previously stored
3488    /// `current_index = 0` after a sorted-dup `Search`, which broke the
3489    /// fast-path of `retrieve_next` (and the slow path's
3490    /// `next_index = current_index + 1` arithmetic) for any primary
3491    /// that was not the first slot of its BIN.  This helper hands back
3492    /// the real index so the cursor can be positioned correctly.
3493    ///
3494    /// CC-2 fix: uses the same `read_arc()` hand-over-hand latch coupling
3495    /// as every other descent method (`search`, `first_entry_at_or_after`,
3496    /// `get_first_node`, `get_adjacent_bin_attempt`).  The original
3497    /// implementation did `arc.read().is_bin()` (lock acquired and released)
3498    /// then a SECOND `arc.read()` on the next line — a gap in which a
3499    /// concurrent split can promote the node (BIN→upper IN) or move the
3500    /// sought key to a new sibling, yielding a false "not found" for an
3501    /// existing key.  Mirrors JE `Tree.searchSubTree` / `Tree.search`
3502    /// which hold the latch across the `is_bin()` test and the subsequent
3503    /// entry lookup.
3504    pub fn first_entry_at_or_after_with_index(
3505        &self,
3506        key: &[u8],
3507    ) -> Option<(
3508        Vec<u8>,
3509        Vec<u8>,
3510        usize,
3511        u64,
3512        std::sync::Arc<crate::NodeRwLock<TreeNode>>,
3513    )> {
3514        // Hand-over-hand latch coupling — identical strategy to
3515        // first_entry_at_or_after; the guard is held continuously across
3516        // is_bin() and the subsequent entry lookup so no split can
3517        // restructure the path between the two observations.
3518        let mut guard: parking_lot::ArcRwLockReadGuard<
3519            parking_lot::RawRwLock,
3520            TreeNode,
3521        > = self.get_root()?.read_arc();
3522        loop {
3523            if guard.is_bin() {
3524                if let TreeNode::Bottom(bin) = &*guard {
3525                    let (idx, _exact) = match &self.key_comparator {
3526                        Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3527                        None => bin.find_entry_compressed(key),
3528                    };
3529                    // TREE-F1: skip non-live slots (known_deleted /
3530                    // TTL-expired) at/after the floor index
3531                    // (CursorImpl.java:2062-2064).
3532                    let mut idx = idx;
3533                    while idx < bin.entries.len() && !bin.slot_is_live(idx) {
3534                        idx += 1;
3535                    }
3536                    if idx < bin.entries.len() {
3537                        let full_key =
3538                            bin.get_full_key(idx).unwrap_or_default();
3539                        let data =
3540                            bin.entries[idx].data.clone().unwrap_or_default();
3541                        let lsn = bin.get_lsn(idx).as_u64(); // T-3
3542                        // Obtain the Arc for the BIN node the guard came from.
3543                        // `ArcRwLockReadGuard::rwlock()` returns the backing Arc.
3544                        let bin_arc =
3545                            parking_lot::ArcRwLockReadGuard::rwlock(&guard)
3546                                .clone();
3547                        return Some((full_key, data, idx, lsn, bin_arc));
3548                    } else {
3549                        return None;
3550                    }
3551                }
3552                return None;
3553            }
3554
3555            // Upper IN: descend as in first_entry_at_or_after / search.
3556            let parent_arc =
3557                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3558            let next_idx = match &*guard {
3559                TreeNode::Internal(n) => {
3560                    if n.entries.is_empty() {
3561                        return None;
3562                    }
3563                    let idx = self.upper_in_floor_index(&n.entries, key);
3564                    match n.get_child(idx) {
3565                        Some(c) => {
3566                            let next_guard = c.read_arc();
3567                            drop(guard);
3568                            guard = next_guard;
3569                            continue;
3570                        }
3571                        None => idx, // EV-14/EV-13: re-fetch below.
3572                    }
3573                }
3574                TreeNode::Bottom(_) => unreachable!(),
3575            };
3576            drop(guard);
3577            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3578            guard = child.read_arc();
3579        }
3580    }
3581
3582    /// Insert a key/data pair into the tree.
3583    ///
3584    /// . Handles the root-is-null case by
3585    /// creating a two-level tree (upper IN + BIN) per initialisation path,
3586    /// then delegates to `insert_recursive` which performs preemptive splitting
3587    /// as it descends.
3588    ///
3589    /// Returns Ok(true) if this was a new insert, Ok(false) if it was an update.
3590    pub fn insert(
3591        &self,
3592        key: Vec<u8>,
3593        data: Vec<u8>,
3594        lsn: Lsn,
3595    ) -> Result<bool, TreeError> {
3596        // Save sizes before potentially moving key/data — needed for memory tracking.
3597        let key_len = key.len();
3598        let data_len = data.len();
3599
3600        // First-key path. We MUST hold the write lock while testing
3601        // root.is_none() and replacing the root, otherwise N threads can all
3602        // observe an empty tree, each build a fresh single-entry root, and
3603        // the last writer's `*self.root.write() = Some(...)` silently
3604        // discards the others' inserts. (Reproducer:
3605        // xa_protocol_test::test_concurrent_independent_xids — 8 threads
3606        // each inserting their own key into an empty tree lost ~30% of
3607        // inserts before this lock change.)
3608        {
3609            let mut root_guard = self.root.write();
3610            if root_guard.is_none() {
3611                let bin_node_id = generate_node_id();
3612                let root_node_id = generate_node_id();
3613                let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3614                    node_id: bin_node_id,
3615                    level: BIN_LEVEL,
3616                    entries: vec![BinEntry {
3617                        data: Some(data),
3618                        known_deleted: false,
3619                        dirty: false,
3620                        expiration_time: 0,
3621                    }],
3622                    key_prefix: Vec::new(), // single entry — no common prefix yet
3623                    dirty: true,
3624                    is_delta: false,
3625                    last_full_lsn: NULL_LSN,
3626                    last_delta_lsn: NULL_LSN,
3627                    generation: 0,
3628                    parent: None, // set below after root_in is created
3629                    // St-H6: use true to match the engine-wide invariant that
3630                    // every BIN which may hold TTL entries uses hours granularity
3631                    // (JE BIN.java default; matches tree.rs:980 and read_from_log).
3632                    expiration_in_hours: true,
3633                    cursor_count: 0,
3634                    prohibit_next_delta: false,
3635                    lsn_rep: LsnRep::from_lsns(&[lsn]),
3636                    keys: KeyRep::from_keys(vec![key]), // T-2
3637                    compact_max_key_length: self.compact_max_key_length,
3638                })));
3639
3640                // Upper IN at level 2; slot 0 uses an empty key (virtual root key).
3641                let root_arc =
3642                    Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3643                        node_id: root_node_id,
3644                        level: MAIN_LEVEL | 2,
3645                        entries: vec![InEntry {
3646                            key: vec![], // virtual key for slot 0 in upper IN
3647                        }],
3648                        // T-4: the single resident child at slot 0.
3649                        targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3650                        dirty: true,
3651                        generation: 0,
3652                        parent: None,
3653                        lsn_rep: LsnRep::from_lsns(&[lsn]),
3654                    })));
3655
3656                // Wire the BIN's parent pointer back to the root IN.
3657                {
3658                    let mut g = bin.write();
3659                    g.set_parent(Some(Arc::downgrade(&root_arc)));
3660                }
3661
3662                *root_guard = Some(root_arc);
3663
3664                // JE: IN.fetchTarget / initial tree build registers the new
3665                // resident nodes with the evictor (Evictor.addBack).
3666                self.note_added(root_node_id);
3667                self.note_added(bin_node_id);
3668
3669                // Count the first entry.
3670                if let Some(counter) = &self.memory_counter {
3671                    let delta =
3672                        (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3673                    counter.fetch_add(delta, Ordering::Relaxed);
3674                }
3675                return Ok(true);
3676            }
3677            // Another thread initialized the root while we were waiting for
3678            // the write lock; fall through and insert into the existing tree.
3679        }
3680
3681        // Check whether the root itself needs to be split before descending.
3682        // Tree.searchSplitsAllowed(): if rootIN.needsSplitting()
3683        // call splitRoot first.
3684        self.split_root_if_needed(lsn)?;
3685
3686        // Recursively insert, splitting children proactively as we descend
3687        // (forceSplit / searchSplitsAllowed pattern).
3688        let root_arc = self.get_root().unwrap();
3689        let result = Self::insert_recursive(
3690            &root_arc,
3691            key,
3692            data,
3693            lsn,
3694            self.max_entries_per_node,
3695            self.key_comparator.as_ref(),
3696            self.key_prefixing,
3697            self.in_list_listener.as_ref(),
3698        )?;
3699
3700        // Update the memory counter for new inserts.
3701        // IN.updateMemorySize(delta) → MemoryBudget.updateTreeMemoryUsage(delta).
3702        // LN_OVERHEAD = 48 bytes (approximate fixed overhead per entry).
3703        if result && let Some(counter) = &self.memory_counter {
3704            let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3705            counter.fetch_add(delta, Ordering::Relaxed);
3706        }
3707
3708        Ok(result)
3709    }
3710
3711    /// Recovery-redo variant of [`Tree::insert`] that accepts `&[u8]` slices.
3712    ///
3713    /// Eliminates the two intermediate `Vec<u8>` allocations that the normal
3714    /// insert path requires at the `redo_ln` call site (one for the key, one
3715    /// for the data).  The compressed key suffix and the data bytes are each
3716    /// materialised into their `BinEntry` slots exactly once.
3717    ///
3718    /// Semantics are identical to `insert`:
3719    /// - Updates the existing slot when the key is already present.
3720    /// - Inserts a new sorted entry when the key is absent.
3721    /// - Triggers the same root-split and proactive-split logic.
3722    ///
3723    /// `data` should be the raw value bytes, or an empty slice for a
3724    /// deletion (which should not normally arrive here during redo, but is
3725    /// handled gracefully).
3726    ///
3727    /// Wave 11-K optimisation (Fix 1).
3728    pub fn redo_insert(
3729        &self,
3730        key: &[u8],
3731        data: &[u8],
3732        lsn: Lsn,
3733    ) -> Result<bool, TreeError> {
3734        let key_len = key.len();
3735        let data_len = data.len();
3736        let data_opt: Option<&[u8]> =
3737            if data.is_empty() { None } else { Some(data) };
3738
3739        // First-key path: initialise a two-level tree from scratch.
3740        {
3741            let mut root_guard = self.root.write();
3742            if root_guard.is_none() {
3743                // Pre-allocate the BIN's entries Vec using the redo capacity
3744                // hint (Fix 3).  Without the hint the first BIN starts at
3745                // capacity 1 and doubles on each insert; with the hint it
3746                // starts at min(hint, max_entries) entries, eliminating
3747                // ~log2(max_entries) Vec-resize doublings.
3748                let initial_cap = if self.redo_capacity_hint > 0 {
3749                    self.redo_capacity_hint.min(self.max_entries_per_node)
3750                } else {
3751                    1
3752                };
3753                let mut initial_entries = Vec::with_capacity(initial_cap);
3754                initial_entries.push(BinEntry {
3755                    data: data_opt.map(|d| d.to_vec()),
3756                    known_deleted: false,
3757                    dirty: false,
3758                    expiration_time: 0,
3759                });
3760                let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3761                    node_id: generate_node_id(),
3762                    level: BIN_LEVEL,
3763                    entries: initial_entries,
3764                    key_prefix: Vec::new(),
3765                    dirty: true,
3766                    is_delta: false,
3767                    last_full_lsn: NULL_LSN,
3768                    last_delta_lsn: NULL_LSN,
3769                    generation: 0,
3770                    parent: None,
3771                    // St-H6: use true to match the engine-wide hours-only
3772                    // invariant (JE BIN.java default; matches tree.rs:980).
3773                    expiration_in_hours: true,
3774                    cursor_count: 0,
3775                    prohibit_next_delta: false,
3776                    lsn_rep: LsnRep::from_lsns(&[lsn]),
3777                    keys: KeyRep::from_keys(vec![key.to_vec()]), // T-2
3778                    compact_max_key_length: self.compact_max_key_length,
3779                })));
3780
3781                let root_arc =
3782                    Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3783                        node_id: generate_node_id(),
3784                        level: MAIN_LEVEL | 2,
3785                        entries: vec![InEntry { key: vec![] }],
3786                        // T-4: the single resident child at slot 0.
3787                        targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3788                        dirty: true,
3789                        generation: 0,
3790                        parent: None,
3791                        lsn_rep: LsnRep::from_lsns(&[lsn]),
3792                    })));
3793
3794                {
3795                    let mut g = bin.write();
3796                    g.set_parent(Some(Arc::downgrade(&root_arc)));
3797                }
3798
3799                *root_guard = Some(root_arc);
3800
3801                if let Some(counter) = &self.memory_counter {
3802                    let delta =
3803                        (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3804                    counter.fetch_add(delta, Ordering::Relaxed);
3805                }
3806                return Ok(true);
3807            }
3808        }
3809
3810        self.split_root_if_needed(lsn)?;
3811
3812        let root_arc = self.get_root().unwrap();
3813        let result = Self::redo_insert_recursive(
3814            &root_arc,
3815            key,
3816            data_opt,
3817            lsn,
3818            self.max_entries_per_node,
3819            self.key_comparator.as_ref(),
3820            self.key_prefixing,
3821        )?;
3822
3823        if result && let Some(counter) = &self.memory_counter {
3824            let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3825            counter.fetch_add(delta, Ordering::Relaxed);
3826        }
3827
3828        Ok(result)
3829    }
3830
3831    /// Splits the root node if it is full (needsSplitting).
3832    ///
3833    ///
3834    /// ```text
3835    /// 1. Save oldRoot (the current root IN or BIN).
3836    /// 2. Create newRoot at oldRoot.level + 1.
3837    /// 3. Insert oldRoot into newRoot at slot 0 with a virtual (empty) key.
3838    /// 4. Call split_node on oldRoot, passing newRoot as parent.
3839    /// 5. Replace tree root with newRoot.
3840    /// ```
3841    fn split_root_if_needed(&self, lsn: Lsn) -> Result<(), TreeError> {
3842        // Hold `self.root.write()` across the needs_split check and the
3843        // root promotion, mirroring the first-key path fix and matching
3844        // the broader insert/split serialisation discipline.
3845        //
3846        // With the previous read-then-write pattern, two concurrent
3847        // splitters could each observe needs_split == true, then take()
3848        // and install in turn, with the second wrapping the first's
3849        // already-promoted root in its own new IN. Each level wraps the
3850        // previous, producing a chain of one-child internal nodes. No
3851        // data is lost (every entry is still reachable) but the tree
3852        // becomes unnecessarily deep, and the imbalance can compound
3853        // under heavy concurrent insertion.
3854        let mut root_guard = self.root.write();
3855        let needs_split = match root_guard.as_ref() {
3856            Some(arc) => {
3857                let g = arc.read();
3858                g.get_n_entries() >= self.max_entries_per_node
3859            }
3860            None => false,
3861        };
3862        if !needs_split {
3863            return Ok(());
3864        }
3865
3866        // Create a fresh new root one level above the current root.
3867        let old_root_arc = root_guard.take().expect("checked Some above");
3868        let old_root_level = {
3869            let g = old_root_arc.read();
3870            g.level()
3871        };
3872
3873        // newRoot = new IN(level = oldRoot.level + 1) with slot 0 = oldRoot.
3874        // The key at slot 0 is the virtual key (empty slice) following the
3875        // convention that entry-zero in an upper IN compares as -infinity.
3876        let new_root_arc =
3877            Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3878                node_id: generate_node_id(),
3879                level: old_root_level + 1,
3880                entries: vec![InEntry { key: vec![] }],
3881                // T-4: slot 0's resident child is the old root.
3882                targets: TargetRep::Sparse(vec![(0, old_root_arc.clone())]),
3883                dirty: true,
3884                generation: 0,
3885                parent: None,
3886                lsn_rep: LsnRep::from_lsns(&[lsn]),
3887            })));
3888
3889        // Update the old root's parent pointer to the new root.
3890        {
3891            let mut g = old_root_arc.write();
3892            g.set_parent(Some(Arc::downgrade(&new_root_arc)));
3893        }
3894
3895        // Install the new root before calling split_child so split_child
3896        // (which itself takes parent.write()) can run unencumbered.
3897        *root_guard = Some(new_root_arc.clone());
3898        drop(root_guard);
3899
3900        // Now split the old root (which is now child at slot 0 in new_root).
3901        Self::split_child(
3902            &new_root_arc,
3903            0, // child is at slot 0
3904            self.max_entries_per_node,
3905            lsn,
3906            SplitHint::Normal,
3907            &[], // no insertion key at root-init time
3908            self.key_comparator.as_ref(),
3909            self.key_prefixing,
3910            self.in_list_listener.as_ref(),
3911        )?;
3912
3913        // EVICTOR-RECLAIM-1: register the freshly-promoted root IN with the
3914        // evictor's LRU (JE Tree.splitRoot adds the new root to the INList).
3915        // split_child above already registers the new sibling.
3916        let new_root_id = match &*new_root_arc.read() {
3917            TreeNode::Internal(n) => n.node_id,
3918            TreeNode::Bottom(b) => b.node_id,
3919        };
3920        self.note_added(new_root_id);
3921
3922        self.root_splits.fetch_add(1, Ordering::Relaxed);
3923        Ok(())
3924    }
3925
3926    /// Splits the child at `child_index` in `parent`.
3927    ///
    /// This implementation always keeps the **left** half in the
3929    /// existing child node (`child_arc`) and puts the right half in the new
3930    /// sibling, regardless of where the `identifierKey` falls.  JE's
3931    /// `IN.splitInternal` (`idKeyIndex` logic ~line 4172) can place either
3932    /// half in the existing node; Noxu's preemptive-split discipline ensures
3933    /// the parent always has a free slot at split time (the split is done on
3934    /// the way *down*, before the parent fills up), so the safe simplification
3935    /// of always using the left half is correct here — no routing information
3936    /// is lost.  This comment replaces the previous incorrect claim that
3937    /// `idKeyIndex` drove the choice.
3938    ///
3939    /// Note: does not emit a split log entry; split nodes are marked dirty
3940    /// and flushed at the next checkpoint (flush_dirty_bins/upper_ins).
3941    ///
3942    /// ```text
3943    /// 1. splitIndex = child.nEntries / 2  (or 1 / n-1 for splitSpecial)
3944    /// 2. Create newSibling at the same level.
3945    /// 3. Move entries [splitIndex..nEntries) to newSibling.
3946    /// 4. Update parent slot childIndex -> child (left half),
3947    ///    insert newSibling with newIdKey after childIndex.
3948    /// ```
3949    fn split_child(
3950        parent: &Arc<RwLock<TreeNode>>,
3951        child_index: usize,
3952        max_entries: usize,
3953        lsn: Lsn,
3954        hint: SplitHint,
3955        insert_key: &[u8],
3956        key_comparator: Option<&KeyComparatorFn>,
3957        key_prefixing: bool,
3958        listener: Option<&Arc<dyn InListListener>>,
3959    ) -> Result<(), TreeError> {
3960        // The split is performed under `parent.write()` for the entire
3961        // duration. This is a deliberate choice for correctness:
3962        //
3963        // - Without it, between dropping `child.write()` (after installing
3964        //   the left half) and acquiring `parent.write()` (to install the
3965        //   sibling), a concurrent descender can pick `child_arc` from the
3966        //   parent (still pointing at it), descend, take `child.write()`
3967        //   and insert a key. Whether the descender's key belongs in the
3968        //   left half (now in `child`) or the right half (which will be
3969        //   in the new sibling) is determined by the parent's split key —
3970        //   but the parent doesn't know about the split key yet, so the
3971        //   descender's routing decision is based on stale data. If the
3972        //   descender's key falls in the right half, it lands in `child`
3973        //   (left half) where a future search will not find it: the
3974        //   future search descends from the root, the parent now has the
3975        //   sibling installed, the search routes the key to the sibling,
3976        //   the sibling does not contain the key — silently lost.
3977        //
3978        // - Holding `parent.write()` throughout serialises split_child
3979        //   against every descender that wants `parent.read()`. A
3980        //   descender already holding `parent.read()` (latch coupling
3981        //   from above) keeps split_child waiting at this lock until it
3982        //   has finished its own work. Combined, the split + sibling
3983        //   install is atomic with respect to descents.
3984        //
3985        // - Splits are infrequent compared to inserts (~ once per
3986        //   max_entries new keys) so the extra serialisation here does
3987        //   not dominate.
3988        //
3989        // Reproducer that exercises this race:
3990        // crates/noxu-db/tests/concurrent_commits_stress.rs.
3991        let mut parent_write_guard = parent.write();
3992
3993        // Extract the child Arc from the parent slot.
3994        let child_arc = match &*parent_write_guard {
3995            TreeNode::Internal(p) => {
3996                p.get_child(child_index).ok_or(TreeError::SplitRequired)?
3997            }
3998            TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
3999        };
4000
4001        // Gather all entries from the child plus split metadata, AND
4002        // perform the in-place left-half install, all under a single
4003        // write lock on the child. See the earlier comment on the race
4004        // this avoids inside split_child.
4005        let mut child_guard = child_arc.write();
4006        let child_level = child_guard.level();
4007        // St-H6: capture the splitting BIN's expiration_in_hours flag BEFORE
4008        // drop(child_guard) so the right-half sibling inherits it.
4009        // JE: BIN.java::setExpiration calls setExpirationInHours(hours) to
4010        // propagate the flag on split/clone; the Rust split was hardcoding
4011        // false instead of inheriting — this caused hours-granularity TTL
4012        // entries in the right sibling to be read with in_hours=false, making
4013        // the hours-since-epoch value compare as seconds-since-epoch (far in
4014        // the past) and every right-sibling TTL record appear expired.
4015        let bin_expiration_in_hours: bool = match &*child_guard {
4016            TreeNode::Bottom(b) => b.expiration_in_hours,
4017            // Internal nodes do not carry per-entry TTL; default to true
4018            // (the engine-wide invariant for any BIN that may hold TTL data).
4019            TreeNode::Internal(_) => true,
4020        };
4021        // T-2/T-5: the compact-key threshold the new sibling BIN inherits.
4022        // (Only consumed when the child is a BIN; an upper-IN split produces
4023        // upper-IN siblings, which have no compact key rep.)
4024        let bin_compact_max_key_length: i32 = match &*child_guard {
4025            TreeNode::Bottom(b) => b.compact_max_key_length,
4026            TreeNode::Internal(_) => INKeyRep_DEFAULT_MAX_KEY_LENGTH,
4027        };
4028        let (all_entries, bin_old_prefix) = match &*child_guard {
4029            TreeNode::Internal(n) => {
4030                // T-4: capture the parallel resident-child array alongside the
4031                // entries so children travel with their slots through the
4032                // split (JE `IN.split` copies `entryTargets`).
4033                let children: Vec<Option<ChildArc>> =
4034                    (0..n.entries.len()).map(|i| n.get_child(i)).collect();
4035                // T-3: capture the parallel per-slot LSNs so they travel with
4036                // their slots (JE `IN.split` copies `entryLsnByteArray`).
4037                let lsns: Vec<Lsn> =
4038                    (0..n.entries.len()).map(|i| n.get_lsn(i)).collect();
4039                (
4040                    SplitEntries::Internal(n.entries.clone(), children, lsns),
4041                    Vec::new(),
4042                )
4043            }
4044            TreeNode::Bottom(b) => {
4045                // Decompress to full keys.
4046                let full: Vec<BinEntry> = (0..b.entries.len())
4047                    .map(|i| BinEntry {
4048                        data: b.entries[i].data.clone(),
4049                        known_deleted: b.entries[i].known_deleted,
4050                        dirty: b.entries[i].dirty,
4051                        expiration_time: b.entries[i].expiration_time,
4052                    })
4053                    .collect();
4054                let lsns: Vec<Lsn> =
4055                    (0..b.entries.len()).map(|i| b.get_lsn(i)).collect();
4056                // T-2: carry FULL keys through the split; the new BINs
4057                // recompute their own prefix from them.
4058                let full_keys: Vec<Vec<u8>> = (0..b.entries.len())
4059                    .map(|i| b.get_full_key(i).unwrap_or_default())
4060                    .collect();
4061                (
4062                    SplitEntries::Bottom(full, lsns, full_keys),
4063                    b.key_prefix.clone(),
4064                )
4065            }
4066        };
4067
4068        // Determine split point — JE `IN.splitSpecial` / `IN.splitInternal`.
4069        //
4070        // Normal midpoint: `n_entries / 2`.
4071        // AllLeft:  insertion key is at position 0 on every descend level.
4072        //   → split_index = 1 (left half keeps n-1 entries; new right sibling
4073        //     gets only the former-first slot, then the insertion fills it).
4074        //   This matches JE: `if (leftSide && index == 0) splitInternal(…, 1)`.
4075        // AllRight: insertion key is at the last position on every level.
4076        //   → split_index = n_entries - 1 (left half keeps all but one entry).
4077        //   JE: `else if (!leftSide && index == nEntries-1) splitInternal(…, nEntries-1)`.
4078        //
4079        // Ref: `IN.java` splitSpecial ~line 4129, splitInternal ~line 4159.
4080        let n_entries = all_entries.len();
4081        let split_index = if n_entries >= 2 {
4082            // Find where insert_key falls in the child.
4083            let insert_idx = {
4084                let mut idx = 0usize;
4085                for i in 1..n_entries {
4086                    let ord = match key_comparator {
4087                        Some(cmp) => cmp(all_entries.get_key(i), insert_key),
4088                        None => all_entries.get_key(i).cmp(insert_key),
4089                    };
4090                    if ord != std::cmp::Ordering::Greater {
4091                        idx = i;
4092                    } else {
4093                        break;
4094                    }
4095                }
4096                idx
4097            };
4098            match hint {
4099                SplitHint::AllLeft if insert_idx == 0 => 1,
4100                SplitHint::AllRight if insert_idx == n_entries - 1 => {
4101                    n_entries - 1
4102                }
4103                _ => n_entries / 2,
4104            }
4105        } else {
4106            n_entries / 2
4107        };
4108
4109        // newIdKey — the full key of the first entry of the right half.
4110        // For BIN: entries are already full keys after decompression above.
4111        // For IN:  entries carry full keys directly.
4112        let new_id_key = all_entries.get_key(split_index).to_vec();
4113        // Suppress unused-variable warning when no BIN is involved.
4114        let _ = &bin_old_prefix;
4115
4116        // Divide into left and right halves.
4117        let left_entries = all_entries.slice(0, split_index);
4118        let right_entries = all_entries.slice(split_index, n_entries);
4119
4120        // Install the left half into `child_arc` (still under the same
4121        // write lock) and mark the node dirty.
4122        match (&mut *child_guard, &left_entries) {
4123            (TreeNode::Internal(n), SplitEntries::Internal(le, lc, ll)) => {
4124                n.entries = le.clone();
4125                // T-4: reinstall the (now-shorter) left child array.
4126                n.targets = TargetRep::None;
4127                for (i, c) in lc.iter().enumerate() {
4128                    if let Some(child) = c {
4129                        n.set_child(i, Some(child.clone()));
4130                    }
4131                }
4132                // T-3: reinstall the (now-shorter) left LSN array.
4133                n.lsn_rep = LsnRep::from_lsns(ll);
4134            }
4135            (TreeNode::Bottom(b), SplitEntries::Bottom(le, ll, lk)) => {
4136                // Reset prefix; keys arrive as FULL keys (no prefix yet).
4137                b.key_prefix = Vec::new();
4138                // Pre-allocate at max_entries capacity so the left half
4139                // does not need to reallocate on the next insert (Fix 3).
4140                let mut left = Vec::with_capacity(max_entries);
4141                left.extend_from_slice(le);
4142                b.entries = left;
4143                // T-3: reinstall the left LSN array.
4144                b.lsn_rep = LsnRep::from_lsns(ll);
4145                // T-2: reinstall the left key rep from the full keys (Default;
4146                // recompute_key_prefix below compresses + compacts).
4147                b.keys = KeyRep::from_keys(lk.clone());
4148                // Recompute prefix on each half after split (only when
4149                // key_prefixing is enabled for this database).
4150                // JE: IN.computeKeyPrefix returns null when
4151                // databaseImpl.getKeyPrefixing() is false.
4152                // Ref: IN.java computeKeyPrefix ~line 2456.
4153                if key_prefixing && b.entries.len() >= 2 {
4154                    b.recompute_key_prefix();
4155                } else {
4156                    b.keys.compact(b.compact_max_key_length); // T-2
4157                }
4158            }
4159            _ => return Err(TreeError::SplitRequired),
4160        }
4161        child_guard.set_dirty(true);
4162        drop(child_guard);
4163
4164        // Create the new right-half sibling.
4165        // Parent pointer will be wired in when it is inserted into the parent.
4166        let new_sibling = match right_entries {
4167            SplitEntries::Internal(re, rc, rl) => {
4168                let mut rin = InNodeStub {
4169                    node_id: generate_node_id(),
4170                    level: child_level,
4171                    entries: re,
4172                    targets: TargetRep::None,
4173                    dirty: true,
4174                    generation: 0,
4175                    parent: None, // set below
4176                    // T-3: the right half's per-slot LSNs.
4177                    lsn_rep: LsnRep::from_lsns(&rl),
4178                };
4179                // T-4: install the right half's resident children.
4180                for (i, c) in rc.into_iter().enumerate() {
4181                    if c.is_some() {
4182                        rin.set_child(i, c);
4183                    }
4184                }
4185                Arc::new(RwLock::new(TreeNode::Internal(rin)))
4186            }
4187            SplitEntries::Bottom(re, rl, rk) => {
4188                // Entries arrive as FULL keys; build BinStub with no prefix
4189                // then recompute key prefix for the new sibling.
4190                // Pre-allocate at max_entries capacity so the right half
4191                // does not need to reallocate on the next insert (Fix 3).
4192                let mut right = Vec::with_capacity(max_entries);
4193                right.extend(re);
4194                let mut sibling_bin = BinStub {
4195                    node_id: generate_node_id(),
4196                    level: child_level,
4197                    entries: right,
4198                    key_prefix: Vec::new(),
4199                    dirty: true,
4200                    is_delta: false,
4201                    last_full_lsn: NULL_LSN,
4202                    last_delta_lsn: NULL_LSN,
4203                    generation: 0,
4204                    parent: None, // set below
4205                    // St-H6 fix: inherit the splitting BIN's flag so that
4206                    // is_expired() uses the correct granularity for entries
4207                    // that were already in the BIN before the split.
4208                    // JE reference: BIN.java::split() propagates
4209                    // expirationInHours via setExpirationInHours(hours).
4210                    expiration_in_hours: bin_expiration_in_hours,
4211                    cursor_count: 0,
4212                    prohibit_next_delta: false,
4213                    // T-3: the right half's per-slot LSNs.
4214                    lsn_rep: LsnRep::from_lsns(&rl),
4215                    // T-2: full keys (Default); recompute/compact below.
4216                    keys: KeyRep::from_keys(rk),
4217                    compact_max_key_length: bin_compact_max_key_length,
4218                };
4219                // St-H6 debug guard: the sibling must carry the same flag as
4220                // the splitting BIN so that in_hours-resolution entries are
4221                // never silently expired by a mismatched false flag.
4222                debug_assert_eq!(
4223                    sibling_bin.expiration_in_hours, bin_expiration_in_hours,
4224                    "St-H6 invariant: sibling BIN expiration_in_hours must \
4225                     match the splitting BIN (got {}, expected {})",
4226                    sibling_bin.expiration_in_hours, bin_expiration_in_hours
4227                );
4228
4229                if key_prefixing && sibling_bin.entries.len() >= 2 {
4230                    sibling_bin.recompute_key_prefix();
4231                } else {
4232                    sibling_bin.keys.compact(bin_compact_max_key_length); // T-2
4233                }
4234                Arc::new(RwLock::new(TreeNode::Bottom(sibling_bin)))
4235            }
4236        };
4237
4238        // Note: the child (left half) was marked dirty earlier under the
4239        // same write lock that installed left_entries; no need to re-take
4240        // the write lock here.
4241
4242        // Insert the new sibling into the parent after child_index.
4243        // We already hold `parent.write()` (taken at the top of the
4244        // function); operate on it directly rather than re-acquiring.
4245        match &mut *parent_write_guard {
4246            TreeNode::Internal(p) => {
4247                let insert_pos = child_index + 1;
4248                // T-4: insert the parent slot and set its cached child via the
4249                // node-level INTargetRep (shifting existing children).
4250                p.insert_entry(
4251                    insert_pos,
4252                    new_id_key,
4253                    lsn,
4254                    Some(new_sibling.clone()),
4255                );
4256                // Parent is dirty because it gained a new entry.
4257                p.dirty = true;
4258            }
4259            TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
4260        }
4261
4262        // Wire the new sibling's parent pointer to the parent node
4263        // before releasing parent_write_guard, so a future descent that
4264        // takes parent.read() and finds the sibling immediately sees a
4265        // fully-wired parent pointer.
4266        {
4267            let mut g = new_sibling.write();
4268            g.set_parent(Some(Arc::downgrade(parent)));
4269        }
4270        // T-4: when an upper IN split, the children that moved into the new
4271        // sibling must have their parent back-pointers re-wired to the
4272        // sibling (JE re-parents moved targets in IN.split).
4273        {
4274            let sg = new_sibling.read();
4275            if let TreeNode::Internal(sn) = &*sg {
4276                let moved = sn.resident_children();
4277                drop(sg);
4278                for child in moved {
4279                    let mut cg = child.write();
4280                    cg.set_parent(Some(Arc::downgrade(&new_sibling)));
4281                }
4282            }
4283        }
4284        drop(parent_write_guard);
4285
4286        // EVICTOR-RECLAIM-1: register the freshly-split sibling with the
4287        // evictor's LRU (JE IN.splitInternal calls inList.add(newSibling)).
4288        // Without this, split-created BINs/INs are invisible to the evictor:
4289        // the policy lists never receive them, every evict_batch phase quota
4290        // is 0, and eviction reclaims nothing under pressure even though the
4291        // nodes are fully resident.  Only the very first root+BIN (the
4292        // first-key path) and re-fetched nodes were ever registered.
4293        if let Some(l) = listener {
4294            let sibling_id = match &*new_sibling.read() {
4295                TreeNode::Internal(n) => n.node_id,
4296                TreeNode::Bottom(b) => b.node_id,
4297            };
4298            l.note_ins_added(sibling_id);
4299        }
4300
4301        Ok(())
4302    }
4303
4304    /// Recursive insert with preemptive splitting.
4305    ///
4306    /// Top-down traversal in `Tree.forceSplit` +
4307    /// `Tree.searchSplitsAllowed`:
4308    ///
4309    /// 1. At an upper IN: find which child slot covers `key`, split the child
4310    ///    proactively if it is full (so we always have room to insert the split
4311    ///    key into the parent), then recurse into the appropriate child.
4312    /// 2. At a BIN: insert the key/data directly.
4313    ///
4314    /// This implements the "preemptive splitting" strategy from the: we split
4315    /// children on the way down so we never need to walk back up.
4316    fn insert_recursive(
4317        node_arc: &Arc<RwLock<TreeNode>>,
4318        key: Vec<u8>,
4319        data: Vec<u8>,
4320        lsn: Lsn,
4321        max_entries: usize,
4322        key_comparator: Option<&KeyComparatorFn>,
4323        key_prefixing: bool,
4324        listener: Option<&Arc<dyn InListListener>>,
4325    ) -> Result<bool, TreeError> {
4326        Self::insert_recursive_inner(
4327            node_arc,
4328            key,
4329            data,
4330            lsn,
4331            max_entries,
4332            key_comparator,
4333            key_prefixing,
4334            true, // all_left_so_far
4335            true, // all_right_so_far
4336            listener,
4337        )
4338    }
4339
4340    /// Inner recursive helper that threads `allLeftSideDescent` /
4341    /// `allRightSideDescent` from `Tree.forceSplit` (JE ~line 1912).
4342    ///
4343    /// Both flags start `true` at the root and are cleared as soon as the
4344    /// descent takes a non-leftmost / non-rightmost child slot.  At split
4345    /// time they are forwarded to `split_child` which uses them to pick the
4346    /// `splitSpecial` split index (JE `IN.splitSpecial` ~line 4129).
4347    #[allow(clippy::too_many_arguments)]
4348    fn insert_recursive_inner(
4349        node_arc: &Arc<RwLock<TreeNode>>,
4350        key: Vec<u8>,
4351        data: Vec<u8>,
4352        lsn: Lsn,
4353        max_entries: usize,
4354        key_comparator: Option<&KeyComparatorFn>,
4355        key_prefixing: bool,
4356        all_left_so_far: bool,
4357        all_right_so_far: bool,
4358        listener: Option<&Arc<dyn InListListener>>,
4359    ) -> Result<bool, TreeError> {
4360        // Determine if this is a BIN (leaf level).
4361        //
4362        // We hold a read lock on `node_arc` (the parent of any descent we
4363        // do below) for the duration of this call, releasing it just
4364        // before returning. That achieves *latch coupling*: a concurrent
4365        // `split_child(parent, …)` that wants to reorganise our subtree
4366        // ultimately needs `parent.write()` to install the new sibling,
4367        // and that write blocks until our read lock is dropped. Without
4368        // this, the descender-vs-splitter race goes:
4369        //
4370        //   T_X: at root, picks child_arc (BIN), drops root read lock.
4371        //   T_Y: at root, runs split_child(root, …): takes child_arc.write(),
4372        //        installs left half [E1..E5], creates sibling [E6..E10],
4373        //        takes root.write() and inserts the sibling.
4374        //   T_X: now takes child_arc.write() and inserts a key whose
4375        //        sort order falls in the right half. The key lands in
4376        //        child_arc (left half) but a future search descending
4377        //        from the root routes that key to the new sibling and
4378        //        does not find it — silently lost.
4379        //
4380        // Reproducer: noxu-db/tests/concurrent_commits_stress.rs
4381        // (32 threads × 100 keys, ~1–6 lost writes per run before this fix;
4382        // occasionally hundreds when an entire BIN is orphaned).
4383        let parent_guard = node_arc.read();
4384        let is_bin = parent_guard.is_bin();
4385
4386        if is_bin {
4387            // BIN: drop the read lock and take the write lock; this is
4388            // safe because the *outer* call frame still holds a read
4389            // lock on this BIN's parent (or this is the root, in which
4390            // case the first-key path has already initialised it). A
4391            // concurrent split_child(parent, …) cannot run while the
4392            // outer parent.read() is held, so the BIN cannot be
4393            // restructured between dropping our read lock and acquiring
4394            // our write lock.
4395            drop(parent_guard);
4396            let mut guard = node_arc.write();
4397            match &mut *guard {
4398                TreeNode::Bottom(bin) => {
4399                    let is_new = if let Some(cmp) = key_comparator {
4400                        // Comparator-based insert: no prefix compression.
4401                        let (_idx, new) =
4402                            bin.insert_cmp(key, lsn, Some(data), cmp.as_ref());
4403                        new
4404                    } else if key_prefixing {
4405                        // insert_with_prefix handles prefix recomputation when
4406                        // the new key shrinks the existing prefix, and also
4407                        // initialises the prefix when 2 entries are present for
4408                        // the first time.
4409                        let (_idx, new) =
4410                            bin.insert_with_prefix(key, lsn, Some(data));
4411                        new
4412                    } else {
4413                        // key_prefixing disabled: store full key, no prefix.
4414                        // JE: IN.computeKeyPrefix returns null when
4415                        // databaseImpl.getKeyPrefixing() is false.
4416                        // Ref: IN.java computeKeyPrefix ~line 2456.
4417                        let (_idx, new) = bin.insert_raw(key, lsn, Some(data));
4418                        new
4419                    };
4420                    // Mark dirty after any modification.
4421                    bin.dirty = true;
4422                    Ok(is_new)
4423                }
4424                TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4425            }
4426        } else {
4427            // Upper IN: find the child slot that covers key.
4428            // Index = parent.findEntry(key, false, false)
4429            // Entry zero in an upper IN has a virtual key (-infinity), so
4430            // any real key is routed to at least slot 0.
4431            let (child_index, n_entries_at_level, child_arc) =
4432                match &*parent_guard {
4433                    TreeNode::Internal(n) => {
4434                        // Binary search for the largest key <= search key.
4435                        // Slot 0 always matches (virtual key = -infinity).
4436                        let mut idx = 0usize;
4437                        for (i, entry) in n.entries.iter().enumerate() {
4438                            if i == 0 {
4439                                idx = 0;
4440                            } else {
4441                                let ord = match key_comparator {
4442                                    Some(cmp) => cmp(
4443                                        entry.key.as_slice(),
4444                                        key.as_slice(),
4445                                    ),
4446                                    None => {
4447                                        entry.key.as_slice().cmp(key.as_slice())
4448                                    }
4449                                };
4450                                if ord != std::cmp::Ordering::Greater {
4451                                    idx = i;
4452                                } else {
4453                                    break;
4454                                }
4455                            }
4456                        }
4457                        let child =
4458                            n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4459                        (idx, n.entries.len(), child)
4460                    }
4461                    TreeNode::Bottom(_) => {
4462                        return Err(TreeError::SplitRequired);
4463                    }
4464                };
4465
4466            // Update the descent-side flags (JE `Tree.forceSplit` ~1959).
4467            // `allLeftSideDescent`  ← still true only if we chose slot 0.
4468            // `allRightSideDescent` ← still true only if we chose the last slot.
4469            let all_left = all_left_so_far && child_index == 0;
4470            let all_right = all_right_so_far
4471                && child_index == n_entries_at_level.saturating_sub(1);
4472
4473            // Proactively split the child if it is full.
4474            // If (child.needsSplitting()) child.split(parent, ...)
4475            let child_full = {
4476                let g = child_arc.read();
4477                g.get_n_entries() >= max_entries
4478            };
4479
4480            if child_full {
4481                // Build the splitSpecial hint from the accumulated flags.
4482                // JE `Tree.forceSplit` ~line 2010:
4483                //   if (allLeftSideDescent || allRightSideDescent)
4484                //       child.splitSpecial(parent, index, grandParent,
4485                //           maxTreeEntriesPerNode, key, allLeftSideDescent)
4486                let hint = match (all_left, all_right) {
4487                    (true, _) => SplitHint::AllLeft,
4488                    (_, true) => SplitHint::AllRight,
4489                    _ => SplitHint::Normal,
4490                };
4491                // split_child(parent, …) needs parent.write(); we must
4492                // drop our parent read lock before calling it.
4493                drop(parent_guard);
4494                Self::split_child(
4495                    node_arc,
4496                    child_index,
4497                    max_entries,
4498                    lsn,
4499                    hint,
4500                    &key,
4501                    key_comparator,
4502                    key_prefixing,
4503                    listener,
4504                )?;
4505
4506                // After the split, re-find which child now covers key.
4507                // Re-enter at the top of the inner function; carry the
4508                // flags (the new topology doesn't invalidate them — we
4509                // still know the overall descent direction).
4510                return Self::insert_recursive_inner(
4511                    node_arc,
4512                    key,
4513                    data,
4514                    lsn,
4515                    max_entries,
4516                    key_comparator,
4517                    key_prefixing,
4518                    all_left_so_far,
4519                    all_right_so_far,
4520                    listener,
4521                );
4522            }
4523
4524            // Descend into the child while still holding parent_guard.
4525            // The recursive call will hold child.read() before this
4526            // returns, then drop it; combined with our parent_guard,
4527            // the latch coupling chain is preserved on the way down and
4528            // unwound on the way back up.
4529            let r = Self::insert_recursive_inner(
4530                &child_arc,
4531                key,
4532                data,
4533                lsn,
4534                max_entries,
4535                key_comparator,
4536                key_prefixing,
4537                all_left,
4538                all_right,
4539                listener,
4540            );
4541            drop(parent_guard);
4542            r
4543        }
4544    }
4545
4546    /// Slice-based variant of [`Tree::insert_recursive`] for the recovery redo path.
4547    ///
4548    /// Accepts `key: &[u8]` and `data: Option<&[u8]>` instead of owned
4549    /// `Vec<u8>` values.  At the BIN leaf, calls
4550    /// [`BinStub::insert_with_prefix_slice`] which copies bytes into the
4551    /// `BinEntry` exactly once.
4552    ///
4553    /// For the comparator path (custom key comparator), falls back to
4554    /// `insert_cmp` with a one-time `to_vec()` conversion — that path is
4555    /// rare in practice (sorted-dup databases only) and is not on the
4556    /// W11 hot path.
4557    ///
4558    /// Wave 11-K optimisation (Fix 1).
4559    fn redo_insert_recursive(
4560        node_arc: &Arc<RwLock<TreeNode>>,
4561        key: &[u8],
4562        data: Option<&[u8]>,
4563        lsn: Lsn,
4564        max_entries: usize,
4565        key_comparator: Option<&KeyComparatorFn>,
4566        key_prefixing: bool,
4567    ) -> Result<bool, TreeError> {
4568        Self::redo_insert_recursive_inner(
4569            node_arc,
4570            key,
4571            data,
4572            lsn,
4573            max_entries,
4574            key_comparator,
4575            key_prefixing,
4576            true,
4577            true,
4578        )
4579    }
4580
4581    #[allow(clippy::too_many_arguments)]
4582    fn redo_insert_recursive_inner(
4583        node_arc: &Arc<RwLock<TreeNode>>,
4584        key: &[u8],
4585        data: Option<&[u8]>,
4586        lsn: Lsn,
4587        max_entries: usize,
4588        key_comparator: Option<&KeyComparatorFn>,
4589        key_prefixing: bool,
4590        all_left_so_far: bool,
4591        all_right_so_far: bool,
4592    ) -> Result<bool, TreeError> {
4593        let parent_guard = node_arc.read();
4594        let is_bin = parent_guard.is_bin();
4595
4596        if is_bin {
4597            drop(parent_guard);
4598            let mut guard = node_arc.write();
4599            match &mut *guard {
4600                TreeNode::Bottom(bin) => {
4601                    // REC-F2: JE redo currency check
4602                    // (RecoveryManager.redo() line ~2512/2544).  A logged LN
4603                    // is applied only when logrecLsn > treeLsn.  If the slot
4604                    // already holds an equal-or-newer LSN, skip the overwrite
4605                    // so an out-of-order (older-LSN) redo cannot revert
4606                    // committed data or reset the slot LSN backward.  This
4607                    // makes redo genuinely idempotent regardless of
4608                    // redo/undo phase order.  Deletes never reach this path
4609                    // (redo_ln routes Delete through tree.delete), so the JE
4610                    // "lsnCmp == 0 && isDeletion -> set KD" sub-case does not
4611                    // apply here.
4612                    let cmp_ref = key_comparator.map(|c| {
4613                        c.as_ref()
4614                            as &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering
4615                    });
4616                    if let Some(slot_lsn) =
4617                        bin.redo_slot_lsn(key, cmp_ref, key_prefixing)
4618                        && lsn <= slot_lsn
4619                    {
4620                        // Tree already holds an equal-or-newer version.
4621                        return Ok(false);
4622                    }
4623                    let is_new = if let Some(cmp) = key_comparator {
4624                        // Comparator path: fall back to owned-Vec variant.
4625                        let (_idx, new) = bin.insert_cmp(
4626                            key.to_vec(),
4627                            lsn,
4628                            data.map(|d| d.to_vec()),
4629                            cmp.as_ref(),
4630                        );
4631                        new
4632                    } else if key_prefixing {
4633                        let (_idx, new) =
4634                            bin.insert_with_prefix_slice(key, lsn, data);
4635                        new
4636                    } else {
4637                        // key_prefixing disabled: store full key verbatim.
4638                        // Ref: IN.java computeKeyPrefix ~line 2456.
4639                        let (_idx, new) = bin.insert_raw(
4640                            key.to_vec(),
4641                            lsn,
4642                            data.map(|d| d.to_vec()),
4643                        );
4644                        new
4645                    };
4646                    bin.dirty = true;
4647                    Ok(is_new)
4648                }
4649                TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4650            }
4651        } else {
4652            let (child_index, n_entries_at_level, child_arc) =
4653                match &*parent_guard {
4654                    TreeNode::Internal(n) => {
4655                        let mut idx = 0usize;
4656                        for (i, entry) in n.entries.iter().enumerate() {
4657                            if i == 0 {
4658                                idx = 0;
4659                            } else {
4660                                let ord = match key_comparator {
4661                                    Some(cmp) => cmp(entry.key.as_slice(), key),
4662                                    None => entry.key.as_slice().cmp(key),
4663                                };
4664                                if ord != std::cmp::Ordering::Greater {
4665                                    idx = i;
4666                                } else {
4667                                    break;
4668                                }
4669                            }
4670                        }
4671                        let child =
4672                            n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4673                        (idx, n.entries.len(), child)
4674                    }
4675                    TreeNode::Bottom(_) => {
4676                        return Err(TreeError::SplitRequired);
4677                    }
4678                };
4679
4680            let all_left = all_left_so_far && child_index == 0;
4681            let all_right = all_right_so_far
4682                && child_index == n_entries_at_level.saturating_sub(1);
4683
4684            let child_full = {
4685                let g = child_arc.read();
4686                g.get_n_entries() >= max_entries
4687            };
4688
4689            if child_full {
4690                let hint = match (all_left, all_right) {
4691                    (true, _) => SplitHint::AllLeft,
4692                    (_, true) => SplitHint::AllRight,
4693                    _ => SplitHint::Normal,
4694                };
4695                drop(parent_guard);
4696                Self::split_child(
4697                    node_arc,
4698                    child_index,
4699                    max_entries,
4700                    lsn,
4701                    hint,
4702                    key,
4703                    key_comparator,
4704                    key_prefixing,
4705                    // Recovery redo path: the listener is not active during
4706                    // log replay (the evictor is wired AFTER recovery, and
4707                    // the INList is rebuilt separately).  EVICTOR-RECLAIM-1
4708                    // registration happens on the live insert path.
4709                    None,
4710                )?;
4711                return Self::redo_insert_recursive_inner(
4712                    node_arc,
4713                    key,
4714                    data,
4715                    lsn,
4716                    max_entries,
4717                    key_comparator,
4718                    key_prefixing,
4719                    all_left_so_far,
4720                    all_right_so_far,
4721                );
4722            }
4723
4724            let r = Self::redo_insert_recursive_inner(
4725                &child_arc,
4726                key,
4727                data,
4728                lsn,
4729                max_entries,
4730                key_comparator,
4731                key_prefixing,
4732                all_left,
4733                all_right,
4734            );
4735            drop(parent_guard);
4736            r
4737        }
4738    }
4739
4740    /// Pre-warm the tree's internal `Vec<BinEntry>` capacity before a redo
4741    /// pass that will insert approximately `n` records.
4742    ///
4743    /// If the tree is empty, this is a no-op (there is no BIN yet to reserve
4744    /// capacity on).  If the tree already has a root BIN (from a previous
4745    /// checkpoint), reserves `n.min(max_entries_per_node)` additional slots
4746    /// in that BIN's entries vector, eliminating the resize-double cycle
4747    /// during the redo loop.
4748    ///
4749    /// Wave 11-K optimisation (Fix 3).
4750    pub fn reserve_redo_capacity(&self, n: usize) {
4751        if n == 0 {
4752            return;
4753        }
4754        let root = match self.get_root() {
4755            Some(r) => r,
4756            None => return,
4757        };
4758        // Descend to the leftmost BIN and reserve there.
4759        let mut arc = root;
4760        loop {
4761            let guard = arc.read();
4762            match &*guard {
4763                TreeNode::Bottom(bin_guard) => {
4764                    let additional = n
4765                        .min(self.max_entries_per_node)
4766                        .saturating_sub(bin_guard.entries.len());
4767                    drop(guard);
4768                    let mut wguard = arc.write();
4769                    if let TreeNode::Bottom(bin) = &mut *wguard {
4770                        bin.entries.reserve(additional);
4771                    }
4772                    return;
4773                }
4774                TreeNode::Internal(inner) => {
4775                    let child = inner.get_child(0);
4776                    drop(guard);
4777                    match child {
4778                        Some(c) => arc = c,
4779                        None => return,
4780                    }
4781                }
4782            }
4783        }
4784    }
4785
4786    /// Get the first (leftmost) BIN in the tree.
4787    ///
4788    /// Descends to the leftmost BIN by
4789    /// always following the first child slot at each upper IN level.
4790    pub fn get_first_node(&self) -> Option<SearchResult> {
4791        let mut guard: parking_lot::ArcRwLockReadGuard<
4792            parking_lot::RawRwLock,
4793            TreeNode,
4794        > = self.get_root()?.read_arc();
4795
4796        loop {
4797            if guard.is_bin() {
4798                let n = guard.get_n_entries();
4799                if n == 0 {
4800                    return None;
4801                }
4802                // TREE-F1: return the first LIVE slot, skipping known_deleted
4803                // slots (CursorImpl.java:2062-2064).  If the leftmost BIN is
4804                // entirely KD during the reconstitution window the cursor's
4805                // get_first falls through to its cross-BIN advance.
4806                if let TreeNode::Bottom(b) = &*guard {
4807                    match (0..b.entries.len()).find(|&i| b.slot_is_live(i)) {
4808                        Some(i) => {
4809                            return Some(SearchResult::with_values(
4810                                true, i as i32, false,
4811                            ));
4812                        }
4813                        None => return None,
4814                    }
4815                }
4816                return Some(SearchResult::with_values(true, 0, false));
4817            }
4818
4819            // Capture the leftmost child Arc while holding `guard`, then
4820            // hand-over-hand: take the child read lock before releasing
4821            // the parent's. Same race fix as `Tree::search`.
4822            let next_arc = match &*guard {
4823                TreeNode::Internal(n_node) => n_node.get_child(0)?,
4824                _ => return None,
4825            };
4826            let next_guard = next_arc.read_arc();
4827            drop(guard);
4828            guard = next_guard;
4829        }
4830    }
4831
4832    /// Get the last (rightmost) BIN in the tree.
4833    ///
4834    /// Descends to the rightmost BIN by
4835    /// always following the last child slot at each upper IN level.
4836    pub fn get_last_node(&self) -> Option<SearchResult> {
4837        let mut guard: parking_lot::ArcRwLockReadGuard<
4838            parking_lot::RawRwLock,
4839            TreeNode,
4840        > = self.get_root()?.read_arc();
4841
4842        loop {
4843            if guard.is_bin() {
4844                let n = guard.get_n_entries();
4845                if n == 0 {
4846                    return None;
4847                }
4848                // TREE-F1: return the last LIVE slot, skipping known_deleted
4849                // slots (CursorImpl.java:2062-2064).
4850                if let TreeNode::Bottom(b) = &*guard {
4851                    match (0..b.entries.len())
4852                        .rev()
4853                        .find(|&i| b.slot_is_live(i))
4854                    {
4855                        Some(i) => {
4856                            return Some(SearchResult::with_values(
4857                                true, i as i32, false,
4858                            ));
4859                        }
4860                        None => return None,
4861                    }
4862                }
4863                return Some(SearchResult::with_values(
4864                    true,
4865                    (n - 1) as i32,
4866                    false,
4867                ));
4868            }
4869
4870            // Capture the rightmost child Arc while holding `guard`, then
4871            // hand-over-hand: take the child read lock before releasing
4872            // the parent's. Same race fix as `Tree::search`.
4873            let next_arc = match &*guard {
4874                TreeNode::Internal(n_node) => {
4875                    n_node.get_child(n_node.entries.len().saturating_sub(1))?
4876                }
4877                _ => return None,
4878            };
4879            let next_guard = next_arc.read_arc();
4880            drop(guard);
4881            guard = next_guard;
4882        }
4883    }
4884
4885    /// Returns the number of root splits that have occurred.
4886    pub fn get_root_splits(&self) -> u64 {
4887        self.root_splits.load(Ordering::Relaxed)
4888    }
4889
4890    /// Returns the number of relatches required.
4891    pub fn get_relatches_required(&self) -> u64 {
4892        self.relatches_required.load(Ordering::Relaxed)
4893    }
4894
4895    /// Delete a key from the tree.
4896    ///
4897    /// Traverses the tree to find the BIN that should contain the key, then
4898    /// removes the entry. Returns true if the key was found and removed.
4899    ///
4900    /// Delete path in `Tree` from the.
4901    ///
4902    /// In-memory removal only — WAL logging for deletes is handled by the
4903    /// cursor layer (`cursor_impl.rs::log_ln_write`) before this is called,
4904    /// matching separation between LN logging and tree mutation.
4905    pub fn delete(&self, key: &[u8]) -> bool {
4906        let root = match self.get_root() {
4907            Some(r) => r,
4908            None => return false,
4909        };
4910
4911        // F8 consistency: insert accounts key + data + BIN_ENTRY_OVERHEAD; delete must
4912        // subtract the SAME (data_len was previously omitted, leaking
4913        // data_len from the cache counter on every delete and biasing the
4914        // evictor's over-budget view). Peek the data length before deleting.
4915        let data_len = if self.memory_counter.is_some() {
4916            self.search_with_data(key)
4917                .filter(|sf| sf.found)
4918                .and_then(|sf| sf.data.as_ref().map(|d| d.len()))
4919                .unwrap_or(0)
4920        } else {
4921            0
4922        };
4923
4924        let deleted =
4925            Self::delete_recursive(&root, key, self.key_comparator.as_ref());
4926
4927        // Update the memory counter when an entry is removed.
4928        // IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage(-delta).
4929        if deleted && let Some(counter) = &self.memory_counter {
4930            let delta = (key.len() + data_len + BIN_ENTRY_OVERHEAD) as i64;
4931            counter.fetch_sub(delta, Ordering::Relaxed);
4932        }
4933
4934        deleted
4935    }
4936
4937    /// Recursive helper for `delete`: descend to the BIN that holds `key`
4938    /// and remove it.
4939    fn delete_recursive(
4940        node_arc: &Arc<RwLock<TreeNode>>,
4941        key: &[u8],
4942        key_comparator: Option<&KeyComparatorFn>,
4943    ) -> bool {
4944        // Latch coupling, mirroring `insert_recursive`. Without this,
4945        // delete has the same "BIN split out from under us" race: thread
4946        // A finds child_arc as the target BIN under parent.read(), drops
4947        // the lock, and another thread runs split_child(parent, …) that
4948        // moves the target key into the new sibling. A then takes
4949        // child_arc.write(), looks for the key in the (now left-half)
4950        // BIN, doesn't find it, and returns `false`. The caller treats
4951        // the `false` as "key was not present", but the key is actually
4952        // still in the tree (in the sibling). Subsequent operations
4953        // observe a stale record that should have been deleted —
4954        // semantically a lost delete.
4955        let parent_guard = node_arc.read();
4956        let is_bin = parent_guard.is_bin();
4957        let child_arc = if !is_bin {
4958            match &*parent_guard {
4959                TreeNode::Internal(n) => {
4960                    // Find child slot with largest key <= search key
4961                    let mut idx = 0usize;
4962                    for (i, entry) in n.entries.iter().enumerate() {
4963                        if i == 0 {
4964                            idx = 0;
4965                        } else {
4966                            let ord = match key_comparator {
4967                                Some(cmp) => cmp(entry.key.as_slice(), key),
4968                                None => entry.key.as_slice().cmp(key),
4969                            };
4970                            if ord != std::cmp::Ordering::Greater {
4971                                idx = i;
4972                            } else {
4973                                break;
4974                            }
4975                        }
4976                    }
4977                    n.get_child(idx)
4978                }
4979                _ => None,
4980            }
4981        } else {
4982            None
4983        };
4984
4985        if is_bin {
4986            // Drop the read lock before taking the write lock; the outer
4987            // call frame still holds the parent read lock so a concurrent
4988            // split_child cannot run on this BIN's parent until we unwind.
4989            drop(parent_guard);
4990            let mut g = node_arc.write();
4991            match &mut *g {
4992                TreeNode::Bottom(bin) => {
4993                    if let Some(cmp) = key_comparator {
4994                        bin.delete_cmp(key, cmp.as_ref())
4995                    } else {
4996                        // Entries store compressed (suffix) keys when key_prefix
4997                        // is non-empty.  Compress the search key before comparing.
4998                        //
4999                        // The caller is not required to ensure that `key`
5000                        // shares this BIN's learned `key_prefix` — a stray
5001                        // delete of a key that was never present (or that
5002                        // sits under a different prefix) is legal and must
5003                        // simply return `false`.  Calling `compress_key`
5004                        // unconditionally would `debug_assert!`-panic on
5005                        // such inputs, so guard it the same way the cursor
5006                        // path does.
5007                        if !bin.key_prefix.is_empty()
5008                            && !key.starts_with(bin.key_prefix.as_slice())
5009                        {
5010                            return false;
5011                        }
5012                        let suffix = bin.compress_key(key);
5013                        match bin.key_binary_search(suffix.as_slice()) {
5014                            Ok(idx) => {
5015                                bin.entries.remove(idx);
5016                                bin.keys.remove(idx); // T-2
5017                                bin.lsn_rep.remove_shift(idx); // T-3
5018                                // Mark dirty after any modification.
5019                                bin.dirty = true;
5020                                true
5021                            }
5022                            Err(_) => false,
5023                        }
5024                    }
5025                }
5026                _ => false,
5027            }
5028        } else {
5029            // Descend with parent_guard still held; the recursion will
5030            // hold its own read lock and drop ours after it returns.
5031            let r = match child_arc {
5032                Some(child) => {
5033                    Self::delete_recursive(&child, key, key_comparator)
5034                }
5035                None => false,
5036            };
5037            drop(parent_guard);
5038            r
5039        }
5040    }
5041
5042    // ========================================================================
5043    // B-tree Merge / Compress
5044    // ========================================================================
5045
5046    /// Merge under-full sibling BIN pairs and remove empty subtrees.
5047    ///
5048    /// `INCompressor` / `Tree.compressInternal()` logic.
5049    ///
5050    /// merges two adjacent siblings when their combined entry count is
5051    /// ≤ `max_entries_per_node` (the merge threshold equal to the node
5052    /// capacity).  The left sibling's entries are prepended into the right
5053    /// sibling; the parent key slot pointing at the left sibling is then
5054    /// removed from the parent IN with `deleteEntry`.  If the parent IN
5055    /// becomes empty after the removal the process repeats recursively up
5056    /// the tree.
5057    ///
5058    /// This implementation performs a single post-order walk so that each
5059    /// level is compressed after all its children have been compressed.
5060    pub fn compress(&self) {
5061        let root = match self.get_root() {
5062            Some(r) => r,
5063            None => return,
5064        };
5065        Self::compress_node(&root, self.max_entries_per_node);
5066    }
5067
    /// Recursive post-order helper for `compress`.
    ///
    /// Visits the resident children first (post-order), then scans adjacent
    /// child pairs of this IN and merges a pair when
    /// `left.n_entries + right.n_entries <= max_entries`.
    ///
    /// A merge moves the left sibling's slots in front of the right
    /// sibling's slots inside the RIGHT node, empties the left node,
    /// re-points the parent's left slot (`i`) at the merged node and removes
    /// the parent's right slot (`i + 1`).  The parent therefore keeps the
    /// left slot's key.
    ///
    /// The scan is repeated until a complete pass performs no merge, so
    /// pairs that only become mergeable after an earlier merge are picked up
    /// on a later pass.
    ///
    /// * `node_arc` — node whose subtree is compressed; a BIN returns
    ///   immediately.
    /// * `max_entries` — merge threshold (node capacity).
    fn compress_node(node_arc: &Arc<RwLock<TreeNode>>, max_entries: usize) {
        // Collect child arcs to recurse without holding the node lock.
        let children: Vec<Arc<RwLock<TreeNode>>> = {
            let g = node_arc.read();
            match &*g {
                TreeNode::Internal(n) => n.resident_children(),
                // BINs are leaves; nothing to compress at this level.
                TreeNode::Bottom(_) => return,
            }
        };

        // Post-order: recurse into every child before working on this level.
        for child in &children {
            Self::compress_node(child, max_entries);
        }

        // Compress the current IN level: merge adjacent under-full children.
        // Repeat until a full pass produces no merges.
        loop {
            // Sampled once per pass.  After a merge the parent is one slot
            // shorter than this value; the `updated_n` check at the bottom of
            // the inner loop compensates for that.
            let n_entries = {
                let g = node_arc.read();
                g.get_n_entries()
            };

            let mut merged_any = false;

            // `i` is the index of the *left* candidate; right is at `i+1`.
            let mut i = 0usize;
            while i + 1 < n_entries {
                // Fetch left and right child arcs.
                let (left_arc, right_arc) = {
                    let g = node_arc.read();
                    match &*g {
                        TreeNode::Internal(p) => {
                            let l = p.get_child(i);
                            let r = p.get_child(i + 1);
                            match (l, r) {
                                (Some(l), Some(r)) => (l, r),
                                // No child arc available for one of the two
                                // slots: this pair cannot be merged.
                                _ => {
                                    i += 1;
                                    continue;
                                }
                            }
                        }
                        TreeNode::Bottom(_) => return,
                    }
                };

                // NOTE(review): the counts, the node kind and the slot
                // snapshots below are each read under their own short latch;
                // nothing in this function excludes a writer in between.
                // Presumably the caller guarantees exclusivity — confirm.
                let left_n = { left_arc.read().get_n_entries() };
                let right_n = { right_arc.read().get_n_entries() };

                // merge condition: combined count fits within one node.
                if left_n + right_n > max_entries {
                    i += 1;
                    continue;
                }

                // Determine node kind from left child.
                let left_is_bin = { left_arc.read().is_bin() };

                if left_is_bin {
                    // BIN merge: snapshot the left BIN's slots together with
                    // their FULL (prefix-expanded) keys, put them in front of
                    // the right BIN's slots (also expanded to full keys), and
                    // finally recompute the merged BIN's key prefix
                    // (recalcKeyPrefix on the merged node).
                    let left_full_entries: Vec<BinEntry> = {
                        {
                            let g = left_arc.read();
                            match &*g {
                                TreeNode::Bottom(b) => (0..b.entries.len())
                                    .map(|j| BinEntry {
                                        data: b.entries[j].data.clone(),
                                        known_deleted: b.entries[j]
                                            .known_deleted,
                                        dirty: b.entries[j].dirty,
                                        expiration_time: b.entries[j]
                                            .expiration_time,
                                    })
                                    .collect(),
                                // Left is not a BIN after all: skip the pair.
                                _ => {
                                    i += 1;
                                    continue;
                                }
                            }
                        }
                    };
                    // T-3 / T-2: capture left's per-slot LSNs and FULL keys.
                    let (left_full_lsns, left_full_keys): (
                        Vec<Lsn>,
                        Vec<Vec<u8>>,
                    ) = {
                        let g = left_arc.read();
                        match &*g {
                            TreeNode::Bottom(b) => (
                                (0..b.entries.len())
                                    .map(|j| b.get_lsn(j))
                                    .collect(),
                                (0..b.entries.len())
                                    .map(|j| {
                                        b.get_full_key(j).unwrap_or_default()
                                    })
                                    .collect(),
                            ),
                            _ => (Vec::new(), Vec::new()),
                        }
                    };
                    {
                        {
                            let mut g = right_arc.write();
                            match &mut *g {
                                TreeNode::Bottom(rb) => {
                                    // Copy right's slots; their keys are
                                    // captured as full keys just below.
                                    let right_full: Vec<BinEntry> = (0..rb
                                        .entries
                                        .len())
                                        .map(|j| BinEntry {
                                            data: rb.entries[j].data.clone(),
                                            known_deleted: rb.entries[j]
                                                .known_deleted,
                                            dirty: rb.entries[j].dirty,
                                            expiration_time: rb.entries[j]
                                                .expiration_time,
                                        })
                                        .collect();
                                    // T-3 / T-2: right's per-slot LSNs + keys.
                                    let right_full_lsns: Vec<Lsn> =
                                        (0..rb.entries.len())
                                            .map(|j| rb.get_lsn(j))
                                            .collect();
                                    let right_full_keys: Vec<Vec<u8>> =
                                        (0..rb.entries.len())
                                            .map(|j| {
                                                rb.get_full_key(j)
                                                    .unwrap_or_default()
                                            })
                                            .collect();
                                    // Left entries are all smaller; prepend.
                                    let mut combined = left_full_entries;
                                    combined.extend(right_full);
                                    let mut combined_lsns = left_full_lsns;
                                    combined_lsns.extend(right_full_lsns);
                                    let mut combined_keys = left_full_keys;
                                    combined_keys.extend(right_full_keys);
                                    // Reset prefix and assign full keys.
                                    rb.key_prefix = Vec::new();
                                    rb.entries = combined;
                                    // T-3: rebuild the merged LSN array.
                                    rb.lsn_rep =
                                        LsnRep::from_lsns(&combined_lsns);
                                    // T-2: rebuild the merged key rep (Default;
                                    // recompute below compresses + compacts).
                                    rb.keys = KeyRep::from_keys(combined_keys);
                                    // Recompute prefix on merged BIN.  With
                                    // fewer than two slots the prefix stays
                                    // empty and the key rep is only compacted.
                                    if rb.entries.len() >= 2 {
                                        rb.recompute_key_prefix();
                                    } else {
                                        rb.keys
                                            .compact(rb.compact_max_key_length);
                                    }
                                    rb.dirty = true;
                                }
                                // Right is not a BIN: skip the pair.  Nothing
                                // has been modified at this point.
                                _ => {
                                    i += 1;
                                    continue;
                                }
                            }
                        }
                    }
                    // Clear the now-merged left BIN.
                    {
                        let mut g = left_arc.write();
                        if let TreeNode::Bottom(lb) = &mut *g {
                            lb.entries.clear();
                            lb.lsn_rep = LsnRep::Empty; // T-3
                            lb.keys = KeyRep::new(); // T-2
                            lb.key_prefix = Vec::new();
                            lb.dirty = true;
                        }
                    }
                } else {
                    // Upper-IN merge: prepend left's InEntries into right.
                    // T-4: capture left's resident children alongside its
                    // entries so they travel into the merged right IN.
                    let (left_in_entries, left_children): (
                        Vec<InEntry>,
                        Vec<Option<ChildArc>>,
                    ) = {
                        let g = left_arc.read();
                        match &*g {
                            TreeNode::Internal(n) => {
                                let children = (0..n.entries.len())
                                    .map(|j| n.get_child(j))
                                    .collect();
                                (n.entries.clone(), children)
                            }
                            // Left is not an upper IN: skip the pair.
                            _ => {
                                i += 1;
                                continue;
                            }
                        }
                    };
                    // T-3: capture left's per-slot LSNs.
                    let left_in_lsns: Vec<Lsn> = {
                        let g = left_arc.read();
                        match &*g {
                            TreeNode::Internal(n) => (0..n.entries.len())
                                .map(|j| n.get_lsn(j))
                                .collect(),
                            _ => Vec::new(),
                        }
                    };
                    // Offset at which right's own children land in the
                    // merged node.
                    let n_left = left_in_entries.len();
                    {
                        {
                            let mut g = right_arc.write();
                            match &mut *g {
                                TreeNode::Internal(rn) => {
                                    // Snapshot right's existing children, then
                                    // rebuild the merged entry + target arrays
                                    // (left half first, then right half).
                                    let right_children: Vec<Option<ChildArc>> =
                                        (0..rn.entries.len())
                                            .map(|j| rn.get_child(j))
                                            .collect();
                                    // T-3: snapshot right's LSNs too.
                                    let right_in_lsns: Vec<Lsn> =
                                        (0..rn.entries.len())
                                            .map(|j| rn.get_lsn(j))
                                            .collect();
                                    let mut combined = left_in_entries.clone();
                                    combined.append(&mut rn.entries);
                                    rn.entries = combined;
                                    // T-3: rebuild the merged LSN array.
                                    let mut combined_lsns =
                                        left_in_lsns.clone();
                                    combined_lsns.extend(right_in_lsns);
                                    rn.lsn_rep =
                                        LsnRep::from_lsns(&combined_lsns);
                                    // Start from an empty target array and
                                    // re-attach every child that was present:
                                    // left's at 0.., right's at n_left.. .
                                    rn.targets = TargetRep::None;
                                    for (j, c) in
                                        left_children.iter().enumerate()
                                    {
                                        if let Some(child) = c {
                                            rn.set_child(
                                                j,
                                                Some(child.clone()),
                                            );
                                        }
                                    }
                                    for (j, c) in
                                        right_children.into_iter().enumerate()
                                    {
                                        if c.is_some() {
                                            rn.set_child(n_left + j, c);
                                        }
                                    }
                                    rn.dirty = true;
                                }
                                // Right is not an upper IN: skip the pair.
                                // Nothing has been modified at this point.
                                _ => {
                                    i += 1;
                                    continue;
                                }
                            }
                        }
                    }
                    // Update parent pointers for moved children.
                    for child in left_children.into_iter().flatten() {
                        let mut cg = child.write();
                        cg.set_parent(Some(Arc::downgrade(&right_arc)));
                    }
                    // Clear the now-merged left IN.
                    {
                        let mut g = left_arc.write();
                        if let TreeNode::Internal(ln) = &mut *g {
                            ln.entries.clear();
                            ln.lsn_rep = LsnRep::Empty; // T-3
                            ln.targets = TargetRep::None;
                            ln.dirty = true;
                        }
                    }
                }

                // Remove the right sibling's parent slot and update
                // the left slot to point at the merged right child.
                //
                // We keep the LEFT slot's key (which is the correct minimum for
                // the merged node's range) and remove the RIGHT slot (i+1).
                // This avoids having to update the parent key when i == 0.
                {
                    {
                        let mut g = node_arc.write();
                        match &mut *g {
                            TreeNode::Internal(p) => {
                                // Update left slot (i) to point at right_arc
                                // (which now contains the merged entries).
                                if i < p.entries.len() {
                                    p.set_child(i, Some(right_arc.clone()));
                                }
                                // Remove right slot (i+1) — it is now redundant.
                                // T-4: remove_entry shifts the child array too.
                                if i + 1 < p.entries.len() {
                                    p.remove_entry(i + 1);
                                }
                                p.dirty = true;
                            }
                            TreeNode::Bottom(_) => return,
                        }
                    }
                }

                merged_any = true;
                // Step past the merged node (it now sits at the old `i`; the
                // old slot i+2 has shifted down to i+1).  The merged node is
                // NOT compared with its new right neighbour in this pass;
                // that pair is examined on the next pass of the outer loop,
                // which runs again because `merged_any` is set.
                i += 1;
                // The parent shrank, so `n_entries` is stale: stop this pass
                // as soon as no pair is left at the new length.
                let updated_n = { node_arc.read().get_n_entries() };
                if i + 1 >= updated_n {
                    break;
                }
            }

            if !merged_any {
                break;
            }
        }
    }
5403
5404    // ========================================================================
5405    // BIN slot compression
5406    // ========================================================================
5407
5408    /// Compress deleted slots from a BIN node, then prune it from its parent
5409    /// IN when it becomes empty.
5410    ///
5411    /// (the in-place slot-removal
5412    /// path, NOT the sibling-merge path handled by `compress()`).
5413    ///
5414    /// # Algorithm
5415    ///
5416    /// 1. If the BIN is a delta, skip — deltas cannot be compressed.
5417    /// 2. Remove all slots where `entry.known_deleted` is true.  This mirrors
5418    ///    `bin.compress(!bin.shouldLogDelta(), localTracker)`.
5419    /// 3. If the BIN is now empty, remove it from its parent IN.  This mirrors
5420    ///    `pruneBIN(db, binRef, idKey)` → `tree.delete(idKey)`.
5421    ///
5422    /// # Arguments
5423    ///
5424    /// * `bin_arc` — the BIN to compress (must be a `TreeNode::Bottom`).
5425    ///
5426    /// # Returns
5427    ///
5428    /// `true` if compression made progress (slots were removed or the BIN was
5429    /// pruned), `false` if the BIN was skipped (delta, no cursors issue, etc.).
5430    pub fn compress_bin(&self, bin_arc: &Arc<RwLock<TreeNode>>) -> bool {
5431        // ---- Step 1: collect metadata without holding the write lock ----
5432        let (is_delta, n_entries, id_key) = {
5433            {
5434                let g = bin_arc.read();
5435                match &*g {
5436                    TreeNode::Bottom(b) => {
5437                        // Identifier key = first full key in the BIN
5438                        // (the: bin.getIdentifierKey()).
5439                        let id_key = b.get_full_key(0);
5440                        (b.is_delta, b.entries.len(), id_key)
5441                    }
5442                    _ => return false, // not a BIN
5443                }
5444            }
5445        };
5446
5447        // If (bin.isBINDelta()) return; — deltas cannot be compressed.
5448        if is_delta {
5449            return false;
5450        }
5451
5452        // ---- Step 2: remove known-deleted slots) ----
5453        // We compress dirty slots too (compress_dirty_slots = true) because
5454        // we are not writing a BIN-delta here.
5455        let removed_any = {
5456            {
5457                let mut g = bin_arc.write();
5458                match &mut *g {
5459                    TreeNode::Bottom(b) => {
5460                        let before = b.entries.len();
5461                        // BIN.compress(): walk backwards to remove
5462                        // deleted slots without index confusion.
5463                        //
5464                        // ponytail: IC-3 — we remove `known_deleted` slots
5465                        // without consulting the lock manager's per-record
5466                        // write-lock state (JE BIN.compress inspects the
5467                        // cursor/lock state).  The lock manager lives in a
5468                        // DIFFERENT crate (noxu-txn); the tree layer has no
5469                        // access to it, so a cross-crate write-lock check is
5470                        // out of scope here.  This is SAFE in the current
5471                        // design because the only slots that reach here with
5472                        // `known_deleted == true` are committed deletes:
5473                        //   * the dbi write path (cursor_impl.rs delete())
5474                        //     PHYSICALLY removes the slot via tree.delete()
5475                        //     while holding the txn write lock — it never
5476                        //     leaves a write-locked `known_deleted` tombstone
5477                        //     in a BinStub; and
5478                        //   * the only writer of BinStub.known_deleted == true
5479                        //     is BIN-delta / recovery replay, which only
5480                        //     replays already-committed deletes.
5481                        // The compressor daemon
5482                        // (environment_impl.rs: collect_bins_with_known_deleted
5483                        // → compress_bin) therefore only ever sees committed
5484                        // (unlocked) defunct slots.  See
5485                        // docs/src/operations/known-limitations.md (IC-3) for
5486                        // the upgrade path if a future write path ever leaves
5487                        // an uncommitted write-locked tombstone in a BinStub.
5488                        let mut j = b.entries.len();
5489                        while j > 0 {
5490                            j -= 1;
5491                            if b.entries[j].known_deleted {
5492                                // JE `IN.deleteEntry` (IN.java:3466): removing a
5493                                // DIRTY slot must prohibit the next delta — a
5494                                // delta only carries dirty slots, so the removal
5495                                // would otherwise be silently lost.  Force a
5496                                // full BIN on the next log.
5497                                if b.entries[j].dirty {
5498                                    b.prohibit_next_delta = true;
5499                                }
5500                                b.entries.remove(j);
5501                                b.keys.remove(j); // T-2
5502                                b.lsn_rep.remove_shift(j); // T-3
5503                                b.dirty = true;
5504                            }
5505                        }
5506                        // Recompute prefix after slot removal, since the
5507                        // remaining keys may share a longer common prefix.
5508                        // After compress(), call recalcKeyPrefix().
5509                        if b.entries.len() >= 2 {
5510                            b.recompute_key_prefix();
5511                        } else if b.entries.len() < 2 {
5512                            b.key_prefix = Vec::new();
5513                        }
5514                        b.entries.len() < before
5515                    }
5516                    _ => false,
5517                }
5518            }
5519        };
5520
5521        // ---- Step 3: prune empty BIN from parent ----
5522        // If (empty) pruneBIN(db, binRef, idKey)  → tree.delete(idKey).
5523        // We only prune when the BIN is actually empty after compression.
5524        let now_empty = { bin_arc.read().get_n_entries() == 0 };
5525
5526        if now_empty {
5527            // pruneBIN re-descends to the SPECIFIC empty BIN and removes its
5528            // parent-IN slot ONLY IF the BIN is still empty (and has no
5529            // cursors and is not a delta) UNDER THE PARENT LATCH.
5530            //
5531            // We must NOT use `self.delete(&id_key)` here (IC-1): that
5532            // re-descends by key and removes whatever live entry now matches
5533            // `id_key`.  Between reading `now_empty` (a fresh read lock taken
5534            // after the compression write lock was dropped) and acting on it,
5535            // a concurrent insert can repopulate this BIN; `self.delete` would
5536            // then drop a LIVE entry — tree corruption / lost write.
5537            //
5538            // JE `INCompressor.pruneBIN` (INCompressor.java ~line 502-510)
5539            // calls `tree.delete(idKey)`, and JE `Tree.delete` /
5540            // `searchDeletableSubTree` (Tree.java ~line 755-800) re-validates
5541            // `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort) and
5542            // `bin.nCursors() > 0` → CURSORS_EXIST (abort) while holding the
5543            // parent (branch) latch.  `prune_empty_bin` reproduces exactly
5544            // that re-validation.  See `prune_empty_bin` below.
5545            //
5546            // Note: we only attempt the prune if n_entries was > 0 before
5547            // compression (an already-empty BIN we never populated is left
5548            // alone, matching the pre-existing guard).
5549            if let Some(key) = id_key
5550                && n_entries > 0
5551            {
5552                self.prune_empty_bin(&key);
5553            }
5554            return true;
5555        }
5556
5557        removed_any
5558    }
5559
5560    /// Re-descend to the leaf BIN that should contain `id_key` and remove its
5561    /// parent-IN child slot ONLY IF the BIN is still safe to prune.
5562    ///
5563    /// This is the faithful port of JE `Tree.delete(idKey)` /
5564    /// `Tree.searchDeletableSubTree` (Tree.java ~line 755-800) as invoked by
5565    /// `INCompressor.pruneBIN` (INCompressor.java ~line 502-510).  JE takes the
5566    /// branch-parent latch, re-descends to the specific empty BIN, and aborts
5567    /// the prune (removing NOTHING) if any of the following changed since the
5568    /// compressor observed the BIN as empty:
5569    ///
5570    /// * `bin.getNEntries() != 0`  → `NodeNotEmptyException` (a concurrent
5571    ///   insert repopulated the BIN — IC-1: we must NOT delete a live entry).
5572    /// * `bin.isBINDelta()`        → `unexpectedState` (deltas are never empty).
5573    /// * `bin.nCursors() > 0`      → `CursorsExistException` (a cursor is parked
5574    ///   on the empty BIN; requeue rather than orphan the cursor).
5575    ///
5576    /// The re-check and the slot removal both happen while holding the
5577    /// **parent IN write latch**.  Holding the parent write latch blocks every
5578    /// descender (insert / delete take `parent.read()` hand-over-hand), so a
5579    /// concurrent insert cannot reach the BIN between our re-check and the
5580    /// slot removal — the TOCTOU window IC-1 describes is closed.
5581    ///
5582    /// Returns `true` iff a parent-IN slot was removed, `false` otherwise
5583    /// (BIN repopulated, has a cursor, is a delta, vanished, or is the root —
5584    /// in every `false` case NOTHING is removed).
5585    pub fn prune_empty_bin(&self, id_key: &[u8]) -> bool {
5586        let root = match self.get_root() {
5587            Some(r) => r,
5588            None => return false,
5589        };
5590
5591        // If the root itself is the BIN (single-BIN tree) there is no parent
5592        // IN to remove a slot from.  JE's searchDeletableSubTree returns null
5593        // ("the entire tree is empty") and keeps the root BIN; we do the same.
5594        if root.read().is_bin() {
5595            return false;
5596        }
5597
5598        // Descend by id_key tracking the IN that is the *parent of the leaf
5599        // BIN* and the child index within it.  Hand-over-hand read coupling
5600        // keeps the descent consistent with concurrent splits, exactly like
5601        // `get_parent_bin_for_child_ln`.
5602        let (parent_arc, child_index) = {
5603            let mut parent_arc: Arc<RwLock<TreeNode>> = root.clone();
5604            let mut guard: parking_lot::ArcRwLockReadGuard<
5605                parking_lot::RawRwLock,
5606                TreeNode,
5607            > = root.read_arc();
5608            loop {
5609                let (next_arc, idx) = match &*guard {
5610                    TreeNode::Internal(n) => {
5611                        if n.entries.is_empty() {
5612                            return false;
5613                        }
5614                        let idx = self.upper_in_floor_index(&n.entries, id_key);
5615                        match n.get_child(idx) {
5616                            Some(c) => (c, idx),
5617                            None => return false,
5618                        }
5619                    }
5620                    TreeNode::Bottom(_) => {
5621                        unreachable!("is_bin checked before / below")
5622                    }
5623                };
5624                // Is the next node the leaf BIN?  If so, `guard`'s node is the
5625                // parent IN we want and `idx` is the child slot.
5626                if next_arc.read().is_bin() {
5627                    drop(guard);
5628                    break (parent_arc, idx);
5629                }
5630                let next_guard = next_arc.read_arc();
5631                drop(guard);
5632                parent_arc = next_arc;
5633                guard = next_guard;
5634            }
5635        };
5636
5637        // ---- Re-validate and remove the slot UNDER THE PARENT WRITE LATCH ----
5638        // Holding parent.write() excludes all descenders (they need
5639        // parent.read()), so the BIN cannot be repopulated between the
5640        // re-check and the slot removal.
5641        let mut parent_guard = parent_arc.write();
5642        let pruned_bin_id;
5643        let removed_key_len = match &mut *parent_guard {
5644            TreeNode::Internal(p) => {
5645                let child = match p.get_child(child_index) {
5646                    Some(c) => c,
5647                    None => return false, // slot already vacated / invalid
5648                };
5649                // Re-validate the child BIN under the parent latch.
5650                {
5651                    let cg = child.read();
5652                    match &*cg {
5653                        TreeNode::Bottom(b) => {
5654                            // JE: bin.getNEntries() != 0 → NODE_NOT_EMPTY (abort).
5655                            if !b.entries.is_empty() {
5656                                return false;
5657                            }
5658                            // JE: bin.isBINDelta() → unexpectedState (abort).
5659                            if b.is_delta {
5660                                return false;
5661                            }
5662                            // JE: bin.nCursors() > 0 → CURSORS_EXIST (abort).
5663                            if b.cursor_count > 0 {
5664                                return false;
5665                            }
5666                            pruned_bin_id = b.node_id;
5667                        }
5668                        // A concurrent split could in principle have replaced
5669                        // the child with an IN; never prune in that case.
5670                        TreeNode::Internal(_) => return false,
5671                    }
5672                }
5673                // Safe to prune: remove the BIN's slot from the parent IN.
5674                // Mirrors the parent-slot removal `Tree.delete` performs for
5675                // an empty BIN (Tree.java deleteEntry under the branch latch).
5676                // T-4: remove_entry shifts the node-level child array too.
5677                let removed = p.remove_entry(child_index);
5678                p.dirty = true;
5679                removed.key.len()
5680            }
5681            TreeNode::Bottom(_) => return false,
5682        };
5683        drop(parent_guard);
5684
5685        // JE: removing the BIN slot detaches the BIN from the tree; the
5686        // evictor must drop it from its LRU lists (Evictor.remove).
5687        self.note_removed(pruned_bin_id);
5688
5689        // Preserve the memory-counter bookkeeping that `self.delete` performed
5690        // (IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage).
5691        // The pruned slot's key plus the fixed per-entry overhead matches the
5692        // `delete` accounting (key.len() + BIN_ENTRY_OVERHEAD).
5693        if let Some(counter) = &self.memory_counter {
5694            let delta = (removed_key_len + BIN_ENTRY_OVERHEAD) as i64;
5695            counter.fetch_sub(delta, Ordering::Relaxed);
5696        }
5697
5698        true
5699    }
5700
5701    /// Detach the resident child node `node_id` from its parent IN, dropping
5702    /// the strong `Arc` so the node is actually freed from memory, and return
5703    /// the heap bytes reclaimed (0 if not found / not detachable).
5704    ///
5705    /// This is the faithful port of JE `IN.detachNode(idx, updateLsn, newLsn)`
5706    /// (IN.java ~4019) as called from `Evictor.evict` (Evictor.java ~3035):
5707    /// `evict` measures `target.getBudgetedMemorySize()` and then
5708    /// `parent.detachNode(index, ...)` does `setTarget(idx, null)` to drop the
5709    /// child reference and `getInMemoryINs().remove(child)` to drop it from
5710    /// the INList.
5711    ///
5712    /// EV-13: before this method existed, the evictor credited
5713    /// `node_size_fn(node_id)` bytes back to the budget and removed the node
5714    /// from the LRU lists, but the parent's `InEntry.child` still held a
5715    /// strong `Arc` — so the node was never dropped from the heap.  The budget
5716    /// over-credited (claimed bytes freed that were not), `cache_usage`
5717    /// drifted below reality, and the evictor under-fired.  Detaching here
5718    /// drops the `Arc` for real and credits exactly the measured size.
5719    ///
5720    /// The detach happens **under the parent IN write latch** (JE detaches
5721    /// under the parent's latch), so no concurrent descender can re-cache the
5722    /// child between measurement and detach.  The slot (key + LSN) is kept —
5723    /// only the in-memory `child` target is cleared — matching JE's
5724    /// `setTarget(idx, null)` which leaves the `ChildReference` LSN intact so
5725    /// the node can be re-fetched from the log later.
5726    ///
5727    /// Returns `0` if the node is not a resident child of any IN (e.g. it is
5728    /// the root, already detached, or was pinned and could not be latched).
5729    pub fn detach_node_by_id(&self, node_id: u64) -> u64 {
5730        let root = match self.get_root() {
5731            Some(r) => r,
5732            None => return 0,
5733        };
5734
5735        // The root has no parent IN to detach from (JE evicts the root via a
5736        // separate evictRoot path; we keep the root resident here).
5737        let root_id = {
5738            let g = root.read();
5739            match &*g {
5740                TreeNode::Internal(n) => n.node_id,
5741                TreeNode::Bottom(b) => b.node_id,
5742            }
5743        };
5744        if root_id == node_id {
5745            return 0;
5746        }
5747
5748        // Locate the parent IN and the child slot index.
5749        let (parent_arc, child_index) =
5750            match Self::find_parent_of_node_id(&root, node_id) {
5751                Some(p) => p,
5752                None => return 0,
5753            };
5754
5755        // ---- Measure + detach UNDER THE PARENT WRITE LATCH ----
5756        // Holding parent.write() excludes all descenders (they take
5757        // parent.read() hand-over-hand), so the child cannot be re-cached or
5758        // re-pinned between the measurement and the detach.  Mirrors JE
5759        // detachNode running under the parent latch held by Evictor.evict.
5760        let mut parent_guard = parent_arc.write();
5761        let TreeNode::Internal(p) = &mut *parent_guard else {
5762            return 0; // parent is not an IN (concurrent restructure)
5763        };
5764        if child_index >= p.entries.len() {
5765            return 0;
5766        }
5767        // T-4: detach the cached child via the node-level INTargetRep, leaving
5768        // the slot's key/LSN intact for re-fetch (JE IN.setTarget(idx, null)).
5769        let child = match p.take_child(child_index) {
5770            Some(c) => c,     // child Arc removed from the slot
5771            None => return 0, // already detached
5772        };
5773
5774        // Measure the child's real heap footprint while we still hold it.
5775        // JE: long evictedBytes = target.getBudgetedMemorySize().
5776        let freed = child.read().budgeted_memory_size();
5777
5778        // EV-14 re-fetch correctness: the parent slot LSN must point at the
5779        // child's CURRENT on-disk version so `child_at_or_fetch` re-reads the
5780        // right bytes (JE `IN.updateEntry(idx, newLsn)` is called whenever a
5781        // child is logged; the parent slot LSN tracks the child's LSN).  The
5782        // evictor only fully evicts/detaches a CLEAN BIN (it logs+clears dirty
5783        // BINs via flush_dirty_node_to_log first, which sets `last_full_lsn`),
5784        // so the child's authoritative LSN is its `last_full_lsn`.  Stamp it
5785        // into the parent slot before dropping the child; if it is null (the
5786        // child was never logged) leave the existing slot LSN intact rather
5787        // than writing a null — a never-logged clean child cannot occur on
5788        // the evict path, but be conservative.
5789        let child_full_lsn = match &*child.read() {
5790            TreeNode::Bottom(b) => b.last_full_lsn,
5791            TreeNode::Internal(_) => NULL_LSN,
5792        };
5793        if child_full_lsn != NULL_LSN {
5794            p.set_lsn(child_index, child_full_lsn);
5795        }
5796
5797        // Mark the parent dirty: the slot's in-memory target changed (JE
5798        // detachNode sets dirty when updateLsn; we conservatively mark dirty
5799        // so the parent is re-logged with the now-non-resident slot).
5800        p.dirty = true;
5801
5802        // Drop the strong Arc explicitly so the node is freed now (the slot's
5803        // `child` is already None).  If any other resident path still held a
5804        // strong reference this would not free — but the tree is the sole
5805        // strong owner of a cached child, so this drops the last strong ref.
5806        drop(parent_guard);
5807        drop(child);
5808
5809        // JE: getInMemoryINs().remove(child) — drop it from the evictor LRU.
5810        self.note_removed(node_id);
5811
5812        // NOTE: the live tree-memory counter (`memory_counter`) is the SAME
5813        // `Arc<AtomicI64>` the evictor's Arbiter uses as `cache_usage`.  The
5814        // evictor decrements it once via `Arbiter::release_memory(bytes)` for
5815        // the full eviction batch, so detach must NOT decrement here too —
5816        // that would double-credit and drive `cache_usage` below reality
5817        // (the very drift EV-13 fixes, in the other direction).  We only
5818        // measure-and-free; the caller does the single counter update.
5819        freed
5820    }
5821
5822    /// Evict the root IN of this tree (EV-14).
5823    ///
5824    /// Faithful port of JE `Evictor.evictRoot` (Evictor.java:3050-3110) plus
5825    /// the `RootEvictor.doWork` + `Tree.withRootLatchedExclusive` framing
5826    /// (Evictor.java:2529-2576, Tree.java:508-517).  Unlike a normal IN, the
5827    /// root has no parent slot to detach from; instead the *tree's* root
5828    /// reference is the equivalent of the `RootChildReference`, so eviction:
5829    ///
5830    ///   1. Latches the root reference exclusively (`rootLatch.acquireExclusive`
5831    ///      via `withRootLatchedExclusive`).
5832    ///   2. Re-checks that the root is still resident and still evictable
5833    ///      (no resident children, no pinned BIN — JE `RootEvictor.doWork`
5834    ///      re-latches and re-checks `rootIN == target && rootIN.isRoot()`).
5835    ///   3. If the root is dirty, LOGS it first so the on-disk version is
5836    ///      current and updates `root_log_lsn` to the new LSN (JE
5837    ///      `evictRoot`: `long newLsn = target.log(...); rootRef.setLsn(newLsn)`).
5838    ///   4. Clears the in-memory root (`rootRef.clearTarget()` — JE leaves the
5839    ///      `ChildReference` LSN intact; here `root_log_lsn` is that LSN) and
5840    ///      `note_removed`s it from the evictor LRU (JE `inList.remove(target)`).
5841    ///
5842    /// On the next access `fetch_root_from_log` re-materializes the root from
5843    /// `root_log_lsn` (JE `Tree.getRootINRootAlreadyLatched` →
5844    /// `root.fetchTarget`).
5845    ///
5846    /// # Conditions (eviction is REFUSED, returning `None`, when)
5847    ///
5848    /// * there is no log manager wired (the root could never be re-fetched),
5849    /// * the tree has no resident root (already evicted),
5850    /// * the root has any resident child (JE only evicts a childless root —
5851    ///   the `hasCachedChildren` skip in `processTarget`; a root with cached
5852    ///   children would orphan them, the EV-6 invariant),
5853    /// * the root is a BIN pinned by a cursor (`cursor_count > 0`),
5854    /// * the root is dirty but we have no clean persisted version AND logging
5855    ///   it fails, or
5856    /// * the root is clean but `root_log_lsn` is null (never logged — cannot
5857    ///   be re-fetched; happens only for a brand-new unlogged tree).
5858    ///
5859    /// Returns `Some((freed_bytes, was_dirty))` on success, where `freed_bytes`
5860    /// is the root's measured heap footprint (JE
5861    /// `target.getBudgetedMemorySize()`) and `was_dirty` reports whether the
5862    /// root had to be logged (JE `rootEvictor.flushed`, which drives
5863    /// `nDirtyNodesEvicted` and `modifyDbRoot`).
5864    pub fn evict_root(&self, db_id: u64) -> Option<(u64, bool)> {
5865        // A root with no re-fetch path must never be made non-resident.
5866        self.log_manager.as_ref()?;
5867
5868        // JE `Tree.withRootLatchedExclusive(rootEvictor)`: hold the root latch
5869        // exclusively across the whole evict so no descender or splitter can
5870        // observe/install a half-evicted root.  Acquiring `self.root.write()`
5871        // is the Noxu equivalent (it is the lock guarding the root pointer).
5872        let mut root_slot = self.root.write();
5873        let root_arc = root_slot.as_ref()?.clone();
5874
5875        // JE `RootEvictor.doWork`: re-latch the target and re-check the
5876        // conditions.  We hold the node guard for the duration.
5877        let node_guard = root_arc.write();
5878
5879        // EV-6 / JE `processTarget` hasCachedChildren skip: a root with any
5880        // resident child must NOT be evicted (it would orphan the child).
5881        // EV-14 only evicts an *idle* root whose children are already
5882        // non-resident (or which is itself a leaf BIN).
5883        let (node_id, was_dirty, freed) = match &*node_guard {
5884            TreeNode::Internal(n) => {
5885                if !n.resident_children().is_empty() {
5886                    return None; // has cached children — keep resident
5887                }
5888                (n.node_id, n.dirty, node_guard.budgeted_memory_size())
5889            }
5890            TreeNode::Bottom(b) => {
5891                if b.cursor_count > 0 {
5892                    return None; // pinned by a cursor — keep resident
5893                }
5894                (
5895                    b.node_id,
5896                    b.dirty || b.dirty_count() > 0,
5897                    node_guard.budgeted_memory_size(),
5898                )
5899            }
5900        };
5901
5902        // If dirty, log the root first so the on-disk version is current,
5903        // then record the new LSN as the root's re-fetch point (JE
5904        // `evictRoot`: target.log(...) + rootRef.setLsn(newLsn)).
5905        if was_dirty {
5906            let lm = self.log_manager.as_ref()?; // checked above; re-borrow
5907            let node_bytes = node_guard.write_to_bytes();
5908            let is_bin = node_guard.is_bin();
5909            let entry = noxu_log::entry::in_log_entry::InLogEntry::new(
5910                db_id, NULL_LSN, // prev_full_lsn
5911                NULL_LSN, // prev_delta_lsn
5912                node_bytes,
5913            );
5914            let mut buf = bytes::BytesMut::with_capacity(entry.log_size());
5915            entry.write_to_log(&mut buf);
5916            let entry_type = if is_bin {
5917                noxu_log::LogEntryType::BIN
5918            } else {
5919                noxu_log::LogEntryType::IN
5920            };
5921            // flush_required = true so the root's bytes are durable before we
5922            // drop the in-memory copy (JE logs synchronously in evictRoot).
5923            let new_lsn = match lm.log(
5924                entry_type,
5925                &buf,
5926                noxu_log::Provisional::No,
5927                true,  // flush_required
5928                false, // fsync at next checkpoint
5929            ) {
5930                Ok(l) => l,
5931                Err(_) => return None, // could not log — keep the root resident
5932            };
5933            *self.root_log_lsn.write() = new_lsn;
5934        } else {
5935            // Clean root: it must already be re-fetchable.  If it was never
5936            // logged (root_log_lsn null) we cannot evict it safely.
5937            if *self.root_log_lsn.read() == NULL_LSN {
5938                return None;
5939            }
5940        }
5941
5942        // JE `rootRef.clearTarget()` + `inList.remove(target)`: drop the
5943        // in-memory root and remove it from the evictor LRU.  The root_log_lsn
5944        // is the surviving `ChildReference` LSN used to re-fetch it.
5945        drop(node_guard);
5946        *root_slot = None;
5947        drop(root_slot);
5948        self.note_removed(node_id);
5949
5950        Some((freed, was_dirty))
5951    }
5952
5953    /// Re-materialize an evicted root IN from its persisted `root_log_lsn`
5954    /// (EV-14, piece B).
5955    /// Faithful to JE `Tree.getRootINRootAlreadyLatched` (Tree.java:477-516)
5956    /// which calls `root.fetchTarget(database, null)` when the in-memory
5957    /// target is null.  Idempotent and cheap when the root is already
5958    /// resident: returns the resident root without touching the log.
5959    ///
5960    /// Returns `None` only when the tree is genuinely empty (no resident root
5961    /// AND `root_log_lsn` is null) or when the re-fetch fails (no log manager,
5962    /// log read error, deserialize failure) — callers then see an empty tree,
5963    /// never wrong data.
5964    pub fn fetch_root_from_log(&self) -> Option<Arc<RwLock<TreeNode>>> {
5965        // Fast path: root already resident.
5966        if let Some(r) = self.root.read().clone() {
5967            return Some(r);
5968        }
5969        // Take the write lock and re-check (another thread may have re-fetched
5970        // it while we waited — JE upgrades the root latch the same way).
5971        let mut root_slot = self.root.write();
5972        if let Some(r) = root_slot.as_ref() {
5973            return Some(r.clone());
5974        }
5975        let log_lsn = *self.root_log_lsn.read();
5976        let node = self.fetch_node_from_log(log_lsn)?;
5977        let node_id = node.node_id();
5978        let arc = Arc::new(RwLock::new(node));
5979        *root_slot = Some(arc.clone());
5980        drop(root_slot);
5981        // JE: a fetched IN is added back to the INList (Evictor LRU).
5982        self.note_added(node_id);
5983        Some(arc)
5984    }
5985
5986    /// Return the resident child Arc for slot `idx` of `parent_arc`, fetching
5987    /// it from its slot LSN and installing it if it is not resident (EV-14 /
5988    /// EV-13 re-fetch on descent).
5989    ///
5990    /// Faithful to JE `ChildReference.fetchTarget` (and `IN.fetchTarget`):
5991    /// when a slot's in-memory target is null but its LSN is valid, the node
5992    /// is read back from the log and cached in the slot.  Installing the
5993    /// fetched child requires the parent EX-latch, so this takes the parent
5994    /// write lock; the fast path (child already resident) takes only a read
5995    /// lock.
5996    ///
5997    /// Returns `None` only when the slot index is out of range, the slot has
5998    /// no valid LSN, or the log read/deserialize fails — callers then treat
5999    /// the descent as terminating in an empty subtree, never wrong data.
6000    fn child_at_or_fetch(
6001        &self,
6002        parent_arc: &Arc<RwLock<TreeNode>>,
6003        idx: usize,
6004    ) -> Option<ChildArc> {
6005        // Fast path: child already cached (read lock only).
6006        {
6007            let g = parent_arc.read();
6008            if let TreeNode::Internal(n) = &*g {
6009                if let Some(c) = n.get_child(idx) {
6010                    return Some(c);
6011                }
6012            } else {
6013                return None; // BINs have no IN children
6014            }
6015        }
6016        // Slow path: fetch the child from its slot LSN under the parent
6017        // EX-latch (JE installs the fetched target under the IN latch).
6018        let mut g = parent_arc.write();
6019        let TreeNode::Internal(n) = &mut *g else {
6020            return None;
6021        };
6022        // Re-check: another thread may have fetched it while we upgraded.
6023        if let Some(c) = n.get_child(idx) {
6024            return Some(c);
6025        }
6026        if idx >= n.entries.len() {
6027            return None;
6028        }
6029        let child_lsn = n.get_lsn(idx);
6030        let node = self.fetch_node_from_log(child_lsn)?;
6031        let node_id = node.node_id();
6032        let arc: ChildArc = Arc::new(RwLock::new(node));
6033        n.set_child(idx, Some(arc.clone()));
6034        drop(g);
6035        // JE: a fetched IN is added back to the INList (Evictor LRU).
6036        self.note_added(node_id);
6037        Some(arc)
6038    }
6039
6040    /// Check whether a BIN node is a candidate for slot compression and,
6041    /// if so, trigger `compress_bin`.
6042    ///
6043    /// from (the opportunistic / lazy compression path).
6044    ///
6045    /// # Algorithm
6046    ///
6047    /// 1. Skip the BIN if it is a delta or has no defunct (known-deleted) slots.
6048    /// 2. If compression succeeds and the BIN becomes empty, it is pruned.
6049    ///
6050    /// # Returns
6051    ///
6052    /// `true` if compression was triggered (regardless of whether any slots
6053    /// were actually removed), `false` if the BIN does not need compression.
6054    pub fn maybe_compress_bin_and_parent(
6055        &self,
6056        bin_arc: &Arc<RwLock<TreeNode>>,
6057    ) -> bool {
6058        // Check whether the BIN has any deleted slots worth compressing.
6059        // lazyCompress: skip deltas and BINs with no defunct slots.
6060        let should_compress = {
6061            {
6062                let g = bin_arc.read();
6063                match &*g {
6064                    TreeNode::Bottom(b) => {
6065                        // Skip deltas (the: !in.isBIN() || in.isBINDelta()).
6066                        if b.is_delta {
6067                            false
6068                        } else {
6069                            // Check for any known-deleted slot
6070                            // (the: for (int i=0; i < bin.getNEntries(); i++) {
6071                            //        if (bin.isDefunct(i)) { ... break; }
6072                            //      }).
6073                            b.entries.iter().any(|e| e.known_deleted)
6074                        }
6075                    }
6076                    _ => false,
6077                }
6078            }
6079        };
6080
6081        if !should_compress {
6082            return false;
6083        }
6084
6085        self.compress_bin(bin_arc)
6086    }
6087
6088    // ========================================================================
6089    // Latch-coupling validation
6090    // ========================================================================
6091
6092    /// Validate that `parent.entries[child_index].child` still points at
6093    /// `child_arc` after acquiring the child's latch.
6094    ///
6095    /// Re-latch validation step inside the
6096    /// `Tree.searchSplitsAllowed`: after a concurrent split the parent
6097    /// slot that previously held the child may have changed.  Callers that
6098    /// plan to mutate the child must verify the parent-child link is still
6099    /// intact before proceeding.
6100    ///
6101    /// Returns `true` if the parent-child link is intact.
6102    pub fn validate_parent_child(
6103        parent: &Arc<RwLock<TreeNode>>,
6104        child_index: usize,
6105        child_arc: &Arc<RwLock<TreeNode>>,
6106    ) -> bool {
6107        let g = parent.read();
6108        match &*g {
6109            TreeNode::Internal(p) => match p.child_ref(child_index) {
6110                Some(stored) => Arc::ptr_eq(stored, child_arc),
6111                None => false,
6112            },
6113            TreeNode::Bottom(_) => false,
6114        }
6115    }
6116
6117    /// Search for the BIN that should contain `key`, with latch-coupling
6118    /// validation at every level of descent.
6119    ///
6120    /// .
6121    ///
6122    /// The difference from `search()` is that after obtaining the child
6123    /// arc we call `validate_parent_child` to confirm the parent still
6124    /// holds the expected Arc.  If the link has been broken (e.g. by a
6125    /// concurrent split that relocated the child) the traversal restarts
6126    /// from the root.
6127    ///
6128    /// Returns a `SearchResult` if the key is (or should be) in the tree,
6129    /// `None` if the tree is empty.
6130    ///
6131    /// Same as [`Tree::search`] but exposes the hand-over-hand latch
6132    /// coupling explicitly. Kept as a public, equivalent API for
6133    /// callers (today only tests) that want to verify the
6134    /// latch-coupling behaviour against `search()` itself.
6135    ///
6136    /// Both `search()` and this method use the same `read_arc()`
6137    /// hand-over-hand: take the child read guard *before* dropping
6138    /// the parent guard, so a concurrent `split_child(parent, ..)`
6139    /// (which takes `parent.write()`) cannot run between when we
6140    /// captured the child Arc and when we entered the child. There
6141    /// is no validate-and-restart loop because the coupling makes
6142    /// the race unreachable.
6143    pub fn search_with_coupling(&self, key: &[u8]) -> Option<SearchResult> {
6144        let root = self.get_root()?;
6145        let mut guard: parking_lot::ArcRwLockReadGuard<
6146            parking_lot::RawRwLock,
6147            TreeNode,
6148        > = root.read_arc();
6149
6150        loop {
6151            if guard.is_bin() {
6152                let index = guard.find_entry(key, true, true);
6153                let found = index >= 0 && (index & EXACT_MATCH != 0);
6154                return Some(SearchResult::with_values(
6155                    found,
6156                    index & 0xFFFF,
6157                    false,
6158                ));
6159            }
6160
6161            let parent_arc =
6162                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
6163            let next_idx = match &*guard {
6164                TreeNode::Internal(n) => {
6165                    if n.entries.is_empty() {
6166                        return None;
6167                    }
6168                    let idx = self.upper_in_floor_index(&n.entries, key);
6169                    match n.get_child(idx) {
6170                        Some(c) => {
6171                            let next_guard = c.read_arc();
6172                            drop(guard);
6173                            guard = next_guard;
6174                            continue;
6175                        }
6176                        None => idx, // EV-14/EV-13: re-fetch below.
6177                    }
6178                }
6179                TreeNode::Bottom(_) => {
6180                    unreachable!("is_bin() returned false above")
6181                }
6182            };
6183            // Hand-over-hand: take the child read guard before
6184            // releasing the parent guard. Closes the
6185            // descender-vs-splitter window: a concurrent
6186            // split_child(parent, ..) takes parent.write(), which
6187            // blocks while we still hold parent.read().
6188            drop(guard);
6189            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
6190            guard = child.read_arc();
6191        }
6192    }
6193
6194    // ========================================================================
6195    // BIN-Delta reconstitution
6196    // ========================================================================
6197
6198    /// Increments the cursor-pin count on a BIN node.
6199    ///
6200    /// Called by `CursorImpl` when it positions on (or enters) a BIN.
6201    /// The evictor will not select a BIN with `cursor_count > 0` for eviction
6202    /// (`RealNodeInfo.pin_count`), matching `BIN.incrementCursorCount()`.
6203    pub fn pin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6204        let mut guard = bin_arc.write();
6205        if let TreeNode::Bottom(ref mut stub) = *guard {
6206            stub.cursor_count += 1;
6207        }
6208    }
6209
6210    /// Decrements the cursor-pin count on a BIN node.
6211    ///
6212    /// Called by `CursorImpl` when it moves away from or closes on a BIN.
6213    /// Uses `saturating_sub` to guard against an accidental double-unpin.
6214    /// Matching `BIN.decrementCursorCount()`.
6215    pub fn unpin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6216        let mut guard = bin_arc.write();
6217        if let TreeNode::Bottom(ref mut stub) = *guard {
6218            stub.cursor_count = stub.cursor_count.saturating_sub(1);
6219        }
6220    }
6221
6222    /// Returns `true` if the given `BinStub` is a BIN-delta (not a full BIN).
6223    ///
6224    /// `IN.isBINDelta()`.
6225    pub fn bin_is_delta(bin: &BinStub) -> bool {
6226        bin.is_delta
6227    }
6228
6229    /// Merge delta entries into a full BIN's entry list.
6230    ///
6231    /// - For each delta entry: if a matching key already exists in `bin`,
6232    ///   replace it (delta is authoritative).
6233    /// - Otherwise insert the delta entry in sorted position.
6234    ///
6235    /// Delta entries carry **full** keys (prefix already prepended by the
6236    /// caller).  After applying all delta entries the BIN's prefix is
6237    /// recomputed so the final state is consistent.
6238    ///
6239    /// All delta entries are considered to be the most-recently-dirtied
6240    /// state, exactly as in where delta slots supersede full-BIN slots.
6241    pub fn apply_delta_to_bin(
6242        bin: &mut BinStub,
6243        delta_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
6244    ) {
6245        for (full_key, lsn, data) in delta_entries {
6246            // `full_key` is a full (uncompressed) key here.
6247            bin.insert_with_prefix(full_key, lsn, data);
6248        }
6249        bin.dirty = true;
6250    }
6251
6252    /// Reconstitute a BIN-delta into a full BIN.
6253    ///
6254    /// from the:
6255    ///
6256    /// 1. Extract the delta entries from `self` (this BIN-delta), decompressing
6257    ///    them to full keys.
6258    /// 2. Apply them onto `base` (the previously logged full BIN) via
6259    ///    `apply_delta_to_bin`.
6260    /// 3. Copy `base`'s merged entries and prefix back into `self`.
6261    /// 4. Clear the `is_delta` flag so subsequent code treats `self` as
6262    ///    a full BIN.
6263    ///
6264    /// After this call `self` is a full BIN; `base` should be discarded.
6265    pub fn mutate_to_full_bin(delta: &mut BinStub, mut base: BinStub) {
6266        // Decompress delta entries to full keys before applying.
6267        let delta_full_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)> = (0
6268            ..delta.entries.len())
6269            .map(|i| {
6270                (
6271                    delta.get_full_key(i).unwrap_or_default(),
6272                    delta.get_lsn(i),
6273                    delta.entries[i].data.clone(),
6274                )
6275            })
6276            .collect();
6277        // reconstituteBIN + resetContent + setBINDelta(false).
6278        Self::apply_delta_to_bin(&mut base, delta_full_entries);
6279        delta.entries = base.entries;
6280        delta.lsn_rep = base.lsn_rep; // T-3
6281        delta.keys = base.keys; // T-2
6282        delta.key_prefix = base.key_prefix;
6283        delta.is_delta = false;
6284        delta.dirty = true;
6285    }
6286
6287    /// Read an IN/BIN log entry at `log_lsn` and deserialise it into a
6288    /// `TreeNode`, ready to be installed as a (re-fetched) resident node.
6289    ///
6290    /// JE `LogManager.getLogEntry(lsn)` + `IN.readFromLog` as used by
6291    /// `ChildReference.fetchTarget` (the path that re-materializes a
6292    /// non-resident node from its persisted LSN on descent) and by
6293    /// `Tree.getRootINRootAlreadyLatched` for the root.  The freshly-fetched
6294    /// node has no resident children (`TargetRep::None`); its own children, if
6295    /// any, are re-fetched on demand the same way when the descent reaches
6296    /// them.
6297    ///
6298    /// Returns `None` if the LSN is null, the log read fails, the entry is not
6299    /// an IN/BIN, or deserialisation fails (the caller treats this as "node
6300    /// unavailable" rather than panicking, matching the graceful-degradation
6301    /// policy of `mutate_to_full_bin_from_log`).
6302    fn fetch_node_from_log(&self, log_lsn: Lsn) -> Option<TreeNode> {
6303        if log_lsn == NULL_LSN {
6304            return None;
6305        }
6306        let lm = self.log_manager.as_ref()?;
6307        let (entry_type, payload) = lm.read_entry(log_lsn).ok()?;
6308        // The on-disk payload is an `InLogEntry` body (db_id | prev_full_lsn
6309        // | prev_delta_lsn | len | node_data).  The recovery scanner strips
6310        // this header before calling `recover_in_redo`; re-fetch must do the
6311        // same so `deserialize_*` sees the bare node bytes.  JE
6312        // `INLogEntry.readEntry` parses the same wrapper.
6313        let in_entry =
6314            noxu_log::entry::in_log_entry::InLogEntry::read_from_log(&payload)
6315                .ok()?;
6316        let node_data = &in_entry.node_data;
6317        use noxu_log::LogEntryType;
6318        match entry_type {
6319            LogEntryType::BIN => {
6320                Self::deserialize_bin(node_data).map(TreeNode::Bottom)
6321            }
6322            LogEntryType::IN => {
6323                Self::deserialize_upper_in(node_data).map(TreeNode::Internal)
6324            }
6325            // BIN-deltas are never logged as the *root* version and are
6326            // reconstituted by the BIN-delta path, not here.
6327            _ => {
6328                log::warn!(
6329                    "fetch_node_from_log: expected IN/BIN entry at LSN {:?}, \
6330                     got {:?}",
6331                    log_lsn,
6332                    entry_type
6333                );
6334                None
6335            }
6336        }
6337    }
6338
6339    /// Reconstitute a BIN-delta into a full BIN by reading the base from log.
6340    ///
6341    /// — the
6342    /// single-argument overload that calls `fetchFullBIN(databaseImpl)` to
6343    /// read the last full BIN from the log manager automatically.
6344    ///
6345    /// Algorithm:
6346    /// 1. If `delta.last_full_lsn == NULL_LSN`, the BIN was never written as a
6347    ///    full entry; there is no base to merge so the delta IS the full BIN.
6348    ///    Clear `is_delta` and return.
6349    /// 2. Read the full-BIN log entry at `delta.last_full_lsn` using
6350    ///    `log_manager.read_entry(lsn)`.
6351    /// 3. Deserialize the payload with `BinStub::deserialize_full()`.
6352    /// 4. Delegate to `Self::mutate_to_full_bin(delta, base)` to merge and
6353    ///    replace `delta`'s contents.
6354    ///
6355    /// On any read / parse failure the function falls back to clearing the
6356    /// `is_delta` flag without merging, so the caller always gets a non-delta
6357    /// BIN (possibly missing some old slots).  This mirrors the
6358    /// `EnvironmentFailureException` path but gracefully degrades instead of
6359    /// panicking.
6360    ///
6361    /// `BIN.fetchFullBIN(dbImpl)` + `BIN.mutateToFullBIN(boolean)`.
6362    pub fn mutate_to_full_bin_from_log(
6363        delta: &mut BinStub,
6364        log_manager: &noxu_log::LogManager,
6365    ) {
6366        if !delta.is_delta {
6367            // Already a full BIN; nothing to do.
6368            return;
6369        }
6370
6371        if delta.last_full_lsn == NULL_LSN {
6372            // BIN has never been logged as a full entry — the in-memory delta
6373            // is effectively the full state. During recovery this path is
6374            // harmless.
6375            delta.is_delta = false;
6376            return;
6377        }
6378
6379        // Read the full-BIN log entry at last_full_lsn.
6380        // `envImpl.getLogManager().getEntryHandleFileNotFound(lsn)`.
6381        match log_manager.read_entry(delta.last_full_lsn) {
6382            Ok((entry_type, payload)) => {
6383                use noxu_log::LogEntryType;
6384                if entry_type == LogEntryType::BIN {
6385                    if let Some(mut base) = BinStub::deserialize_full(&payload)
6386                    {
6387                        // Set the base's last_full_lsn so it is preserved
6388                        // into the merged result.
6389                        base.last_full_lsn = delta.last_full_lsn;
6390                        Self::mutate_to_full_bin(delta, base);
6391                        return;
6392                    }
6393                    // Deserialization failed — fall through to graceful degradation.
6394                    log::warn!(
6395                        "mutate_to_full_bin_from_log: failed to deserialize \
6396                         full BIN at LSN {:?}; keeping delta as-is",
6397                        delta.last_full_lsn
6398                    );
6399                } else {
6400                    log::warn!(
6401                        "mutate_to_full_bin_from_log: expected BIN entry at \
6402                         LSN {:?}, got {:?}",
6403                        delta.last_full_lsn,
6404                        entry_type
6405                    );
6406                }
6407            }
6408            Err(e) => {
6409                log::warn!(
6410                    "mutate_to_full_bin_from_log: failed to read log at \
6411                     LSN {:?}: {}",
6412                    delta.last_full_lsn,
6413                    e
6414                );
6415            }
6416        }
6417
6418        // Graceful degradation: promote the delta to a "full" BIN without
6419        // the base slots.  The BIN will be re-logged as a full BIN at the
6420        // next checkpoint.
6421        delta.is_delta = false;
6422        delta.dirty = true;
6423    }
6424
6425    // ========================================================================
6426    // getNextBin / getPrevBin
6427    // ========================================================================
6428
6429    /// Return the entries of the BIN immediately to the right of the BIN
6430    /// that contains (or would contain) `current_key`.
6431    ///
6432    /// → `Tree.getNextIN(forward=true)`.
6433    ///
6434    /// # Algorithm
6435    /// 1. Build a root-to-BIN path for `current_key`.
6436    /// 2. Walk the path back up looking for a parent that has a slot to the
6437    ///    right of the slot we descended through.
6438    /// 3. When found, descend to the leftmost BIN of that sibling subtree.
6439    /// 4. If no such parent exists, return `None` (no next BIN).
6440    pub fn get_next_bin(
6441        &self,
6442        current_key: &[u8],
6443    ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6444        let root = self.get_root()?;
6445        self.get_adjacent_bin(&root, current_key, true)
6446    }
6447
6448    /// Return the entries of the BIN immediately to the left of the BIN
6449    /// that contains (or would contain) `current_key`.
6450    ///
6451    /// → `Tree.getNextIN(forward=false)`.
6452    pub fn get_prev_bin(
6453        &self,
6454        current_key: &[u8],
6455    ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6456        let root = self.get_root()?;
6457        self.get_adjacent_bin(&root, current_key, false)
6458    }
6459
6460    /// Core implementation shared by `get_next_bin` and `get_prev_bin`.
6461    ///
6462    /// Builds the path from `root` down to the BIN for `current_key`
6463    /// (each element records the parent arc, the slot index taken,
6464    /// and the child Arc reached) using `read_arc()` hand-over-hand
6465    /// latch coupling.
6466    ///
6467    /// The ascent re-acquires the parent's read lock one level at a
6468    /// time. To handle a concurrent split that completes between
6469    /// path capture and ascent, we validate that the slot still
6470    /// holds the child Arc we descended through. If the slot
6471    /// mismatches we retry the whole operation from root with a
6472    /// short pause between attempts. The retry budget is generous
6473    /// (`MAX_ASCENT_ATTEMPTS`) so that the typical case of a few
6474    /// cascading splits between two BIN-level cursor steps is
6475    /// absorbed without surfacing as a false end-of-iteration.
6476    /// After exhausting the budget we conservatively return `None`,
6477    /// signalling "no adjacent BIN found"; the cursor will then
6478    /// either restart its scan or report end-of-iteration. The
6479    /// budget is finite so a pathological workload (a thread
6480    /// permanently splitting under us) cannot livelock the lookup.
6481    /// JE `Tree.getNextIN` / `Tree.getPrevIN`.
6482    ///
6483    /// R3 fix (2026-06-16): converted from `static fn` to `&self` so that the
6484    /// IN-level descent uses `self.upper_in_floor_index` (comparator-aware)
6485    /// instead of a raw byte `<=`. Without this, databases with a custom
6486    /// comparator (secondary indexes, sorted-dup) could descend to the wrong
6487    /// child → wrong adjacent BIN → incorrect cursor iteration across BIN
6488    /// boundaries. Mirrors `Tree.getNextIN`/`Tree.getPrevIN` using the
6489    /// comparator-aware `IN.findEntry`.
6490    fn get_adjacent_bin(
6491        &self,
6492        root: &Arc<RwLock<TreeNode>>,
6493        current_key: &[u8],
6494        forward: bool,
6495    ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6496        const MAX_ASCENT_ATTEMPTS: u32 = 8;
6497        for attempt in 0..MAX_ASCENT_ATTEMPTS {
6498            match self.get_adjacent_bin_attempt(root, current_key, forward) {
6499                AdjacentBinOutcome::Found(v) => return Some(v),
6500                AdjacentBinOutcome::NoAdjacent => return None,
6501                AdjacentBinOutcome::SplitRaceRetry => {
6502                    // Brief pause to let the splitter finish.
6503                    if attempt + 1 < MAX_ASCENT_ATTEMPTS {
6504                        std::thread::yield_now();
6505                    }
6506                }
6507            }
6508        }
6509        // Exhausted retry budget. Signal "no adjacent" so the
6510        // cursor can fall back to its end-of-iteration path.
6511        None
6512    }
6513
6514    /// One attempt at `get_adjacent_bin`. The tri-state return
6515    /// value distinguishes "no adjacent BIN exists" (which the
6516    /// caller should propagate as end-of-iteration) from "a
6517    /// concurrent split invalidated our path" (which the caller
6518    /// should retry from root).
6519    fn get_adjacent_bin_attempt(
6520        &self,
6521        root: &Arc<RwLock<TreeNode>>,
6522        current_key: &[u8],
6523        forward: bool,
6524    ) -> AdjacentBinOutcome {
6525        // Path entry: (parent_arc, slot_idx_taken, child_arc_reached).
6526        // The child Arc lets the ascent validate that the slot still
6527        // points to the same node we descended through.
6528        let mut path: Vec<(
6529            Arc<RwLock<TreeNode>>,
6530            usize,
6531            Arc<RwLock<TreeNode>>,
6532        )> = Vec::new();
6533
6534        let mut guard: parking_lot::ArcRwLockReadGuard<
6535            parking_lot::RawRwLock,
6536            TreeNode,
6537        > = root.read_arc();
6538        loop {
6539            if guard.is_bin() {
6540                break;
6541            }
6542
6543            let (next_arc, slot_idx) = match &*guard {
6544                TreeNode::Internal(n) => {
6545                    if n.entries.is_empty() {
6546                        return AdjacentBinOutcome::NoAdjacent;
6547                    }
6548                    // R3 fix: use comparator-aware upper_in_floor_index so
6549                    // that custom-comparator / sorted-dup databases descend
6550                    // to the correct child. Mirrors JE Tree.getNextIN which
6551                    // uses IN.findEntry (comparator-aware) not raw byte order.
6552                    let idx =
6553                        self.upper_in_floor_index(&n.entries, current_key);
6554                    let child = match n.get_child(idx) {
6555                        Some(c) => c,
6556                        None => return AdjacentBinOutcome::NoAdjacent,
6557                    };
6558                    (child, idx)
6559                }
6560                TreeNode::Bottom(_) => unreachable!(),
6561            };
6562
6563            // Record the parent and the child we are about to enter
6564            // — the child Arc lets the ascent validate the slot.
6565            let parent_arc =
6566                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
6567            path.push((parent_arc, slot_idx, Arc::clone(&next_arc)));
6568
6569            // Hand-over-hand: take child read lock BEFORE releasing parent.
6570            let next_guard = next_arc.read_arc();
6571            drop(guard);
6572            guard = next_guard;
6573        }
6574        drop(guard);
6575
6576        // Ascend the path. At each level, validate that
6577        // `parent.entries[taken_idx].child == descended_child` before
6578        // trusting `taken_idx` as a coordinate. If not, return
6579        // `SplitRaceRetry` so the caller restarts from root.
6580        while let Some((parent_arc, taken_idx, descended_child)) = path.pop() {
6581            let parent_guard = parent_arc.read();
6582            let (n_entries, slot_still_valid) = match &*parent_guard {
6583                TreeNode::Internal(p) => {
6584                    let n = p.entries.len();
6585                    let valid = p
6586                        .child_ref(taken_idx)
6587                        .is_some_and(|c| Arc::ptr_eq(c, &descended_child));
6588                    (n, valid)
6589                }
6590                _ => return AdjacentBinOutcome::NoAdjacent,
6591            };
6592            drop(parent_guard);
6593
6594            if !slot_still_valid {
6595                return AdjacentBinOutcome::SplitRaceRetry;
6596            }
6597
6598            let sibling_idx = if forward {
6599                taken_idx + 1
6600            } else if taken_idx == 0 {
6601                // No left sibling at this level — ascend further.
6602                continue;
6603            } else {
6604                taken_idx - 1
6605            };
6606
6607            if forward && sibling_idx >= n_entries {
6608                // No right sibling at this level — ascend further.
6609                continue;
6610            }
6611
6612            // Found a sibling slot — fetch the sibling child arc.
6613            let sibling_arc = {
6614                let g = parent_arc.read();
6615                match &*g {
6616                    TreeNode::Internal(p) => match p.get_child(sibling_idx) {
6617                        Some(c) => c,
6618                        None => return AdjacentBinOutcome::NoAdjacent,
6619                    },
6620                    _ => return AdjacentBinOutcome::NoAdjacent,
6621                }
6622            };
6623
6624            // Descend to the leftmost (forward) or rightmost (!forward) BIN.
6625            return match Self::descend_to_edge_bin(&sibling_arc, forward) {
6626                Some(v) => AdjacentBinOutcome::Found(v),
6627                None => AdjacentBinOutcome::NoAdjacent,
6628            };
6629        }
6630
6631        // Exhausted path without finding a sibling → no adjacent BIN.
6632        AdjacentBinOutcome::NoAdjacent
6633    }
6634
6635    /// Descend to the leftmost BIN (`forward = true`) or rightmost BIN
6636    /// (`forward = false`) in the sub-tree rooted at `node_arc`.
6637    ///
6638    /// `Tree.searchSubTree(SearchType.LEFT / RIGHT, targetLevel)`.
6639    fn descend_to_edge_bin(
6640        node_arc: &Arc<RwLock<TreeNode>>,
6641        forward: bool,
6642    ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6643        // Hand-over-hand latch coupling — see Tree::search.
6644        let mut guard: parking_lot::ArcRwLockReadGuard<
6645            parking_lot::RawRwLock,
6646            TreeNode,
6647        > = node_arc.read_arc();
6648
6649        loop {
6650            if guard.is_bin() {
6651                return match &*guard {
6652                    TreeNode::Bottom(b) => {
6653                        // Return entries with full (decompressed) keys so that
6654                        // callers always work with complete keys.
6655                        //
6656                        // TREE-F1: KD slots are NOT filtered here — the BIN's
6657                        // slot indices are returned verbatim so the cursor can
6658                        // skip KD slots itself (CursorImpl getNext loop;
6659                        // CursorImpl.java:2062-2064) and continue to the next
6660                        // BIN when an edge BIN is entirely KD during the
6661                        // BIN-delta reconstitution window.
6662                        let full_entries: Vec<(BinEntry, Lsn, Vec<u8>)> = (0
6663                            ..b.entries.len())
6664                            .map(|i| {
6665                                (
6666                                    BinEntry {
6667                                        data: b.entries[i].data.clone(),
6668                                        known_deleted: b.entries[i]
6669                                            .known_deleted,
6670                                        dirty: b.entries[i].dirty,
6671                                        expiration_time: b.entries[i]
6672                                            .expiration_time,
6673                                    },
6674                                    b.get_lsn(i),
6675                                    b.get_full_key(i).unwrap_or_default(),
6676                                )
6677                            })
6678                            .collect();
6679                        Some(full_entries)
6680                    }
6681                    _ => None,
6682                };
6683            }
6684
6685            let next = match &*guard {
6686                TreeNode::Internal(n) => {
6687                    if forward {
6688                        n.get_child(0)?
6689                    } else {
6690                        n.get_child(n.entries.len().saturating_sub(1))?
6691                    }
6692                }
6693                _ => return None,
6694            };
6695            // Take child read lock BEFORE releasing parent's.
6696            let next_guard = next.read_arc();
6697            drop(guard);
6698            guard = next_guard;
6699        }
6700    }
6701}
6702
6703// ============================================================================
6704// Tree statistics
6705// ============================================================================
6706
/// Structural statistics gathered by `Tree::collect_stats`.
///
/// The walk only follows children that are resident in memory, so the
/// counters describe the resident portion of the tree. Counterpart of JE
/// `TreeWalkerStatsAccumulator`.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct TreeStats {
    /// Number of BINs (bottom internal nodes) visited.
    pub n_bins: u64,
    /// Number of upper INs visited.
    pub n_ins: u64,
    /// Total number of entries across all visited nodes — BIN slots and
    /// upper-IN slots are both counted.
    pub n_entries: u64,
    /// Height of the tree (1 = root is a BIN, 2 = one level above BINs, …).
    /// Computed as the deepest visited depth plus one; 0 for an empty tree.
    pub height: u32,
}
6721
6722impl Tree {
6723    /// Walks the entire tree and collects structural statistics.
6724    ///
6725    /// `TreeWalkerStatsAccumulator` pattern — performs a simple
6726    /// recursive DFS and counts INs, BINs, entries, and tree height.
6727    pub fn collect_stats(&self) -> TreeStats {
6728        let mut stats = TreeStats::default();
6729        if let Some(root) = self.get_root() {
6730            Self::collect_stats_recursive(&root, &mut stats, 0);
6731        }
6732        stats
6733    }
6734
6735    fn collect_stats_recursive(
6736        node_arc: &Arc<RwLock<TreeNode>>,
6737        stats: &mut TreeStats,
6738        depth: u32,
6739    ) {
6740        let guard = node_arc.read();
6741
6742        let current_height = depth + 1;
6743        if current_height > stats.height {
6744            stats.height = current_height;
6745        }
6746
6747        match &*guard {
6748            TreeNode::Bottom(b) => {
6749                stats.n_bins += 1;
6750                stats.n_entries += b.entries.len() as u64;
6751            }
6752            TreeNode::Internal(n) => {
6753                stats.n_ins += 1;
6754                stats.n_entries += n.entries.len() as u64;
6755                // Collect child arcs before releasing the guard.
6756                let children: Vec<Arc<RwLock<TreeNode>>> =
6757                    n.resident_children();
6758                // Release guard before recursing to avoid lock ordering issues.
6759                drop(guard);
6760                for child in children {
6761                    Self::collect_stats_recursive(&child, stats, depth + 1);
6762                }
6763            }
6764        }
6765    }
6766
6767    /// Collects all dirty BINs as (Arc to node, db_id) pairs.
6768    ///
6769    /// The checkpoint path calls this to enumerate BINs that need to be
6770    /// logged.  For each dirty BIN the checkpoint decides — based on the
6771    /// BIN-delta threshold — whether to write a full `BIN` entry or a
6772    /// `BINDelta` entry.
6773    ///
6774    /// `Checkpointer.processINList()` which iterates the dirty
6775    /// IN list accumulated during normal operation.
6776    pub fn collect_dirty_bins(
6777        &self,
6778        db_id: u64,
6779    ) -> Vec<(u64, Arc<RwLock<TreeNode>>)> {
6780        let mut result = Vec::new();
6781        if let Some(root) = self.get_root() {
6782            Self::collect_dirty_bins_recursive(&root, db_id, &mut result);
6783        }
6784        result
6785    }
6786
6787    fn collect_dirty_bins_recursive(
6788        node_arc: &Arc<RwLock<TreeNode>>,
6789        db_id: u64,
6790        out: &mut Vec<(u64, Arc<RwLock<TreeNode>>)>,
6791    ) {
6792        let guard = node_arc.read();
6793        match &*guard {
6794            TreeNode::Bottom(b) => {
6795                // Include this BIN if it is dirty or has any dirty slots.
6796                if b.dirty || b.dirty_count() > 0 {
6797                    out.push((db_id, Arc::clone(node_arc)));
6798                }
6799            }
6800            TreeNode::Internal(n) => {
6801                let children: Vec<Arc<RwLock<TreeNode>>> =
6802                    n.resident_children();
6803                drop(guard);
6804                for child in children {
6805                    Self::collect_dirty_bins_recursive(&child, db_id, out);
6806                } // guard already dropped
6807            }
6808        }
6809    }
6810
6811    /// Collect all BINs that have at least one `known_deleted` slot.
6812    ///
6813    /// INCompressor queue-drain scan in the: the daemon iterates
6814    /// the in-memory IN list and identifies BINs that still hold zombie deleted
6815    /// slots.  Each returned `Arc` can be passed directly to `compress_bin()`.
6816    pub fn collect_bins_with_known_deleted(
6817        &self,
6818    ) -> Vec<Arc<RwLock<TreeNode>>> {
6819        let mut result = Vec::new();
6820        if let Some(root) = self.get_root() {
6821            Self::collect_bins_with_known_deleted_recursive(&root, &mut result);
6822        }
6823        result
6824    }
6825
6826    fn collect_bins_with_known_deleted_recursive(
6827        node_arc: &Arc<RwLock<TreeNode>>,
6828        out: &mut Vec<Arc<RwLock<TreeNode>>>,
6829    ) {
6830        let guard = node_arc.read();
6831        match &*guard {
6832            TreeNode::Bottom(b) => {
6833                if b.entries.iter().any(|e| e.known_deleted) {
6834                    out.push(Arc::clone(node_arc));
6835                }
6836            }
6837            TreeNode::Internal(n) => {
6838                let children: Vec<Arc<RwLock<TreeNode>>> =
6839                    n.resident_children();
6840                drop(guard);
6841                for child in children {
6842                    Self::collect_bins_with_known_deleted_recursive(
6843                        &child, out,
6844                    );
6845                }
6846            }
6847        }
6848    }
6849
6850    /// Collect all dirty upper (non-BIN) internal nodes, sorted ascending by
6851    /// level (bottom-up order, BIN level excluded).
6852    ///
6853    /// Serialise an upper-IN node (level > 1) by node_id for off-heap storage.
6854    ///
6855    /// Traverses the tree to find the internal node whose  matches,
6856    /// then calls  to produce a compact byte
6857    /// representation.  Returns  if the node is not found or is a BIN
6858    /// (BINs are not upper INs).
6859    ///
6860    /// Mirrors `OffHeapAllocator` serialises the same bytes that would be written
6861    /// to the log, allowing the evictor to store upper-INs off-heap and avoid
6862    /// log-file reads on the next traversal.
6863    pub fn serialize_upper_in(&self, node_id: u64) -> Option<Vec<u8>> {
6864        let root = self.get_root()?;
6865        Self::find_and_serialize_upper_in(&root, node_id)
6866    }
6867
6868    fn find_and_serialize_upper_in(
6869        node_arc: &Arc<RwLock<TreeNode>>,
6870        target_id: u64,
6871    ) -> Option<Vec<u8>> {
6872        let guard = node_arc.read();
6873        match &*guard {
6874            TreeNode::Bottom(_) => None, // BINs are not upper INs
6875            TreeNode::Internal(n) => {
6876                if n.node_id == target_id {
6877                    // Serialise InNodeStub for off-heap storage.
6878                    // Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE)
6879                    //   then per-entry: key_len(u32BE) | key | lsn(u64BE)
6880                    let mut buf = Vec::new();
6881                    buf.extend_from_slice(&n.node_id.to_be_bytes());
6882                    buf.extend_from_slice(&n.level.to_be_bytes());
6883                    buf.extend_from_slice(
6884                        &(n.entries.len() as u32).to_be_bytes(),
6885                    );
6886                    for (i, e) in n.entries.iter().enumerate() {
6887                        buf.extend_from_slice(
6888                            &(e.key.len() as u32).to_be_bytes(),
6889                        );
6890                        buf.extend_from_slice(&e.key);
6891                        buf.extend_from_slice(
6892                            &n.get_lsn(i).as_u64().to_be_bytes(),
6893                        );
6894                    }
6895                    return Some(buf);
6896                }
6897                // Recurse into children before releasing the guard so we
6898                // hold the minimum read-lock duration.
6899                let children: Vec<Arc<RwLock<TreeNode>>> =
6900                    n.resident_children();
6901                drop(guard);
6902                for child in &children {
6903                    if let Some(bytes) =
6904                        Self::find_and_serialize_upper_in(child, target_id)
6905                    {
6906                        return Some(bytes);
6907                    }
6908                }
6909                None
6910            }
6911        }
6912    }
6913
6914    /// Upper-IN traversal in `Checkpointer.processINList()` from
6915    /// — visits all `TreeNode::Internal` nodes whose `dirty` flag is set
6916    /// and returns them together with their level, sorted lowest-level-first
6917    /// so the checkpointer can log them bottom-up.  The root is always the
6918    /// last entry (highest level), which must be logged `Provisional::No`.
6919    pub fn collect_dirty_upper_ins(
6920        &self,
6921        _db_id: u64,
6922    ) -> Vec<(i32, Arc<RwLock<TreeNode>>)> {
6923        let mut result: Vec<(i32, Arc<RwLock<TreeNode>>)> = Vec::new();
6924        if let Some(root) = self.get_root() {
6925            Self::collect_dirty_upper_ins_recursive(&root, &mut result);
6926        }
6927        result.sort_by_key(|(level, _)| *level);
6928        result
6929    }
6930
6931    fn collect_dirty_upper_ins_recursive(
6932        node_arc: &Arc<RwLock<TreeNode>>,
6933        out: &mut Vec<(i32, Arc<RwLock<TreeNode>>)>,
6934    ) {
6935        let guard = node_arc.read();
6936        match &*guard {
6937            TreeNode::Bottom(_) => {
6938                // BINs are handled by flush_dirty_bins_internal; skip here.
6939            }
6940            TreeNode::Internal(n) => {
6941                let is_dirty = n.dirty;
6942                // REC-AA: return the node's ACTUAL tree level (n.level, in
6943                // MAIN_LEVEL|n units), not a root-relative depth.  The level
6944                // must be on the same scale as a BIN's `level` (BIN_LEVEL =
6945                // MAIN_LEVEL|1) so that the checkpointer's flush-level
6946                // computation and the evictor's `node_level < flush_level`
6947                // comparison are meaningful.  With a root-relative depth the
6948                // root had the SMALLEST value (0) and the IN above the BINs
6949                // the LARGEST, inverting the provisional/non-provisional
6950                // boundary; with n.level the root has the largest level, as JE
6951                // expects.
6952                let level = n.level;
6953                let children: Vec<Arc<RwLock<TreeNode>>> =
6954                    n.resident_children();
6955                drop(guard);
6956                // Recurse into children first (bottom-up ordering).
6957                for child in &children {
6958                    Self::collect_dirty_upper_ins_recursive(child, out);
6959                }
6960                // Add this node after children (so parent comes after all descendants).
6961                if is_dirty {
6962                    out.push((level, Arc::clone(node_arc)));
6963                }
6964            }
6965        }
6966    }
6967
6968    // ========================================================================
6969    // Tree.java ports: 8 additional tree methods (Task #82)
6970    // ========================================================================
6971
6972    /// Returns `true` if the root node is currently loaded in memory.
6973    ///
6974    /// .
6975    pub fn is_root_resident(&self) -> bool {
6976        self.root.read().is_some()
6977    }
6978
6979    /// Returns the root node `Arc` if present, or `None`.
6980    ///
6981    /// .
6982    pub fn get_resident_root_in(&self) -> Option<Arc<RwLock<TreeNode>>> {
6983        self.root.read().clone()
6984    }
6985
6986    /// Returns the BIN that should contain a slot for `key` (the "parent" of
6987    /// LN slots).
6988    ///
6989    /// .  Descends the tree
6990    /// exactly like `search()` and returns the leaf-level BIN arc, or `None`
6991    /// if the tree is empty.
6992    ///
6993    /// Uses `read_arc()` hand-over-hand on the descent — the child
6994    /// guard is taken before the parent guard is dropped, matching
6995    /// `search()`. Returns the BIN Arc with no read lock held; the
6996    /// caller must take whatever lock it needs to operate on the
6997    /// returned BIN.
6998    pub fn get_parent_bin_for_child_ln(
6999        &self,
7000        key: &[u8],
7001    ) -> Option<Arc<RwLock<TreeNode>>> {
7002        let root = self.get_root()?;
7003        let mut current_arc: Arc<RwLock<TreeNode>> = root.clone();
7004        let mut guard: parking_lot::ArcRwLockReadGuard<
7005            parking_lot::RawRwLock,
7006            TreeNode,
7007        > = root.read_arc();
7008
7009        loop {
7010            if guard.is_bin() {
7011                drop(guard);
7012                return Some(current_arc);
7013            }
7014
7015            let parent_arc = current_arc.clone();
7016            let next_idx = match &*guard {
7017                TreeNode::Internal(n) => {
7018                    if n.entries.is_empty() {
7019                        return None;
7020                    }
7021                    let idx = self.upper_in_floor_index(&n.entries, key);
7022                    match n.get_child(idx) {
7023                        Some(c) => {
7024                            let next_guard = c.read_arc();
7025                            drop(guard);
7026                            current_arc = c;
7027                            guard = next_guard;
7028                            continue;
7029                        }
7030                        None => idx, // EV-14/EV-13: re-fetch below.
7031                    }
7032                }
7033                TreeNode::Bottom(_) => {
7034                    unreachable!("is_bin() returned false above")
7035                }
7036            };
7037            // Hand-over-hand: take child guard before dropping parent.
7038            drop(guard);
7039            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
7040            let next_guard = child.read_arc();
7041            current_arc = child;
7042            guard = next_guard;
7043        }
7044    }
7045
7046    /// Returns the BIN where `key` should be inserted.
7047    ///
7048    /// .  Semantically identical to
7049    /// `get_parent_bin_for_child_ln` — expressed as a separate method to match
7050    /// API surface.
7051    ///
7052    /// Implemented as a delegation to `get_parent_bin_for_child_ln`,
7053    /// which uses `read_arc()` hand-over-hand on the descent.
7054    pub fn find_bin_for_insert(
7055        &self,
7056        key: &[u8],
7057    ) -> Option<Arc<RwLock<TreeNode>>> {
7058        self.get_parent_bin_for_child_ln(key)
7059    }
7060
7061    /// Search for a BIN, allowing splits during descent (preemptive splitting).
7062    ///
7063    /// .  This thin wrapper
7064    /// delegates to `search()` and returns the result wrapped in `Some`.
7065    /// The full split-allowed descent is performed by `insert()` internally;
7066    /// this method exposes the same result type for callers that only need to
7067    /// locate the BIN.
7068    ///
7069    /// Returns `None` if the tree is empty.
7070    pub fn search_splits_allowed(&self, key: &[u8]) -> Option<SearchResult> {
7071        self.search(key)
7072    }
7073
7074    /// Traverses the entire tree and returns every IN and BIN node as a flat
7075    /// list.
7076    ///
7077    /// .  Used by recovery to rebuild
7078    /// the in-memory IN list after log replay.  The walk is a BFS from the
7079    /// root; every `Arc<RwLock<TreeNode>>` encountered (both Internal and
7080    /// Bottom variants) is included in the result.
7081    pub fn rebuild_in_list(&self) -> Vec<Arc<RwLock<TreeNode>>> {
7082        let mut result = Vec::new();
7083        if let Some(root) = self.get_root() {
7084            Self::rebuild_in_list_recursive(&root, &mut result);
7085        }
7086        result
7087    }
7088
7089    fn rebuild_in_list_recursive(
7090        node_arc: &Arc<RwLock<TreeNode>>,
7091        out: &mut Vec<Arc<RwLock<TreeNode>>>,
7092    ) {
7093        // Push this node unconditionally — both INs and BINs belong in the list.
7094        out.push(Arc::clone(node_arc));
7095
7096        let guard = node_arc.read();
7097
7098        if let TreeNode::Internal(n) = &*guard {
7099            // Collect child arcs while holding the guard, then drop it before
7100            // recursing to avoid holding multiple locks simultaneously.
7101            let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
7102            drop(guard);
7103            for child in children {
7104                Self::rebuild_in_list_recursive(&child, out);
7105            }
7106        }
7107        // BIN nodes are leaves — no children to recurse into.
7108    }
7109
7110    /// Validates internal tree consistency.
7111    ///
7112    /// .  Primarily a debug/test tool.
7113    ///
7114    /// Rules checked:
7115    /// - An empty tree (no root) is trivially valid → returns `true`.
7116    /// - A non-empty tree must have a non-null root.
7117    /// - Every Internal node must have at least one entry.
7118    /// - Every child pointer that is `Some` must be readable (lock must be
7119    ///   acquirable — i.e., no poisoned locks).
7120    ///
7121    /// Returns `true` if no inconsistencies are detected, `false` otherwise.
7122    pub fn validate_in_list(&self) -> bool {
7123        match self.get_root() {
7124            None => true, // empty tree is always valid
7125            Some(root) => Self::validate_node(&root),
7126        }
7127    }
7128
7129    fn validate_node(node_arc: &Arc<RwLock<TreeNode>>) -> bool {
7130        let guard = node_arc.read();
7131
7132        match &*guard {
7133            TreeNode::Bottom(_bin) => {
7134                // BIN nodes are always structurally valid at this level.
7135                true
7136            }
7137            TreeNode::Internal(n) => {
7138                // An Internal node must have at least one entry.
7139                if n.entries.is_empty() {
7140                    return false;
7141                }
7142                // Collect child arcs before dropping the guard.
7143                let children: Vec<Arc<RwLock<TreeNode>>> =
7144                    n.resident_children();
7145                drop(guard);
7146                // Recursively validate every resident child.
7147                for child in children {
7148                    if !Self::validate_node(&child) {
7149                        return false;
7150                    }
7151                }
7152                true
7153            }
7154        }
7155    }
7156
7157    /// Traverses the tree to find the parent IN that contains `child_node_id`
7158    /// as one of its child slots.
7159    ///
7160    /// .  Used by the cleaner
7161    /// migration path to re-insert migrated INs after eviction/fetch.
7162    ///
7163    /// Returns `(parent_arc, slot_index)` where `slot_index` is the position
7164    /// in the parent's `entries` vector whose child matches `child_node_id`,
7165    /// or `None` if no such parent is found.
7166    pub fn get_parent_in_for_child_in(
7167        &self,
7168        child_node_id: u64,
7169    ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7170        let root = self.get_root()?;
7171        Self::find_parent_of_node_id(&root, child_node_id)
7172    }
7173
7174    /// Recursive DFS helper for `get_parent_in_for_child_in`.
7175    ///
7176    /// Scans every entry in each Internal node.  When a child's node_id
7177    /// matches `target_id` the parent arc and slot index are returned.
7178    fn find_parent_of_node_id(
7179        node_arc: &Arc<RwLock<TreeNode>>,
7180        target_id: u64,
7181    ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7182        let guard = node_arc.read();
7183
7184        let TreeNode::Internal(n) = &*guard else {
7185            // BIN nodes have no IN children — cannot be a parent of another IN.
7186            return None;
7187        };
7188
7189        // Check whether any child of this IN has the target node_id.
7190        let mut children: Vec<(usize, Arc<RwLock<TreeNode>>)> = Vec::new();
7191        for slot in 0..n.entries.len() {
7192            if let Some(child_arc) = n.child_ref(slot) {
7193                // Read the child's node_id under a separate lock (acquire child
7194                // while parent guard is still held — this is intentional for
7195                // the ID comparison only; we release both immediately after).
7196                let child_id = {
7197                    let cg = child_arc.read();
7198                    match &*cg {
7199                        TreeNode::Internal(cn) => cn.node_id,
7200                        TreeNode::Bottom(cb) => cb.node_id,
7201                    }
7202                };
7203
7204                if child_id == target_id {
7205                    // Found — return a clone of this node as parent.
7206                    let parent_clone = Arc::clone(node_arc);
7207                    return Some((parent_clone, slot));
7208                }
7209
7210                // Not found at this slot; schedule this child for recursion.
7211                children.push((slot, Arc::clone(child_arc)));
7212            }
7213        }
7214        // Release parent guard before recursing.
7215        drop(guard);
7216
7217        // Recurse into each Internal child.
7218        for (_slot, child_arc) in children {
7219            if let Some(result) =
7220                Self::find_parent_of_node_id(&child_arc, target_id)
7221            {
7222                return Some(result);
7223            }
7224        }
7225
7226        None
7227    }
7228
7229    /// Propagates the dirty flag upward from `node_arc` to the root.
7230    ///
7231    /// Implicit dirty propagation: after modifying any node,
7232    /// all ancestors on the path to the root must also be marked dirty so
7233    /// the checkpointer logs them.
7234    ///
7235    /// In this happens through `IN.setDirty(true)` calls at each level
7236    /// during split/insert callbacks.  Here we walk the weak parent chain.
7237    /// Reconstitute a BIN-delta by merging it onto a base full BIN.
7238    ///
7239    /// Implements JE `BINDelta.reconstituteBIN(databaseImpl)` for the recovery
7240    /// path where the log manager is not available as a `LogManager` but as
7241    /// raw serialized bytes.
7242    ///
7243    /// Algorithm:
7244    /// 1. Deserialise `base_bytes` as a full `BinStub`.
7245    /// 2. Apply `delta_bytes` slots onto the base using `BinStub::apply_delta`
7246    ///    (raw slot overlay).
7247    /// 3. Recompute key prefix so prefix-compressed entries are consistent.
7248    ///
7249    /// Returns `None` if either byte slice is malformed.
7250    ///
7251    /// JE `BINDelta.reconstituteBIN` / `BINDelta.applyDelta`
7252    /// (DRIFT-10 / Stage 3).
7253    pub fn reconstitute_bin_delta(
7254        base_bytes: &[u8],
7255        delta_bytes: &[u8],
7256    ) -> Option<BinStub> {
7257        let mut base = BinStub::deserialize_full(base_bytes)?;
7258        // Apply the delta slots onto the base.
7259        // Note: BinStub::apply_delta uses slot-index addressing into base.entries,
7260        // extending with new entries when the slot_idx >= base.entries.len().
7261        // After apply_delta we recompute the key prefix to fix prefix compression.
7262        BinStub::apply_delta(&mut base, delta_bytes)?;
7263        // Recompute prefix so prefix-compressed BINs are consistent after merge.
7264        base.recompute_key_prefix();
7265        base.is_delta = false;
7266        base.dirty = false;
7267        Some(base)
7268    }
7269
7270    pub fn propagate_dirty_to_root(node_arc: &Arc<RwLock<TreeNode>>) {
7271        let parent_weak = { node_arc.read().get_parent() };
7272
7273        if let Some(parent_arc) = parent_weak.and_then(|w| w.upgrade()) {
7274            {
7275                let mut g = parent_arc.write();
7276                g.set_dirty(true);
7277            }
7278            // Recurse further up.
7279            Self::propagate_dirty_to_root(&parent_arc);
7280        }
7281    }
7282
7283    // ========================================================================
7284    // IN-redo: JE RecoveryManager.recoverIN / recoverRootIN / recoverChildIN
7285    // ========================================================================
7286
7287    /// Deserialise an upper-IN node from bytes produced by
7288    /// `TreeNode::write_to_bytes()` / `flush_one_tree_upper_ins`.
7289    ///
7290    /// Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE) | dirty(u8)
7291    ///   | per-entry: key_len(u16BE) | key | lsn(u64BE)
7292    ///
7293    /// JE `INFileReader.getIN(db)` / `IN.readFromLog`.
7294    pub fn deserialize_upper_in(bytes: &[u8]) -> Option<InNodeStub> {
7295        if bytes.len() < 13 {
7296            return None;
7297        }
7298        let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
7299        let level = i32::from_be_bytes(bytes[8..12].try_into().ok()?);
7300        let n_entries =
7301            u32::from_be_bytes(bytes[12..16].try_into().ok()?) as usize;
7302        // dirty byte (1 byte after n_entries)
7303        if bytes.len() < 17 {
7304            return None;
7305        }
7306        let mut pos = 17usize; // skip node_id(8) + level(4) + n_entries(4) + dirty(1)
7307        let mut entries = Vec::with_capacity(n_entries);
7308        let mut lsns: Vec<Lsn> = Vec::with_capacity(n_entries);
7309        for _ in 0..n_entries {
7310            if pos + 2 > bytes.len() {
7311                return None;
7312            }
7313            let key_len =
7314                u16::from_be_bytes(bytes[pos..pos + 2].try_into().ok()?)
7315                    as usize;
7316            pos += 2;
7317            if pos + key_len > bytes.len() {
7318                return None;
7319            }
7320            let key = bytes[pos..pos + key_len].to_vec();
7321            pos += key_len;
7322            if pos + 8 > bytes.len() {
7323                return None;
7324            }
7325            let lsn = noxu_util::Lsn::from_u64(u64::from_be_bytes(
7326                bytes[pos..pos + 8].try_into().ok()?,
7327            ));
7328            pos += 8;
7329            entries.push(InEntry { key });
7330            lsns.push(lsn); // T-3
7331        }
7332        Some(InNodeStub {
7333            node_id,
7334            level,
7335            entries,
7336            // T-4: a freshly deserialized IN has no resident children.
7337            targets: TargetRep::None,
7338            dirty: false,
7339            generation: 0,
7340            parent: None,
7341            lsn_rep: LsnRep::from_lsns(&lsns), // T-3
7342        })
7343    }
7344
7345    /// Deserialise a BIN from bytes produced by `BinStub::serialize_full()`.
7346    ///
7347    /// Thin wrapper so the recovery path does not need to import `BinStub`
7348    /// directly from callers that only have the raw bytes.
7349    ///
7350    /// JE `INFileReader.getIN(db)` for a BIN entry.
7351    pub fn deserialize_bin(bytes: &[u8]) -> Option<BinStub> {
7352        let mut bin = BinStub::deserialize_full(bytes)?;
7353        bin.dirty = false; // freshly loaded from log — clean for now
7354        Some(bin)
7355    }
7356
7357    /// Apply a logged IN/BIN to the in-memory tree during the recovery redo pass.
7358    ///
7359    /// Implements JE `RecoveryManager.recoverIN`:
7360    /// - `is_root` nodes are handled by `recover_root_in`.
7361    /// - non-root nodes are handled by `recover_child_in`.
7362    ///
7363    /// `log_lsn` is the LSN at which this IN/BIN was logged.  The currency
7364    /// check in `recover_child_in` uses this to decide whether to replace the
7365    /// in-memory slot (tree slot LSN < log_lsn → replace; equal → noop;
7366    /// greater → skip).
7367    ///
7368    /// JE `RecoveryManager.recoverIN` / `replayOneIN`
7369    /// (RecoveryManager.java ~lines 1200–1280).
7370    pub fn recover_in_redo(
7371        &self,
7372        log_lsn: noxu_util::Lsn,
7373        is_root: bool,
7374        is_bin: bool,
7375        node_data: &[u8],
7376    ) -> InRedoResult {
7377        if is_bin {
7378            let Some(bin) = Self::deserialize_bin(node_data) else {
7379                return InRedoResult::DeserializeFailed;
7380            };
7381            if is_root {
7382                self.recover_root_bin(log_lsn, bin)
7383            } else {
7384                self.recover_child_bin(log_lsn, bin)
7385            }
7386        } else {
7387            let Some(upper) = Self::deserialize_upper_in(node_data) else {
7388                return InRedoResult::DeserializeFailed;
7389            };
7390            if is_root {
7391                self.recover_root_upper_in(log_lsn, upper)
7392            } else {
7393                self.recover_child_upper_in(log_lsn, upper)
7394            }
7395        }
7396    }
7397
7398    /// Recover a root BIN.
7399    ///
7400    /// If no root exists or the existing root is older (lower LSN), install
7401    /// this BIN as the new root.
7402    ///
7403    /// JE `RecoveryManager.recoverRootIN` / `RootUpdater.doWork`
7404    /// (RecoveryManager.java ~lines 1293–1410).
7405    fn recover_root_bin(
7406        &self,
7407        log_lsn: noxu_util::Lsn,
7408        bin: BinStub,
7409    ) -> InRedoResult {
7410        let mut root_guard = self.root.write();
7411        let existing_lsn = *self.root_log_lsn.read();
7412        match &*root_guard {
7413            None => {
7414                // No root — install this BIN as the root.
7415                // JE: `root == null` case in `RootUpdater.doWork`.
7416                let node = TreeNode::Bottom(bin);
7417                *root_guard = Some(Arc::new(RwLock::new(node)));
7418                *self.root_log_lsn.write() = log_lsn;
7419                InRedoResult::Inserted
7420            }
7421            Some(_) => {
7422                // JE: `originalLsn = root.getLsn()`; replace if logLsn > originalLsn.
7423                if log_lsn > existing_lsn {
7424                    let node = TreeNode::Bottom(bin);
7425                    *root_guard = Some(Arc::new(RwLock::new(node)));
7426                    *self.root_log_lsn.write() = log_lsn;
7427                    InRedoResult::Replaced
7428                } else {
7429                    InRedoResult::Skipped
7430                }
7431            }
7432        }
7433    }
7434
7435    /// Recover a root upper IN.
7436    ///
7437    /// JE `RecoveryManager.recoverRootIN` for a non-BIN root.
7438    fn recover_root_upper_in(
7439        &self,
7440        log_lsn: noxu_util::Lsn,
7441        upper: InNodeStub,
7442    ) -> InRedoResult {
7443        let mut root_guard = self.root.write();
7444        let existing_lsn = *self.root_log_lsn.read();
7445        match &*root_guard {
7446            None => {
7447                let node = TreeNode::Internal(upper);
7448                *root_guard = Some(Arc::new(RwLock::new(node)));
7449                *self.root_log_lsn.write() = log_lsn;
7450                InRedoResult::Inserted
7451            }
7452            Some(_) => {
7453                if log_lsn > existing_lsn {
7454                    let node = TreeNode::Internal(upper);
7455                    *root_guard = Some(Arc::new(RwLock::new(node)));
7456                    *self.root_log_lsn.write() = log_lsn;
7457                    InRedoResult::Replaced
7458                } else {
7459                    InRedoResult::Skipped
7460                }
7461            }
7462        }
7463    }
7464
7465    /// Recover a non-root BIN.
7466    ///
7467    /// Implements the three-case currency check from JE
7468    /// `RecoveryManager.recoverChildIN`
7469    /// (RecoveryManager.java lines 1412–1500):
7470    ///
7471    /// 1. Node not in tree: skip (parent logged a later structure that already
7472    ///    omits this node, or node was deleted).
7473    /// 2. Physical match (slot LSN == log_lsn): noop — already current.
7474    /// 3. Logical match: another version of the node is in the slot.
7475    ///    Replace if tree slot LSN < log_lsn (tree is older), skip otherwise.
7476    fn recover_child_bin(
7477        &self,
7478        log_lsn: noxu_util::Lsn,
7479        bin: BinStub,
7480    ) -> InRedoResult {
7481        let node_id = bin.node_id;
7482        let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7483        else {
7484            // Case 1: not in tree.
7485            return InRedoResult::NotInTree;
7486        };
7487        let mut parent = parent_arc.write();
7488        let TreeNode::Internal(ref mut p) = *parent else {
7489            return InRedoResult::NotInTree;
7490        };
7491        let tree_lsn = p.get_lsn(slot); // T-3
7492        if tree_lsn == log_lsn {
7493            // Case 2: physical match — noop.
7494            InRedoResult::Skipped
7495        } else if tree_lsn < log_lsn {
7496            // Case 3: logical match, tree is older — replace.
7497            // JE `parent.recoverIN(idx, inFromLog, logLsn, lastLoggedSize)`.
7498            let new_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin)));
7499            // Set parent back-pointer on the new node.
7500            {
7501                let mut ng = new_arc.write();
7502                if let TreeNode::Bottom(ref mut b) = *ng {
7503                    b.parent = Some(Arc::downgrade(&parent_arc));
7504                }
7505            }
7506            p.set_child(slot, Some(new_arc));
7507            p.set_lsn(slot, log_lsn); // T-3
7508            InRedoResult::Replaced
7509        } else {
7510            // tree_lsn > log_lsn: tree already holds a newer version.
7511            InRedoResult::Skipped
7512        }
7513    }
7514
7515    /// Recover a non-root upper IN.
7516    ///
7517    /// JE `RecoveryManager.recoverChildIN` for a non-BIN node.
7518    fn recover_child_upper_in(
7519        &self,
7520        log_lsn: noxu_util::Lsn,
7521        upper: InNodeStub,
7522    ) -> InRedoResult {
7523        let node_id = upper.node_id;
7524        let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7525        else {
7526            return InRedoResult::NotInTree;
7527        };
7528        let mut parent = parent_arc.write();
7529        let TreeNode::Internal(ref mut p) = *parent else {
7530            return InRedoResult::NotInTree;
7531        };
7532        let tree_lsn = p.get_lsn(slot); // T-3
7533        if tree_lsn == log_lsn {
7534            InRedoResult::Skipped
7535        } else if tree_lsn < log_lsn {
7536            let new_arc = Arc::new(RwLock::new(TreeNode::Internal(upper)));
7537            {
7538                let mut ng = new_arc.write();
7539                if let TreeNode::Internal(ref mut n) = *ng {
7540                    n.parent = Some(Arc::downgrade(&parent_arc));
7541                }
7542            }
7543            p.set_child(slot, Some(new_arc));
7544            p.set_lsn(slot, log_lsn); // T-3
7545            InRedoResult::Replaced
7546        } else {
7547            InRedoResult::Skipped
7548        }
7549    }
7550}
7551
/// Outcome of one `recover_in_redo` call.
///
/// JE traces the same outcomes in `RecoveryManager` debug logging.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InRedoResult {
    /// The logged node became the tree root because no root existed.
    Inserted,
    /// The logged node took the place of an older version in the tree.
    Replaced,
    /// Nothing was applied: the tree already holds an equal or newer version.
    Skipped,
    /// The node has no slot in the tree (the parent logged a later structure
    /// that excludes it).
    NotInTree,
    /// The `node_data` bytes could not be deserialised.
    DeserializeFailed,
}
7568
/// Process-wide counter from which node IDs are drawn; it starts at 1.
///
/// This is the SINGLE source of node-ids for the whole tree subsystem.  The
/// BIN constructor (`bin.rs`) and `node.rs` route through `generate_node_id`
/// so that, after crash recovery, a freshly allocated node-id is always
/// strictly greater than every node-id present in the recovered log.
///
/// JE ref: `NodeSequence.getNextLocalNodeId` (a single per-env counter) and
/// `IN.nodeId` allocation; `NodeSequence.initRealNodeId` seeds the counter
/// from the recovered `CheckpointEnd.lastLocalNodeId`.  The env seeds this
/// counter post-recovery via `seed_node_id_counter`.
static NODE_ID_COUNTER: AtomicU64 = AtomicU64::new(1);
7582
7583/// Generates a unique node ID.
7584pub fn generate_node_id() -> u64 {
7585    NODE_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
7586}
7587
7588/// Returns the node-id that would be generated next (without allocating it).
7589///
7590/// Used by recovery seeding and by tests to assert no node-id reuse after a
7591/// restart.
7592pub fn peek_next_node_id_counter() -> u64 {
7593    NODE_ID_COUNTER.load(std::sync::atomic::Ordering::SeqCst)
7594}
7595
7596/// Seeds the node-id counter so the next generated id is `> last_node_id`.
7597///
7598/// Called by `EnvironmentImpl` after recovery with the recovered
7599/// `use_max_node_id`, mirroring `NodeSequence.initRealNodeId` /
7600/// `setLastNodeId`: post-restart allocation must never reuse a node-id that
7601/// is already in the log.  Monotonic: never lowers the counter.
7602pub fn seed_node_id_counter(last_node_id: u64) {
7603    let want_next = last_node_id.saturating_add(1);
7604    // Bump only if our current next is below the recovered floor.
7605    let mut cur = NODE_ID_COUNTER.load(std::sync::atomic::Ordering::SeqCst);
7606    while cur < want_next {
7607        match NODE_ID_COUNTER.compare_exchange_weak(
7608            cur,
7609            want_next,
7610            std::sync::atomic::Ordering::SeqCst,
7611            std::sync::atomic::Ordering::SeqCst,
7612        ) {
7613            Ok(_) => break,
7614            Err(observed) => cur = observed,
7615        }
7616    }
7617}
7618
7619#[cfg(test)]
7620mod tests {
7621    use super::*;
7622
7623    // ====================================================================
7624    // T-3: LsnRep packed-LSN encoding (IN.entryLsnByteArray / getLsn /
7625    // setLsnInternal, IN.java:1752-1935).
7626    // ====================================================================
7627
7628    /// All-NULL node uses the 0-byte Empty rep; reads return NULL_LSN.
7629    #[test]
7630    fn lsnrep_empty_is_zero_bytes() {
7631        let rep = LsnRep::new(64);
7632        assert!(matches!(rep, LsnRep::Empty));
7633        assert_eq!(rep.memory_size(), 0);
7634        assert_eq!(rep.get(0), NULL_LSN);
7635        assert_eq!(rep.get(63), NULL_LSN);
7636    }
7637
7638    /// LSNs sharing a file number pack to the Compact rep (4 bytes/slot,
7639    /// base_file_number-relative) and round-trip exactly.
7640    #[test]
7641    fn lsnrep_compact_roundtrip_same_file() {
7642        let mut rep = LsnRep::new(8);
7643        for i in 0..8u32 {
7644            rep.set(i as usize, Lsn::new(7, 1000 + i), 8);
7645        }
7646        assert!(matches!(rep, LsnRep::Compact { .. }));
7647        for i in 0..8u32 {
7648            assert_eq!(rep.get(i as usize), Lsn::new(7, 1000 + i));
7649        }
7650        // 8 slots * 4 bytes = 32 bytes, far below 8 * 8 = 64 for raw u64.
7651        assert_eq!(rep.memory_size(), 8 * 4);
7652    }
7653
7654    /// NULL_LSN is stored via the 0xffffff file-offset sentinel, NOT u64::MAX,
7655    /// so a node with NULL slots still packs Compact (the blocker JE solves).
7656    #[test]
7657    fn lsnrep_null_does_not_force_long() {
7658        let mut rep = LsnRep::new(4);
7659        rep.set(0, Lsn::new(3, 50), 4);
7660        rep.set(1, NULL_LSN, 4);
7661        rep.set(2, Lsn::new(3, 60), 4);
7662        rep.set(3, NULL_LSN, 4);
7663        assert!(
7664            matches!(rep, LsnRep::Compact { .. }),
7665            "NULL slots must NOT force the Long rep"
7666        );
7667        assert_eq!(rep.get(0), Lsn::new(3, 50));
7668        assert_eq!(rep.get(1), NULL_LSN);
7669        assert_eq!(rep.get(2), Lsn::new(3, 60));
7670        assert_eq!(rep.get(3), NULL_LSN);
7671    }
7672
7673    /// base_file_number tracks the minimum; setting a lower file number
7674    /// re-bases the whole array (adjustFileNumbers) while staying Compact.
7675    #[test]
7676    fn lsnrep_rebase_on_lower_file_number() {
7677        let mut rep = LsnRep::new(3);
7678        rep.set(0, Lsn::new(10, 5), 3);
7679        rep.set(1, Lsn::new(12, 6), 3);
7680        // A lower file number re-bases base_file_number to 8.
7681        rep.set(2, Lsn::new(8, 7), 3);
7682        assert!(matches!(rep, LsnRep::Compact { .. }));
7683        assert_eq!(rep.get(0), Lsn::new(10, 5));
7684        assert_eq!(rep.get(1), Lsn::new(12, 6));
7685        assert_eq!(rep.get(2), Lsn::new(8, 7));
7686    }
7687
7688    /// A file-number spread > 127 forces the Long fallback (mutateToLongArray),
7689    /// still round-tripping every slot.
7690    #[test]
7691    fn lsnrep_mutates_to_long_on_wide_file_range() {
7692        let mut rep = LsnRep::new(2);
7693        rep.set(0, Lsn::new(1, 5), 2);
7694        rep.set(1, Lsn::new(1000, 6), 2); // diff 999 > 127 -> Long
7695        assert!(matches!(rep, LsnRep::Long(_)));
7696        assert_eq!(rep.get(0), Lsn::new(1, 5));
7697        assert_eq!(rep.get(1), Lsn::new(1000, 6));
7698    }
7699
7700    /// A file offset > MAX_FILE_OFFSET (0xfffffe) forces the Long fallback.
7701    #[test]
7702    fn lsnrep_mutates_to_long_on_large_offset() {
7703        let mut rep = LsnRep::new(2);
7704        rep.set(0, Lsn::new(1, 10), 2);
7705        rep.set(1, Lsn::new(1, 0x00ff_ffff), 2); // > MAX_FILE_OFFSET -> Long
7706        assert!(matches!(rep, LsnRep::Long(_)));
7707        assert_eq!(rep.get(1), Lsn::new(1, 0x00ff_ffff));
7708    }
7709
7710    /// insert_shift / remove_shift keep slots aligned (INArrayRep.copy).
7711    #[test]
7712    fn lsnrep_insert_and_remove_shift() {
7713        let mut rep = LsnRep::from_lsns(&[
7714            Lsn::new(2, 1),
7715            Lsn::new(2, 2),
7716            Lsn::new(2, 3),
7717        ]);
7718        // Insert a new slot at index 1.
7719        rep.insert_shift(1, 4);
7720        rep.set(1, Lsn::new(2, 99), 4);
7721        assert_eq!(rep.get(0), Lsn::new(2, 1));
7722        assert_eq!(rep.get(1), Lsn::new(2, 99));
7723        assert_eq!(rep.get(2), Lsn::new(2, 2));
7724        assert_eq!(rep.get(3), Lsn::new(2, 3));
7725        // Remove slot 1.
7726        rep.remove_shift(1);
7727        assert_eq!(rep.get(0), Lsn::new(2, 1));
7728        assert_eq!(rep.get(1), Lsn::new(2, 2));
7729        assert_eq!(rep.get(2), Lsn::new(2, 3));
7730    }
7731
7732    #[test]
7733    fn test_empty_tree() {
7734        let tree = Tree::new(1, 128);
7735        assert!(tree.is_empty());
7736        assert_eq!(tree.get_database_id(), 1);
7737        assert_eq!(tree.get_root_splits(), 0);
7738    }
7739
7740    #[test]
7741    fn test_redo_insert_older_lsn_does_not_overwrite_newer_slot() {
7742        // REC-F2 reproduce-first: redo() must be idempotent w.r.t. slot
7743        // currency.  JE RecoveryManager.redo() (line ~2512/2544) only
7744        // replaces a slot when logrecLsn > treeLsn.  A later redo of an
7745        // OLDER committed LN for the same key must NOT revert the slot to
7746        // the older value or reset the slot LSN backward.
7747        let tree = Tree::new(1, 128);
7748        let key = b"k".to_vec();
7749
7750        // Install the newer version at LSN X (e.g. the BIN-logged value).
7751        let newer = Lsn::new(5, 500);
7752        tree.redo_insert(&key, b"new", newer).unwrap();
7753
7754        // Replay an OLDER committed LN at Y < X for the same key.
7755        let older = Lsn::new(2, 200);
7756        tree.redo_insert(&key, b"old", older).unwrap();
7757
7758        // The newer value and LSN must survive.
7759        let got = tree.search_with_data(&key).expect("key present");
7760        assert!(got.found);
7761        assert_eq!(
7762            got.data.as_deref(),
7763            Some(&b"new"[..]),
7764            "older-LSN redo reverted committed data"
7765        );
7766        assert_eq!(
7767            got.lsn,
7768            newer.as_u64(),
7769            "older-LSN redo reset slot LSN backward"
7770        );
7771
7772        // A redo at a strictly NEWER LSN must still replace (replace-only
7773        // when log_lsn > slot_lsn, matching JE lsnCmp > 0).
7774        let newest = Lsn::new(9, 900);
7775        tree.redo_insert(&key, b"newest", newest).unwrap();
7776        let got = tree.search_with_data(&key).expect("key present");
7777        assert_eq!(got.data.as_deref(), Some(&b"newest"[..]));
7778        assert_eq!(got.lsn, newest.as_u64());
7779    }
7780
7781    #[test]
7782    fn test_insert_single() {
7783        let tree = Tree::new(1, 128);
7784        let key = b"testkey".to_vec();
7785        let data = b"testdata".to_vec();
7786        let lsn = Lsn::new(1, 100);
7787
7788        let result = tree.insert(key.clone(), data, lsn);
7789        assert!(result.is_ok());
7790        assert!(result.unwrap()); // Should be a new insert
7791
7792        assert!(!tree.is_empty());
7793
7794        // Verify we can search for it
7795        let search_result = tree.search(&key);
7796        assert!(search_result.is_some());
7797        let sr = search_result.unwrap();
7798        assert!(sr.exact_parent_found || !sr.child_not_resident);
7799    }
7800
7801    #[test]
7802    fn test_insert_multiple() {
7803        let tree = Tree::new(1, 128);
7804
7805        let keys = vec![
7806            b"apple".to_vec(),
7807            b"banana".to_vec(),
7808            b"cherry".to_vec(),
7809            b"date".to_vec(),
7810        ];
7811
7812        for (i, key) in keys.iter().enumerate() {
7813            let data = format!("data{}", i).into_bytes();
7814            let lsn = Lsn::new(1, 100 + (i as u32) * 10);
7815            let result = tree.insert(key.clone(), data, lsn);
7816            assert!(result.is_ok());
7817            assert!(result.unwrap()); // All should be new inserts
7818        }
7819
7820        // Verify we can search for each
7821        for key in &keys {
7822            let search_result = tree.search(key);
7823            assert!(search_result.is_some());
7824        }
7825    }
7826
7827    #[test]
7828    fn test_insert_duplicate_key() {
7829        let tree = Tree::new(1, 128);
7830        let key = b"duplicate".to_vec();
7831        let data1 = b"first".to_vec();
7832        let data2 = b"second".to_vec();
7833        let lsn1 = Lsn::new(1, 100);
7834        let lsn2 = Lsn::new(1, 200);
7835
7836        // First insert
7837        let result1 = tree.insert(key.clone(), data1, lsn1);
7838        assert!(result1.is_ok());
7839        assert!(result1.unwrap()); // New insert
7840
7841        // Second insert with same key - should be update
7842        let result2 = tree.insert(key, data2, lsn2);
7843        assert!(result2.is_ok());
7844        assert!(!result2.unwrap()); // Update, not new insert
7845    }
7846
7847    #[test]
7848    fn test_search_empty_tree() {
7849        let tree = Tree::new(1, 128);
7850        let key = b"noexist".to_vec();
7851
7852        let result = tree.search(&key);
7853        assert!(result.is_none());
7854    }
7855
7856    #[test]
7857    fn test_first_and_last_node() {
7858        let tree = Tree::new(1, 128);
7859
7860        // Empty tree
7861        assert!(tree.get_first_node().is_none());
7862        assert!(tree.get_last_node().is_none());
7863
7864        // Insert some keys
7865        let keys = [b"a".to_vec(), b"b".to_vec(), b"c".to_vec()];
7866        for (i, key) in keys.iter().enumerate() {
7867            let data = format!("data{}", i).into_bytes();
7868            let lsn = Lsn::new(1, 100 + (i as u32) * 10);
7869            tree.insert(key.clone(), data, lsn).unwrap();
7870        }
7871
7872        // Now should have first and last
7873        let first = tree.get_first_node();
7874        assert!(first.is_some());
7875        assert_eq!(first.unwrap().index, 0);
7876
7877        let last = tree.get_last_node();
7878        assert!(last.is_some());
7879        assert_eq!(last.unwrap().index, 2);
7880    }
7881
7882    #[test]
7883    fn test_node_id_generation() {
7884        let id1 = generate_node_id();
7885        let id2 = generate_node_id();
7886        let id3 = generate_node_id();
7887
7888        assert!(id2 > id1);
7889        assert!(id3 > id2);
7890    }
7891
7892    #[test]
7893    fn test_tree_node_is_bin() {
7894        let bin = TreeNode::Bottom(BinStub {
7895            node_id: 1,
7896            level: BIN_LEVEL,
7897            entries: vec![],
7898            key_prefix: Vec::new(),
7899            dirty: false,
7900            is_delta: false,
7901            last_full_lsn: NULL_LSN,
7902            last_delta_lsn: NULL_LSN,
7903            generation: 0,
7904            parent: None,
7905            expiration_in_hours: true,
7906            cursor_count: 0,
7907            prohibit_next_delta: false,
7908            lsn_rep: LsnRep::Empty,
7909            keys: KeyRep::new(),
7910            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
7911        });
7912        assert!(bin.is_bin());
7913        assert_eq!(bin.level(), BIN_LEVEL);
7914
7915        let internal = TreeNode::Internal(InNodeStub {
7916            node_id: 2,
7917            level: MAIN_LEVEL + 2,
7918            entries: vec![],
7919            targets: TargetRep::None,
7920            dirty: false,
7921            generation: 0,
7922            parent: None,
7923            lsn_rep: LsnRep::Empty,
7924        });
7925        assert!(!internal.is_bin());
7926        assert_eq!(internal.level(), MAIN_LEVEL + 2);
7927    }
7928
7929    #[test]
7930    fn test_find_entry() {
7931        let mut entries = vec![];
7932        let mut keys = vec![];
7933        for i in 0..5 {
7934            entries.push(BinEntry {
7935                data: Some(vec![]),
7936                known_deleted: false,
7937                dirty: false,
7938                expiration_time: 0,
7939            });
7940            keys.push(format!("key{}", i).into_bytes());
7941        }
7942
7943        let bin = TreeNode::Bottom(BinStub {
7944            node_id: 1,
7945            level: BIN_LEVEL,
7946            entries,
7947            key_prefix: Vec::new(),
7948            dirty: false,
7949            is_delta: false,
7950            last_full_lsn: NULL_LSN,
7951            last_delta_lsn: NULL_LSN,
7952            generation: 0,
7953            parent: None,
7954            expiration_in_hours: true,
7955            cursor_count: 0,
7956            prohibit_next_delta: false,
7957            lsn_rep: LsnRep::Empty,
7958            keys: KeyRep::from_keys(keys),
7959            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
7960        });
7961
7962        // Search for existing key
7963        let result = bin.find_entry(b"key2", false, true);
7964        assert_eq!(result & 0xFFFF, 2);
7965        assert_ne!(result & EXACT_MATCH, 0);
7966
7967        // Search for non-existing key with exact=false
7968        let result = bin.find_entry(b"key15", false, false);
7969        assert_eq!(result & 0xFFFF, 2); // Would go between key1 and key2
7970        assert_eq!(result & EXACT_MATCH, 0);
7971    }
7972
7973    #[test]
7974    fn test_insert_until_full() {
7975        // With splits implemented, inserting beyond max_entries_per_node must
7976        // succeed (the tree splits proactively rather than returning an error).
7977        let tree = Tree::new(1, 3); // Small max to exercise splits
7978
7979        // Insert up to max
7980        for i in 0..3 {
7981            let key = format!("key{}", i).into_bytes();
7982            let data = format!("data{}", i).into_bytes();
7983            let lsn = Lsn::new(1, 100 + i);
7984            let result = tree.insert(key, data, lsn);
7985            assert!(result.is_ok(), "insert {} should succeed", i);
7986        }
7987
7988        // The 4th insert triggers a split and must also succeed.
7989        let key = b"key3".to_vec();
7990        let data = b"data3".to_vec();
7991        let lsn = Lsn::new(1, 103);
7992        let result = tree.insert(key.clone(), data, lsn);
7993        assert!(
7994            result.is_ok(),
7995            "insert after full should trigger split and succeed"
7996        );
7997        assert!(result.unwrap(), "should be a new insert");
7998
7999        // The inserted key must be findable after the split.
8000        let sr = tree.search(&key);
8001        assert!(sr.is_some(), "key3 must be searchable after split");
8002        assert!(sr.unwrap().exact_parent_found, "key3 must be found exactly");
8003    }
8004
8005    #[test]
8006    fn test_memory_counter_balanced_on_insert_delete_f8() {
8007        use std::sync::Arc;
8008        use std::sync::atomic::{AtomicI64, Ordering};
8009        // F8 regression: insert accounts key+data+48; delete must subtract the
8010        // SAME, so an insert+delete of the same record returns the counter to
8011        // its starting value (previously delete omitted data_len -> the counter
8012        // leaked data_len per delete, biasing the evictor over-budget view).
8013        let mut tree = Tree::new(1, 16);
8014        let counter = Arc::new(AtomicI64::new(0));
8015        tree.set_memory_counter(Arc::clone(&counter));
8016
8017        let key = b"a-key".to_vec();
8018        let data = vec![0u8; 200]; // non-trivial data length
8019        tree.insert(key.clone(), data.clone(), Lsn::new(0, 10)).unwrap();
8020        let after_insert = counter.load(Ordering::Relaxed);
8021        assert!(after_insert > 0, "insert must increase the counter");
8022        assert_eq!(
8023            after_insert,
8024            (key.len() + data.len() + BIN_ENTRY_OVERHEAD) as i64,
8025            "insert accounts key + data + per-slot BinEntry overhead"
8026        );
8027
8028        let deleted = tree.delete(&key);
8029        assert!(deleted);
8030        assert_eq!(
8031            counter.load(Ordering::Relaxed),
8032            0,
8033            "F8: delete must subtract key + data + BIN_ENTRY_OVERHEAD, returning the counter              to its pre-insert value (no data_len leak)"
8034        );
8035    }
8036
8037    /// EV-13 (pass-post): a full-node detach must ACTUALLY drop the child
8038    /// `Arc` from the parent IN, not merely credit bytes.  Before the fix the
8039    /// evictor credited `node_size_fn(node_id)` and removed the node from the
8040    /// LRU list, but the parent's `InEntry.child` still held a strong `Arc`,
8041    /// so the node was never freed (phantom free) and the budget over-credited.
8042    ///
8043    /// This test proves: after `detach_node_by_id` the held child `Arc` is the
8044    /// LAST strong reference (strong_count == 1), the parent slot's `child` is
8045    /// `None`, and the returned bytes equal the node's measured heap size.
8046    ///
8047    /// JE ref: `IN.detachNode` (`setTarget(idx, null)`) / `Evictor.evict`.
8048    #[test]
8049    fn test_ev13_detach_actually_frees_child() {
8050        // Tiny fanout forces a root split so we get a real IN parent with BIN
8051        // children that the evictor would target.
8052        let tree = Tree::new(7, 4);
8053        for i in 0u8..12 {
8054            tree.insert(
8055                vec![b'a' + i],
8056                vec![i; 8],
8057                Lsn::new(1, u32::from(i) + 1),
8058            )
8059            .unwrap();
8060        }
8061
8062        // Find a BIN child of the root IN (the eviction target) + its parent.
8063        let root = tree.get_root().expect("tree must have a root");
8064        let (parent_arc, child_idx, bin_id, expected_bytes) = {
8065            let rg = root.read();
8066            let TreeNode::Internal(n) = &*rg else {
8067                panic!("root must be an IN after split");
8068            };
8069            // Pick the first slot whose child is a resident BIN.
8070            let (idx, child) = n
8071                .first_resident_child()
8072                .expect("root must have a resident child");
8073            let (id, bytes) = {
8074                let cg = child.read();
8075                (
8076                    match &*cg {
8077                        TreeNode::Bottom(b) => b.node_id,
8078                        TreeNode::Internal(n2) => n2.node_id,
8079                    },
8080                    cg.budgeted_memory_size(),
8081                )
8082            };
8083            (Arc::clone(&root), idx, id, bytes)
8084        };
8085
8086        // Hold an external strong reference to the child so we can observe its
8087        // strong_count drop when detach releases the parent's reference.
8088        let child_arc = {
8089            let pg = parent_arc.read();
8090            let TreeNode::Internal(n) = &*pg else { unreachable!() };
8091            Arc::clone(n.child_ref(child_idx).unwrap())
8092        };
8093        // Two strong refs now: the parent slot + our test handle.
8094        assert_eq!(
8095            Arc::strong_count(&child_arc),
8096            2,
8097            "precondition: parent slot + test handle hold the child"
8098        );
8099
8100        let freed = tree.detach_node_by_id(bin_id);
8101
8102        // 1. Bytes credited equal the measured heap size (no phantom credit).
8103        assert_eq!(
8104            freed, expected_bytes,
8105            "detach must credit the node's real measured heap size"
8106        );
8107        // 2. The parent slot's child is now None (JE setTarget(idx, null)).
8108        {
8109            let pg = parent_arc.read();
8110            let TreeNode::Internal(n) = &*pg else { unreachable!() };
8111            assert!(
8112                n.child_is_none(child_idx),
8113                "EV-13: parent slot must be detached (child == None)"
8114            );
8115            // The slot itself (key + LSN) is retained for re-fetch.
8116            assert!(
8117                !n.get_lsn(child_idx).is_null(),
8118                "detach keeps the slot LSN so the node can be re-fetched"
8119            );
8120        }
8121        // 3. Our handle is now the ONLY strong reference -> the parent really
8122        //    dropped its Arc; the node is freed when we drop `child_arc`.
8123        //    Before EV-13 this would be 2 (parent still held it) = phantom free.
8124        assert_eq!(
8125            Arc::strong_count(&child_arc),
8126            1,
8127            "EV-13: detach must drop the parent's strong Arc (no phantom free)"
8128        );
8129    }
8130
8131    /// EV-13: detach must NOT decrement the memory counter itself (the evictor
8132    /// owns that bookkeeping via `Arbiter::release_memory`).  A double credit
8133    /// would drive `cache_usage` below reality.
8134    #[test]
8135    fn test_ev13_detach_does_not_touch_counter() {
8136        use std::sync::atomic::{AtomicI64, Ordering};
8137        let mut tree = Tree::new(8, 4);
8138        let counter = Arc::new(AtomicI64::new(0));
8139        tree.set_memory_counter(Arc::clone(&counter));
8140        for i in 0u8..12 {
8141            tree.insert(
8142                vec![b'a' + i],
8143                vec![i; 8],
8144                Lsn::new(1, u32::from(i) + 1),
8145            )
8146            .unwrap();
8147        }
8148        let before = counter.load(Ordering::Relaxed);
8149
8150        // Grab a BIN child id.
8151        let root = tree.get_root().unwrap();
8152        let bin_id = {
8153            let rg = root.read();
8154            let TreeNode::Internal(n) = &*rg else { unreachable!() };
8155            let child = n
8156                .resident_children()
8157                .into_iter()
8158                .next()
8159                .expect("resident child");
8160            match &*child.read() {
8161                TreeNode::Bottom(b) => b.node_id,
8162                TreeNode::Internal(n2) => n2.node_id,
8163            }
8164        };
8165
8166        let freed = tree.detach_node_by_id(bin_id);
8167        assert!(freed > 0, "detach must free a resident child");
8168        assert_eq!(
8169            counter.load(Ordering::Relaxed),
8170            before,
8171            "EV-13: detach must not change the counter (evictor credits once)"
8172        );
8173    }
8174
8175    /// EV-13: detaching the root or an unknown id is a no-op returning 0.
8176    #[test]
8177    fn test_ev13_detach_root_or_missing_is_noop() {
8178        let tree = Tree::new(9, 4);
8179        for i in 0u8..12 {
8180            tree.insert(
8181                vec![b'a' + i],
8182                vec![i; 8],
8183                Lsn::new(1, u32::from(i) + 1),
8184            )
8185            .unwrap();
8186        }
8187        let root_id = {
8188            let rg = tree.get_root().unwrap();
8189            let g = rg.read();
8190            match &*g {
8191                TreeNode::Internal(n) => n.node_id,
8192                TreeNode::Bottom(b) => b.node_id,
8193            }
8194        };
8195        assert_eq!(
8196            tree.detach_node_by_id(root_id),
8197            0,
8198            "root has no parent IN -> detach is a no-op"
8199        );
8200        assert_eq!(
8201            tree.detach_node_by_id(u64::MAX),
8202            0,
8203            "unknown node id -> detach is a no-op"
8204        );
8205    }
8206
8207    /// DBI-23 (pass-post): the live `memory_counter` must APPROXIMATE the real
8208    /// in-memory heap of the tree, not the old `key + data + 48` lower bound.
8209    ///
8210    /// JE keeps `inMemorySize` (`IN.getBudgetedMemorySize`) in lock-step with
8211    /// the per-node `computeMemorySize`; the over-budget arbiter sees the real
8212    /// figure so eviction fires at the right time.  The previous Noxu live
8213    /// path undercounted each BIN slot (48 vs the 64-byte `BinEntry` struct)
8214    /// and never accounted the node-struct fixed overhead, so the counter ran
8215    /// below real heap and the evictor under-fired.
8216    ///
8217    /// We assert the live counter is within tolerance of
8218    /// `total_budgeted_memory` (the authoritative walk-and-sum oracle).  The
8219    /// only gap is the per-node fixed struct overhead (BinStub/InNodeStub),
8220    /// which is a small fraction for non-trivial entries — the fix closes the
8221    /// dominant per-slot gap.
8222    #[test]
8223    fn test_dbi23_live_counter_approximates_real_heap() {
8224        use std::sync::atomic::{AtomicI64, Ordering};
8225        let mut tree = Tree::new(42, 32);
8226        let counter = Arc::new(AtomicI64::new(0));
8227        tree.set_memory_counter(Arc::clone(&counter));
8228
8229        // Insert N entries with realistic key+data sizes.
8230        let n = 400u32;
8231        for i in 0..n {
8232            let key = format!("key-{i:08}").into_bytes(); // 12 bytes
8233            let data = vec![0u8; 64]; // 64 bytes
8234            tree.insert(key, data, Lsn::new(1, i + 1)).unwrap();
8235        }
8236
8237        let live = counter.load(Ordering::Relaxed) as u64;
8238        let real = tree.total_budgeted_memory();
8239
8240        // The live counter must reflect the per-slot cost AFTER the T-2/T-3
8241        // compactions hoisted the per-slot key/LSN out of `BinEntry` into the
8242        // node-level reps.  The per-slot live charge is now
8243        // `key + data + size_of::<BinEntry>() + 4` (the packed LSN slot); the
8244        // dominant data+key bytes are still charged in full.  Assert the live
8245        // counter is at least the data-and-fixed portion (a stable floor that
8246        // does NOT assume the pre-compaction 64-byte slot).
8247        let new_lower_bound: u64 = (0..n)
8248            .map(|i| {
8249                let key_len = format!("key-{i:08}").len();
8250                (key_len + 64 + BIN_ENTRY_OVERHEAD) as u64
8251            })
8252            .sum();
8253
8254        assert!(
8255            live >= new_lower_bound,
8256            "DBI-23: live counter ({live}) must be >= the per-slot-correct \
8257             lower bound ({new_lower_bound})"
8258        );
8259
8260        // Within tolerance of real heap (the residual gap is the per-node
8261        // fixed struct overhead, intentionally not tracked incrementally).
8262        let lower = real * 80 / 100;
8263        assert!(
8264            live >= lower && live <= real,
8265            "DBI-23: live counter ({live}) must approximate real heap ({real}) \
8266             within tolerance [{lower}, {real}]"
8267        );
8268    }
8269
8270    #[test]
8271    fn test_delete_existing_key() {
8272        let tree = Tree::new(1, 128);
8273        let key = b"remove_me".to_vec();
8274        tree.insert(key.clone(), b"val".to_vec(), Lsn::new(1, 10)).unwrap();
8275        assert!(tree.delete(&key));
8276
8277        // After deletion the BIN is empty, so delete returns true the first
8278        // time and false the second time.
8279        assert!(!tree.delete(&key));
8280    }
8281
8282    #[test]
8283    fn test_delete_nonexistent_key() {
8284        let tree = Tree::new(1, 128);
8285        tree.insert(b"a".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
8286
8287        assert!(!tree.delete(b"zzz"));
8288    }
8289
8290    #[test]
8291    fn test_delete_empty_tree() {
8292        let tree = Tree::new(1, 128);
8293        assert!(!tree.delete(b"nothing"));
8294    }
8295
8296    #[test]
8297    fn test_delete_all_entries_makes_bin_empty() {
8298        let tree = Tree::new(1, 128);
8299        tree.insert(b"x".to_vec(), b"1".to_vec(), Lsn::new(1, 1)).unwrap();
8300        tree.insert(b"y".to_vec(), b"2".to_vec(), Lsn::new(1, 2)).unwrap();
8301
8302        assert!(tree.delete(b"x"));
8303        assert!(tree.delete(b"y"));
8304
8305        // Tree still has a root (empty BIN), so is_empty() returns false.
8306        assert!(!tree.is_empty());
8307        // get_first_node should return None for an empty BIN.
8308        assert!(tree.get_first_node().is_none());
8309    }
8310
8311    #[test]
8312    fn test_set_root_and_get_root() {
8313        let tree = Tree::new(1, 128);
8314        assert!(tree.get_root().is_none());
8315
8316        let bin = TreeNode::Bottom(BinStub {
8317            node_id: generate_node_id(),
8318            level: BIN_LEVEL,
8319            entries: vec![],
8320            key_prefix: Vec::new(),
8321            dirty: false,
8322            is_delta: false,
8323            last_full_lsn: NULL_LSN,
8324            last_delta_lsn: NULL_LSN,
8325            generation: 0,
8326            parent: None,
8327            expiration_in_hours: true,
8328            cursor_count: 0,
8329            prohibit_next_delta: false,
8330            lsn_rep: LsnRep::Empty,
8331            keys: KeyRep::new(),
8332            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8333        });
8334        tree.set_root(bin);
8335        assert!(tree.get_root().is_some());
8336    }
8337
8338    // ========================================================================
8339    // Split / multi-level insert tests  (new)
8340    // ========================================================================
8341
8342    /// inserting enough keys to fill the root IN causes
8343    /// the root IN itself to split, resulting in a tree with 3 or more levels.
8344    ///
8345    /// With max_entries_per_node = 4:
8346    ///   - Each BIN holds 4 entries before it is split.
8347    ///   - The root IN at level 2 holds up to 4 BIN children.
8348    ///   - Filling those 4 BINs (16 entries) and adding a 17th forces the
8349    ///     root IN to split, creating a level-3 root.
8350    #[test]
8351    fn test_insert_forces_root_split() {
8352        let tree = Tree::new(1, 4);
8353
8354        // 17 inserts with fanout 4 forces the root IN to split.
8355        for i in 0u32..20 {
8356            let key = format!("key{:04}", i).into_bytes();
8357            let data = format!("data{}", i).into_bytes();
8358            let lsn = Lsn::new(1, 100 + i);
8359            let r = tree.insert(key, data, lsn);
8360            assert!(r.is_ok(), "insert {} must succeed", i);
8361        }
8362
8363        // At least one root split must have occurred.
8364        assert!(
8365            tree.get_root_splits() > 0,
8366            "expected at least one root split after 20 inserts with fanout 4"
8367        );
8368
8369        // The root level must be > level-2 (i.e., the tree has grown to 3+ levels).
8370        let root_arc = tree.get_root().as_ref().unwrap().clone();
8371        let root_level = root_arc.read().level();
8372        let level_2 = MAIN_LEVEL | 2;
8373        assert!(
8374            root_level > level_2,
8375            "root level {} must be > level-2 after root split",
8376            root_level
8377        );
8378    }
8379
8380    /// Inserting 1000 keys in sorted order and verifying all are searchable.
8381    #[test]
8382    fn test_insert_many_keys() {
8383        let tree = Tree::new(1, 8);
8384        let n = 1000u32;
8385
8386        for i in 0..n {
8387            let key = format!("key{:08}", i).into_bytes();
8388            let data = format!("data{}", i).into_bytes();
8389            let lsn = Lsn::new(1, i);
8390            let r = tree.insert(key, data, lsn);
8391            assert!(r.is_ok(), "insert {} must succeed", i);
8392        }
8393
8394        // All keys must be findable.
8395        for i in 0..n {
8396            let key = format!("key{:08}", i).into_bytes();
8397            let sr = tree.search(&key);
8398            assert!(
8399                sr.is_some() && sr.unwrap().exact_parent_found,
8400                "key{:08} must be found after bulk insert",
8401                i
8402            );
8403        }
8404    }
8405
8406    /// Inserting 500 keys in pseudo-random (reverse) order and verifying all
8407    /// are searchable.
8408    #[test]
8409    fn test_insert_random_keys() {
8410        let tree = Tree::new(1, 8);
8411        let n = 500u32;
8412
8413        // Insert in reverse order as a simple non-sorted sequence.
8414        for i in (0..n).rev() {
8415            let key = format!("rkey{:08}", i).into_bytes();
8416            let data = format!("data{}", i).into_bytes();
8417            let lsn = Lsn::new(1, i);
8418            let r = tree.insert(key, data, lsn);
8419            assert!(r.is_ok(), "insert {} must succeed", i);
8420        }
8421
8422        for i in 0..n {
8423            let key = format!("rkey{:08}", i).into_bytes();
8424            let sr = tree.search(&key);
8425            assert!(
8426                sr.is_some() && sr.unwrap().exact_parent_found,
8427                "rkey{:08} must be found",
8428                i
8429            );
8430        }
8431    }
8432
8433    /// After any number of splits, every key inserted must still be findable.
8434    ///
8435    #[test]
8436    fn test_split_preserves_all_keys() {
8437        // Tiny fanout to maximise split frequency.
8438        let tree = Tree::new(1, 3);
8439        let n = 60u32;
8440
8441        let mut keys: Vec<Vec<u8>> = Vec::new();
8442        for i in 0..n {
8443            let key = format!("sk{:04}", i).into_bytes();
8444            keys.push(key.clone());
8445            let data = format!("d{}", i).into_bytes();
8446            let lsn = Lsn::new(1, i);
8447            let r = tree.insert(key, data, lsn);
8448            assert!(r.is_ok(), "insert {} must not fail", i);
8449        }
8450
8451        // After all inserts (and all the splits they induced), every key must
8452        // still be findable in the tree.
8453        for key in &keys {
8454            let sr = tree.search(key);
8455            assert!(
8456                sr.is_some() && sr.unwrap().exact_parent_found,
8457                "key {:?} must survive all splits",
8458                std::str::from_utf8(key).unwrap_or("?")
8459            );
8460        }
8461    }
8462
8463    /// The tree level (depth) must grow as keys are inserted and splits occur.
8464    #[test]
8465    fn test_tree_height_grows() {
8466        let tree = Tree::new(1, 4);
8467
8468        // With fanout 4, one level-2 root IN can hold 4 children.  After enough
8469        // inserts the root itself will split and a level-3 node will appear.
8470        // Insert enough keys to force the root to split at least once.
8471        let n = 40u32;
8472        for i in 0..n {
8473            let key = format!("hk{:08}", i).into_bytes();
8474            let data = format!("d{}", i).into_bytes();
8475            let lsn = Lsn::new(1, i);
8476            tree.insert(key, data, lsn).unwrap();
8477        }
8478
8479        // At least one root split must have occurred.
8480        assert!(
8481            tree.get_root_splits() > 0,
8482            "expected root to have split at least once for {} keys with fanout 4",
8483            n
8484        );
8485
8486        // The root level must be > level-2 (i.e., the tree has grown past two levels).
8487        let root_arc = tree.get_root().as_ref().unwrap().clone();
8488        let root_level = root_arc.read().level();
8489        let level_2 = MAIN_LEVEL | 2;
8490        assert!(
8491            root_level > level_2,
8492            "root level {} must be > {} after enough inserts",
8493            root_level,
8494            level_2
8495        );
8496    }
8497
8498    #[test]
8499    fn test_find_entry_on_internal_node() {
8500        let mut entries = vec![];
8501        for i in 0..4 {
8502            entries.push(InEntry { key: format!("k{}", i).into_bytes() });
8503        }
8504        let internal = TreeNode::Internal(InNodeStub {
8505            node_id: 1,
8506            level: MAIN_LEVEL + 2,
8507            entries,
8508            targets: TargetRep::None,
8509            dirty: false,
8510            generation: 0,
8511            parent: None,
8512            lsn_rep: LsnRep::Empty,
8513        });
8514
8515        // Exact match
8516        let r = internal.find_entry(b"k2", false, true);
8517        assert_ne!(r & EXACT_MATCH, 0);
8518        assert_eq!(r & 0xFFFF, 2);
8519
8520        // No exact match with exact=true
8521        let r = internal.find_entry(b"kx", false, true);
8522        assert_eq!(r, -1);
8523    }
8524
8525    // St-H5: non-exact `find_entry` on an Internal node must return the FLOOR
8526    // child slot (largest entry ≤ key), not the insertion point. Entries are
8527    // k0,k1,k2,k3; slot 0 is the leftmost child.
8528    #[test]
8529    fn test_find_entry_internal_nonexact_returns_floor() {
8530        let mut entries = vec![];
8531        for i in 0..4 {
8532            entries.push(InEntry { key: format!("k{}", i).into_bytes() });
8533        }
8534        let internal = TreeNode::Internal(InNodeStub {
8535            node_id: 1,
8536            level: MAIN_LEVEL + 2,
8537            entries,
8538            targets: TargetRep::None,
8539            dirty: false,
8540            generation: 0,
8541            parent: None,
8542            lsn_rep: LsnRep::Empty,
8543        });
8544
8545        // Key below every separator floors to slot 0 (leftmost child).
8546        assert_eq!(internal.find_entry(b"a", false, false) & 0xFFFF, 0);
8547        // Between k1 and k2 floors to k1 (slot 1).
8548        assert_eq!(internal.find_entry(b"k1x", false, false) & 0xFFFF, 1);
8549        // Above every separator floors to the last slot (k3 = slot 3).
8550        assert_eq!(internal.find_entry(b"zzz", false, false) & 0xFFFF, 3);
8551        // Exact match still reported as the exact slot.
8552        let r = internal.find_entry(b"k2", false, false);
8553        assert_ne!(r & EXACT_MATCH, 0);
8554        assert_eq!(r & 0xFFFF, 2);
8555    }
8556
8557    // ========================================================================
8558    // New tests: dirty tracking, generation, parent pointers, log size, stats
8559    // ========================================================================
8560
8561    /// After inserting into a tree, the BIN (and root IN) must be dirty.
8562    ///
8563    /// The: Tree.insertLN() calls bin.setDirty(true) after each insert.
8564    #[test]
8565    fn test_insert_marks_bin_dirty() {
8566        let tree = Tree::new(1, 128);
8567        tree.insert(b"key1".to_vec(), b"val1".to_vec(), Lsn::new(1, 1))
8568            .unwrap();
8569
8570        let root_arc = tree.get_root().as_ref().unwrap().clone();
8571        // root is an upper IN — its slot 0 child is the BIN.
8572        let bin_arc = {
8573            let g = root_arc.read();
8574            match &*g {
8575                TreeNode::Internal(n) => n.get_child(0).unwrap(),
8576                _ => panic!("expected Internal root"),
8577            }
8578        };
8579
8580        let bin_dirty = bin_arc.read().is_dirty();
8581        assert!(bin_dirty, "BIN must be dirty after insert");
8582    }
8583
8584    /// Updating an existing key keeps the BIN dirty.
8585    #[test]
8586    fn test_update_keeps_bin_dirty() {
8587        let tree = Tree::new(1, 128);
8588        tree.insert(b"k".to_vec(), b"v1".to_vec(), Lsn::new(1, 1)).unwrap();
8589        // second insert is an update
8590        tree.insert(b"k".to_vec(), b"v2".to_vec(), Lsn::new(1, 2)).unwrap();
8591
8592        let root_arc = tree.get_root().as_ref().unwrap().clone();
8593        let bin_arc = {
8594            let g = root_arc.read();
8595            match &*g {
8596                TreeNode::Internal(n) => n.get_child(0).unwrap(),
8597                _ => panic!("expected Internal root"),
8598            }
8599        };
8600
8601        assert!(bin_arc.read().is_dirty(), "BIN must be dirty after update");
8602    }
8603
8604    /// After deleting a key the BIN must be dirty.
8605    #[test]
8606    fn test_delete_marks_bin_dirty() {
8607        let tree = Tree::new(1, 128);
8608        tree.insert(b"del".to_vec(), b"val".to_vec(), Lsn::new(1, 1)).unwrap();
8609
8610        // Manually clear dirty flag to verify delete re-sets it.
8611        {
8612            let root_arc = tree.get_root().as_ref().unwrap().clone();
8613            let bin_arc = {
8614                let g = root_arc.read();
8615                match &*g {
8616                    TreeNode::Internal(n) => n.get_child(0).unwrap(),
8617                    _ => panic!("expected Internal root"),
8618                }
8619            };
8620            bin_arc.write().set_dirty(false);
8621            assert!(!bin_arc.read().is_dirty());
8622        }
8623
8624        tree.delete(b"del");
8625
8626        let root_arc = tree.get_root().as_ref().unwrap().clone();
8627        let bin_arc = {
8628            let g = root_arc.read();
8629            match &*g {
8630                TreeNode::Internal(n) => n.get_child(0).unwrap(),
8631                _ => panic!("expected Internal root"),
8632            }
8633        };
8634        assert!(bin_arc.read().is_dirty(), "BIN must be dirty after delete");
8635    }
8636
8637    /// BIN's parent pointer must point to the root IN.
8638    #[test]
8639    fn test_bin_parent_pointer_set_on_initial_insert() {
8640        let tree = Tree::new(1, 128);
8641        tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
8642
8643        let root_arc = tree.get_root().as_ref().unwrap().clone();
8644        let bin_arc = {
8645            let g = root_arc.read();
8646            match &*g {
8647                TreeNode::Internal(n) => n.get_child(0).unwrap(),
8648                _ => panic!("expected Internal root"),
8649            }
8650        };
8651
8652        let parent_weak = bin_arc.read().get_parent();
8653        assert!(parent_weak.is_some(), "BIN must have a parent pointer");
8654
8655        // Upgrading the weak pointer must give us the root arc.
8656        let parent_arc = parent_weak.unwrap().upgrade().unwrap();
8657        assert!(
8658            Arc::ptr_eq(&parent_arc, &root_arc),
8659            "BIN parent must be the root IN"
8660        );
8661    }
8662
8663    /// set_dirty / is_dirty round-trip on both variants.
8664    #[test]
8665    fn test_dirty_flag_roundtrip() {
8666        let mut bin_node = TreeNode::Bottom(BinStub {
8667            node_id: 1,
8668            level: BIN_LEVEL,
8669            entries: vec![],
8670            key_prefix: Vec::new(),
8671            dirty: false,
8672            is_delta: false,
8673            last_full_lsn: NULL_LSN,
8674            last_delta_lsn: NULL_LSN,
8675            generation: 0,
8676            parent: None,
8677            expiration_in_hours: true,
8678            cursor_count: 0,
8679            prohibit_next_delta: false,
8680            lsn_rep: LsnRep::Empty,
8681            keys: KeyRep::new(),
8682            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8683        });
8684        assert!(!bin_node.is_dirty());
8685        bin_node.set_dirty(true);
8686        assert!(bin_node.is_dirty());
8687        bin_node.set_dirty(false);
8688        assert!(!bin_node.is_dirty());
8689
8690        let mut in_node = TreeNode::Internal(InNodeStub {
8691            node_id: 2,
8692            level: MAIN_LEVEL | 2,
8693            entries: vec![],
8694            targets: TargetRep::None,
8695            dirty: false,
8696            generation: 0,
8697            parent: None,
8698            lsn_rep: LsnRep::Empty,
8699        });
8700        assert!(!in_node.is_dirty());
8701        in_node.set_dirty(true);
8702        assert!(in_node.is_dirty());
8703    }
8704
8705    /// set_generation / get_generation round-trip on both variants.
8706    #[test]
8707    fn test_generation_roundtrip() {
8708        let mut bin_node = TreeNode::Bottom(BinStub {
8709            node_id: 1,
8710            level: BIN_LEVEL,
8711            entries: vec![],
8712            key_prefix: Vec::new(),
8713            dirty: false,
8714            is_delta: false,
8715            last_full_lsn: NULL_LSN,
8716            last_delta_lsn: NULL_LSN,
8717            generation: 0,
8718            parent: None,
8719            expiration_in_hours: true,
8720            cursor_count: 0,
8721            prohibit_next_delta: false,
8722            lsn_rep: LsnRep::Empty,
8723            keys: KeyRep::new(),
8724            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8725        });
8726        assert_eq!(bin_node.get_generation(), 0);
8727        bin_node.set_generation(42);
8728        assert_eq!(bin_node.get_generation(), 42);
8729
8730        let mut in_node = TreeNode::Internal(InNodeStub {
8731            node_id: 2,
8732            level: MAIN_LEVEL | 2,
8733            entries: vec![],
8734            targets: TargetRep::None,
8735            dirty: false,
8736            generation: 0,
8737            parent: None,
8738            lsn_rep: LsnRep::Empty,
8739        });
8740        in_node.set_generation(99);
8741        assert_eq!(in_node.get_generation(), 99);
8742    }
8743
8744    /// log_size() must be consistent with write_to_bytes() length.
8745    #[test]
8746    fn test_log_size_matches_bytes_len() {
8747        // BIN stub with some entries.
8748        let bin_node = TreeNode::Bottom(BinStub {
8749            node_id: 7,
8750            level: BIN_LEVEL,
8751            entries: vec![
8752                BinEntry {
8753                    data: Some(b"d1".to_vec()),
8754                    known_deleted: false,
8755                    dirty: false,
8756                    expiration_time: 0,
8757                },
8758                BinEntry {
8759                    data: None,
8760                    known_deleted: false,
8761                    dirty: false,
8762                    expiration_time: 0,
8763                },
8764            ],
8765            key_prefix: Vec::new(),
8766            dirty: true,
8767            is_delta: false,
8768            last_full_lsn: NULL_LSN,
8769            last_delta_lsn: NULL_LSN,
8770            generation: 5,
8771            parent: None,
8772            expiration_in_hours: true,
8773            cursor_count: 0,
8774            prohibit_next_delta: false,
8775            lsn_rep: LsnRep::Empty,
8776            keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
8777            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8778        });
8779        assert_eq!(bin_node.log_size(), bin_node.write_to_bytes().len());
8780
8781        // IN stub with some entries.
8782        let in_node = TreeNode::Internal(InNodeStub {
8783            node_id: 8,
8784            level: MAIN_LEVEL | 2,
8785            entries: vec![
8786                InEntry { key: vec![] },
8787                InEntry { key: b"mid".to_vec() },
8788            ],
8789            targets: TargetRep::None,
8790            dirty: false,
8791            generation: 0,
8792            parent: None,
8793            lsn_rep: LsnRep::Empty,
8794        });
8795        assert_eq!(in_node.log_size(), in_node.write_to_bytes().len());
8796    }
8797
8798    /// write_to_bytes() output contains the node_id and dirty flag.
8799    #[test]
8800    fn test_write_to_bytes_encodes_node_id_and_dirty() {
8801        let node = TreeNode::Bottom(BinStub {
8802            node_id: 0xDEAD_BEEF_0000_0001,
8803            level: BIN_LEVEL,
8804            entries: vec![],
8805            key_prefix: Vec::new(),
8806            dirty: true,
8807            is_delta: false,
8808            last_full_lsn: NULL_LSN,
8809            last_delta_lsn: NULL_LSN,
8810            generation: 0,
8811            parent: None,
8812            expiration_in_hours: true,
8813            cursor_count: 0,
8814            prohibit_next_delta: false,
8815            lsn_rep: LsnRep::Empty,
8816            keys: KeyRep::new(),
8817            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8818        });
8819        let bytes = node.write_to_bytes();
8820        // First 8 bytes = node_id big-endian.
8821        let id_bytes = &bytes[0..8];
8822        assert_eq!(id_bytes, 0xDEAD_BEEF_0000_0001u64.to_be_bytes());
8823        // Byte at offset 16 (after node_id[8] + level[4] + n_entries[4]) = dirty flag.
8824        assert_eq!(bytes[16], 1u8, "dirty flag must be 1");
8825    }
8826
8827    /// log_size() grows as entries are added.
8828    #[test]
8829    fn test_log_size_grows_with_entries() {
8830        let empty = TreeNode::Bottom(BinStub {
8831            node_id: 1,
8832            level: BIN_LEVEL,
8833            entries: vec![],
8834            key_prefix: Vec::new(),
8835            dirty: false,
8836            is_delta: false,
8837            last_full_lsn: NULL_LSN,
8838            last_delta_lsn: NULL_LSN,
8839            generation: 0,
8840            parent: None,
8841            expiration_in_hours: true,
8842            cursor_count: 0,
8843            prohibit_next_delta: false,
8844            lsn_rep: LsnRep::Empty,
8845            keys: KeyRep::new(),
8846            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8847        });
8848        let with_entry = TreeNode::Bottom(BinStub {
8849            node_id: 2,
8850            level: BIN_LEVEL,
8851            entries: vec![BinEntry {
8852                data: None,
8853                known_deleted: false,
8854                dirty: false,
8855                expiration_time: 0,
8856            }],
8857            key_prefix: Vec::new(),
8858            dirty: false,
8859            is_delta: false,
8860            last_full_lsn: NULL_LSN,
8861            last_delta_lsn: NULL_LSN,
8862            generation: 0,
8863            parent: None,
8864            expiration_in_hours: true,
8865            cursor_count: 0,
8866            prohibit_next_delta: false,
8867            lsn_rep: LsnRep::Empty,
8868            keys: KeyRep::from_keys(vec![b"longkey_here".to_vec()]),
8869            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8870        });
8871        assert!(
8872            with_entry.log_size() > empty.log_size(),
8873            "log_size must grow when entries are added"
8874        );
8875    }
8876
8877    /// propagate_dirty_to_root() marks all ancestors dirty.
8878    #[test]
8879    fn test_propagate_dirty_to_root() {
8880        // Build a 2-level tree manually: root IN -> BIN.
8881        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
8882            node_id: generate_node_id(),
8883            level: BIN_LEVEL,
8884            entries: vec![],
8885            key_prefix: Vec::new(),
8886            dirty: false,
8887            is_delta: false,
8888            last_full_lsn: NULL_LSN,
8889            last_delta_lsn: NULL_LSN,
8890            generation: 0,
8891            parent: None, // set below
8892            expiration_in_hours: true,
8893            cursor_count: 0,
8894            prohibit_next_delta: false,
8895            lsn_rep: LsnRep::Empty,
8896            keys: KeyRep::new(),
8897            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8898        })));
8899
8900        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
8901            node_id: generate_node_id(),
8902            level: MAIN_LEVEL | 2,
8903            entries: vec![InEntry { key: vec![] }],
8904            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
8905            dirty: false,
8906            generation: 0,
8907            parent: None,
8908            lsn_rep: LsnRep::Empty,
8909        })));
8910
8911        // Wire BIN's parent to root.
8912        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
8913
8914        // Root is not dirty before propagation.
8915        assert!(!root_arc.read().is_dirty());
8916
8917        // Propagate from the BIN up.
8918        Tree::propagate_dirty_to_root(&bin_arc);
8919
8920        // Root must now be dirty.
8921        assert!(
8922            root_arc.read().is_dirty(),
8923            "root must be dirty after propagate_dirty_to_root"
8924        );
8925    }
8926
8927    /// collect_stats() on an empty tree returns all-zero stats.
8928    #[test]
8929    fn test_collect_stats_empty_tree() {
8930        let tree = Tree::new(1, 128);
8931        let stats = tree.collect_stats();
8932        assert_eq!(stats, TreeStats::default());
8933    }
8934
8935    /// collect_stats() on a single-entry tree: 1 IN + 1 BIN, height 2.
8936    #[test]
8937    fn test_collect_stats_single_insert() {
8938        let tree = Tree::new(1, 128);
8939        tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
8940        let stats = tree.collect_stats();
8941        assert_eq!(stats.n_bins, 1, "must have 1 BIN");
8942        assert_eq!(stats.n_ins, 1, "must have 1 upper IN");
8943        assert_eq!(stats.height, 2, "single-entry tree has height 2");
8944        assert!(stats.n_entries >= 1, "must have at least 1 entry total");
8945    }
8946
8947    /// collect_stats() with many inserts: entry count matches insert count.
8948    #[test]
8949    fn test_collect_stats_many_inserts() {
8950        let tree = Tree::new(1, 8);
8951        let n = 50u32;
8952        for i in 0..n {
8953            let key = format!("sk{:04}", i).into_bytes();
8954            tree.insert(key, b"v".to_vec(), Lsn::new(1, i)).unwrap();
8955        }
8956        let stats = tree.collect_stats();
8957        // All n entries should be accounted for across all BINs.
8958        // n_entries counts entries in both INs and BINs; BIN entries = n.
8959        // We verify BIN entry total equals n by summing manually.
8960        let bin_entries: u64 = stats.n_entries - stats.n_ins; // rough check
8961        // A more precise assertion: the sum of all BIN entries == n.
8962        // Since we can't easily separate, just assert the tree is non-trivial.
8963        assert!(stats.n_bins > 0, "must have at least one BIN");
8964        assert!(stats.height >= 2, "multi-entry tree has height >= 2");
8965        // Total entries in the tree must be >= n (BIN entries alone).
8966        assert!(
8967            bin_entries >= n as u64 || stats.n_entries >= n as u64,
8968            "entry count must account for all inserts"
8969        );
8970    }
8971
8972    // ========================================================================
8973    // Tests: B-tree merge / compress
8974    // ========================================================================
8975
8976    /// After deleting most keys from a tree, compress() must reduce the BIN
8977    /// count by merging under-full siblings.
8978    ///
8979    /// Strategy: build a large tree (many BINs), delete almost all keys,
8980    /// then verify compress() reduces n_bins and all surviving keys remain
8981    /// findable.  We do not hard-code the exact BIN counts because the
8982    /// preemptive splitting strategy determines the exact split points.
8983    #[test]
8984    fn test_compress_merges_underfull_bins() {
8985        let tree = Tree::new(1, 8);
8986
8987        // Insert 64 sorted keys to build a multi-BIN tree.
8988        let n = 64u32;
8989        let keys: Vec<Vec<u8>> =
8990            (0..n).map(|i| format!("cm{:04}", i).into_bytes()).collect();
8991        for (i, key) in keys.iter().enumerate() {
8992            tree.insert(key.clone(), vec![i as u8], Lsn::new(1, i as u32))
8993                .unwrap();
8994        }
8995
8996        let stats_full = tree.collect_stats();
8997        assert!(
8998            stats_full.n_bins >= 2,
8999            "must have multiple BINs after 64 inserts"
9000        );
9001
9002        // Delete all but 4 widely-spaced keys (one roughly per BIN pair).
9003        // We keep every 16th key: k0000, k0016, k0032, k0048.
9004        let keep: std::collections::HashSet<u32> =
9005            [0, 16, 32, 48].iter().cloned().collect();
9006        for i in 0..n {
9007            if !keep.contains(&i) {
9008                let key = format!("cm{:04}", i).into_bytes();
9009                tree.delete(&key);
9010            }
9011        }
9012
9013        let stats_sparse = tree.collect_stats();
9014        assert!(
9015            stats_sparse.n_bins >= 2,
9016            "should still have multiple BINs before compress"
9017        );
9018
9019        // compress() must reduce BIN count since most BINs now hold 0–1 entries.
9020        tree.compress();
9021
9022        let stats_after = tree.collect_stats();
9023        assert!(
9024            stats_after.n_bins < stats_sparse.n_bins,
9025            "compress must reduce BIN count (was {}, now {})",
9026            stats_sparse.n_bins,
9027            stats_after.n_bins
9028        );
9029
9030        // Surviving keys must still be findable.
9031        for i in keep {
9032            let key = format!("cm{:04}", i).into_bytes();
9033            let sr = tree.search(&key);
9034            assert!(
9035                sr.is_some() && sr.unwrap().exact_parent_found,
9036                "key cm{:04} must survive compress",
9037                i
9038            );
9039        }
9040    }
9041
9042    /// compress() preserves all entries: a full-BIN tree has fewer merges
9043    /// but all keys remain accessible.
9044    #[test]
9045    fn test_compress_no_op_when_full() {
9046        // Insert exactly max_entries worth of keys into a single BIN — no split
9047        // will have occurred yet, and the BINs will all be reasonably full.
9048        // We can't prevent splits entirely (preemptive), but we can verify that
9049        // compress() never loses entries.
9050        let tree = Tree::new(1, 8);
9051        let n = 32u32;
9052        for i in 0..n {
9053            let key = format!("fn{:04}", i).into_bytes();
9054            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9055        }
9056
9057        let stats_before = tree.collect_stats();
9058        tree.compress();
9059        let stats_after = tree.collect_stats();
9060
9061        // All keys still findable.
9062        for i in 0..n {
9063            let key = format!("fn{:04}", i).into_bytes();
9064            let sr = tree.search(&key);
9065            assert!(
9066                sr.is_some() && sr.unwrap().exact_parent_found,
9067                "key fn{:04} must be findable after compress",
9068                i
9069            );
9070        }
9071
9072        // BIN count must not increase.
9073        assert!(
9074            stats_after.n_bins <= stats_before.n_bins,
9075            "compress must not increase BIN count"
9076        );
9077    }
9078
9079    /// compress() on an empty tree must not panic.
9080    #[test]
9081    fn test_compress_empty_tree() {
9082        let tree = Tree::new(1, 4);
9083        tree.compress(); // must not panic
9084    }
9085
9086    /// After deleting all entries, compress() reduces BINs to 1.
9087    #[test]
9088    fn test_compress_removes_empty_bin_from_parent() {
9089        let tree = Tree::new(1, 4);
9090        // Insert enough keys to generate multiple BINs.
9091        let n = 16u32;
9092        for i in 0..n {
9093            let key = format!("ep{:04}", i).into_bytes();
9094            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9095        }
9096
9097        let stats_before = tree.collect_stats();
9098        assert!(stats_before.n_bins >= 2, "need multiple BINs for this test");
9099
9100        // Delete everything except the very last key.
9101        for i in 0..n - 1 {
9102            let key = format!("ep{:04}", i).into_bytes();
9103            tree.delete(&key);
9104        }
9105
9106        tree.compress();
9107
9108        let stats_after = tree.collect_stats();
9109        assert!(
9110            stats_after.n_bins < stats_before.n_bins,
9111            "compress must reduce BIN count after mass deletion"
9112        );
9113
9114        // The surviving key must still be findable.
9115        let last_key = format!("ep{:04}", n - 1).into_bytes();
9116        let sr = tree.search(&last_key);
9117        assert!(
9118            sr.is_some() && sr.unwrap().exact_parent_found,
9119            "last key must survive after compress"
9120        );
9121    }
9122
9123    // ========================================================================
9124    // IC-1: prune_empty_bin must NOT remove a live entry when the BIN was
9125    // repopulated between the compressor observing it empty and the prune.
9126    // (Tree corruption / lost-write regression test.)
9127    // ========================================================================
9128
9129    /// Find a BIN arc that is currently empty (0 entries) and is NOT the
9130    /// root, returning it together with the `id_key` the compressor would
9131    /// have captured (here we just use any key that routes to that BIN).
9132    fn first_empty_non_root_bin(tree: &Tree) -> Option<Arc<RwLock<TreeNode>>> {
9133        let root = tree.get_root()?;
9134        for node in tree.rebuild_in_list() {
9135            if Arc::ptr_eq(&node, &root) {
9136                continue; // skip root (single-BIN tree is never pruned)
9137            }
9138            let is_empty_bin = {
9139                let g = node.read();
9140                matches!(&*g, TreeNode::Bottom(b) if b.entries.is_empty())
9141            };
9142            if is_empty_bin {
9143                return Some(node);
9144            }
9145        }
9146        None
9147    }
9148
9149    /// IC-1 (fail-pre / pass-post): the old `compress_bin` prune step called
9150    /// `self.delete(&id_key)`, which re-descends by key.  If a concurrent
9151    /// insert repopulated the empty BIN with a LIVE entry under that same
9152    /// `id_key`, `self.delete` would silently remove the live entry — a lost
9153    /// write.  `prune_empty_bin` re-validates `n_entries == 0` under the
9154    /// parent latch and must REMOVE NOTHING when the BIN is non-empty.
9155    ///
9156    /// JE `Tree.delete` / `searchDeletableSubTree` (Tree.java ~line 755-800):
9157    /// `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort prune).
9158    #[test]
9159    fn test_ic1_prune_empty_bin_aborts_when_repopulated() {
9160        let tree = Tree::new(1, 4);
9161        let n = 16u32;
9162        for i in 0..n {
9163            let key = format!("ic{:04}", i).into_bytes();
9164            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9165        }
9166        assert!(
9167            tree.collect_stats().n_bins >= 2,
9168            "need multiple BINs for this test"
9169        );
9170
9171        // Empty out one whole BIN by deleting every key it holds.  We delete
9172        // the lowest 4 keys (ic0000..ic0003) which share the first BIN, then
9173        // physically compress it so it has 0 entries.
9174        for i in 0..4 {
9175            let key = format!("ic{:04}", i).into_bytes();
9176            tree.delete(&key);
9177        }
9178
9179        // Locate the now-empty BIN and the id_key the compressor would use.
9180        let empty_bin = match first_empty_non_root_bin(&tree) {
9181            Some(b) => b,
9182            // If the layout didn't leave an isolated empty BIN, the scenario
9183            // isn't reproducible on this build; treat as vacuously passing.
9184            None => return,
9185        };
9186
9187        // SIMULATE THE RACE: a concurrent insert repopulates the empty BIN
9188        // with a LIVE entry *before* the prune runs.  We insert directly into
9189        // the BIN arc to model the insert that lands after `now_empty` was
9190        // read.  Pick a key that routes to this BIN.
9191        let live_key = format!("ic{:04}", 1).into_bytes(); // was deleted above
9192        {
9193            let mut g = empty_bin.write();
9194            if let TreeNode::Bottom(b) = &mut *g {
9195                // T-2/T-3: route through the insert helper so entries/keys/
9196                // lsn_rep stay in lock step.
9197                b.insert_with_prefix(
9198                    live_key.clone(),
9199                    Lsn::new(1, 1),
9200                    Some(vec![0xAB]),
9201                );
9202            }
9203        }
9204        let id_key = {
9205            let g = empty_bin.read();
9206            match &*g {
9207                TreeNode::Bottom(b) => b.get_full_key(0).unwrap(),
9208                _ => unreachable!(),
9209            }
9210        };
9211
9212        // Prune must ABORT (return false) because the BIN is no longer empty,
9213        // and must NOT remove the live entry.
9214        let pruned = tree.prune_empty_bin(&id_key);
9215        assert!(!pruned, "IC-1: prune must abort when the BIN was repopulated");
9216
9217        // The live entry must still be present in the BIN.
9218        let still_there = {
9219            let g = empty_bin.read();
9220            match &*g {
9221                TreeNode::Bottom(b) => {
9222                    b.entries.iter().enumerate().any(|(i, _)| {
9223                        b.key_prefix.is_empty() && b.get_key(i) == live_key
9224                    })
9225                }
9226                _ => false,
9227            }
9228        };
9229        assert!(
9230            still_there,
9231            "IC-1: prune must not remove the repopulated live entry"
9232        );
9233    }
9234
9235    /// IC-1 companion: prune_empty_bin must abort when a cursor is parked on
9236    /// the (still-empty) BIN.  JE: `bin.nCursors() > 0` → CURSORS_EXIST.
9237    #[test]
9238    fn test_ic1_prune_empty_bin_aborts_with_cursor() {
9239        let tree = Tree::new(1, 4);
9240        for i in 0..16u32 {
9241            let key = format!("cu{:04}", i).into_bytes();
9242            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9243        }
9244        for i in 0..4 {
9245            let key = format!("cu{:04}", i).into_bytes();
9246            tree.delete(&key);
9247        }
9248        let empty_bin = match first_empty_non_root_bin(&tree) {
9249            Some(b) => b,
9250            None => return,
9251        };
9252        // Park a cursor on the empty BIN.
9253        Tree::pin_bin(&empty_bin);
9254        // id_key: any key routing to this BIN. Use the first deleted key.
9255        let id_key = format!("cu{:04}", 0).into_bytes();
9256        let pruned = tree.prune_empty_bin(&id_key);
9257        assert!(
9258            !pruned,
9259            "IC-1: prune must abort when a cursor is parked on the BIN"
9260        );
9261        Tree::unpin_bin(&empty_bin);
9262    }
9263
9264    /// IC-1 happy path: prune_empty_bin removes the parent slot when the BIN
9265    /// really is empty, no cursors, not a delta.
9266    #[test]
9267    fn test_ic1_prune_empty_bin_succeeds_when_truly_empty() {
9268        let tree = Tree::new(1, 4);
9269        for i in 0..16u32 {
9270            let key = format!("ok{:04}", i).into_bytes();
9271            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9272        }
9273        for i in 0..4 {
9274            let key = format!("ok{:04}", i).into_bytes();
9275            tree.delete(&key);
9276        }
9277        let bins_before = tree.collect_stats().n_bins;
9278        let empty_bin = match first_empty_non_root_bin(&tree) {
9279            Some(b) => b,
9280            None => return,
9281        };
9282        // id_key: a key that routes to this empty BIN (one of the deleted).
9283        let id_key = {
9284            // route by the lowest deleted key; it falls into the leftmost BIN.
9285            let _ = &empty_bin;
9286            format!("ok{:04}", 0).into_bytes()
9287        };
9288        let pruned = tree.prune_empty_bin(&id_key);
9289        assert!(pruned, "IC-1: prune must succeed on a truly empty BIN");
9290        let bins_after = tree.collect_stats().n_bins;
9291        assert!(
9292            bins_after < bins_before,
9293            "IC-1: pruned BIN slot must be removed from the parent (was {}, now {})",
9294            bins_before,
9295            bins_after
9296        );
9297        // Every surviving key must still be findable.
9298        for i in 4..16u32 {
9299            let key = format!("ok{:04}", i).into_bytes();
9300            assert!(
9301                tree.search(&key).is_some_and(|s| s.exact_parent_found),
9302                "surviving key ok{:04} must remain after prune",
9303                i
9304            );
9305        }
9306    }
9307
9308    // ========================================================================
9309    // Tests: latch-coupling validation (validate_parent_child /
9310    //        search_with_coupling)
9311    // ========================================================================
9312
9313    /// validate_parent_child returns true when the parent slot points at the
9314    /// expected child.
9315    #[test]
9316    fn test_validate_parent_child_correct_link() {
9317        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9318            node_id: generate_node_id(),
9319            level: BIN_LEVEL,
9320            entries: vec![],
9321            key_prefix: Vec::new(),
9322            dirty: false,
9323            is_delta: false,
9324            last_full_lsn: NULL_LSN,
9325            last_delta_lsn: NULL_LSN,
9326            generation: 0,
9327            parent: None,
9328            expiration_in_hours: true,
9329            cursor_count: 0,
9330            prohibit_next_delta: false,
9331            lsn_rep: LsnRep::Empty,
9332            keys: KeyRep::new(),
9333            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9334        })));
9335
9336        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
9337            node_id: generate_node_id(),
9338            level: MAIN_LEVEL | 2,
9339            entries: vec![InEntry { key: vec![] }],
9340            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
9341            dirty: false,
9342            generation: 0,
9343            parent: None,
9344            lsn_rep: LsnRep::Empty,
9345        })));
9346
9347        assert!(
9348            Tree::validate_parent_child(&root_arc, 0, &bin_arc),
9349            "link must be valid when parent slot 0 points at bin_arc"
9350        );
9351    }
9352
9353    /// validate_parent_child returns false when the slot index is out of range.
9354    #[test]
9355    fn test_validate_parent_child_out_of_range() {
9356        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
9357            node_id: generate_node_id(),
9358            level: MAIN_LEVEL | 2,
9359            entries: vec![],
9360            targets: TargetRep::None,
9361            dirty: false,
9362            generation: 0,
9363            parent: None,
9364            lsn_rep: LsnRep::Empty,
9365        })));
9366        let other_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9367            node_id: generate_node_id(),
9368            level: BIN_LEVEL,
9369            entries: vec![],
9370            key_prefix: Vec::new(),
9371            dirty: false,
9372            is_delta: false,
9373            last_full_lsn: NULL_LSN,
9374            last_delta_lsn: NULL_LSN,
9375            generation: 0,
9376            parent: None,
9377            expiration_in_hours: true,
9378            cursor_count: 0,
9379            prohibit_next_delta: false,
9380            lsn_rep: LsnRep::Empty,
9381            keys: KeyRep::new(),
9382            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9383        })));
9384
9385        assert!(
9386            !Tree::validate_parent_child(&root_arc, 0, &other_arc),
9387            "link must be invalid when parent has no entries"
9388        );
9389    }
9390
9391    /// validate_parent_child returns false when the slot points at a different Arc.
9392    #[test]
9393    fn test_validate_parent_child_wrong_child() {
9394        let bin_a = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9395            node_id: generate_node_id(),
9396            level: BIN_LEVEL,
9397            entries: vec![],
9398            key_prefix: Vec::new(),
9399            dirty: false,
9400            is_delta: false,
9401            last_full_lsn: NULL_LSN,
9402            last_delta_lsn: NULL_LSN,
9403            generation: 0,
9404            parent: None,
9405            expiration_in_hours: true,
9406            cursor_count: 0,
9407            prohibit_next_delta: false,
9408            lsn_rep: LsnRep::Empty,
9409            keys: KeyRep::new(),
9410            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9411        })));
9412        let bin_b = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9413            node_id: generate_node_id(),
9414            level: BIN_LEVEL,
9415            entries: vec![],
9416            key_prefix: Vec::new(),
9417            dirty: false,
9418            is_delta: false,
9419            last_full_lsn: NULL_LSN,
9420            last_delta_lsn: NULL_LSN,
9421            generation: 0,
9422            parent: None,
9423            expiration_in_hours: true,
9424            cursor_count: 0,
9425            prohibit_next_delta: false,
9426            lsn_rep: LsnRep::Empty,
9427            keys: KeyRep::new(),
9428            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9429        })));
9430
9431        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
9432            node_id: generate_node_id(),
9433            level: MAIN_LEVEL | 2,
9434            entries: vec![InEntry { key: vec![] }],
9435            targets: TargetRep::Sparse(vec![(0, bin_a)]),
9436            dirty: false,
9437            generation: 0,
9438            parent: None,
9439            lsn_rep: LsnRep::Empty,
9440        })));
9441
9442        assert!(
9443            !Tree::validate_parent_child(&root_arc, 0, &bin_b),
9444            "link must be invalid when parent slot points at a different Arc"
9445        );
9446    }
9447
9448    /// search_with_coupling finds the same key as search().
9449    #[test]
9450    fn test_search_with_coupling_finds_existing_key() {
9451        let tree = Tree::new(1, 8);
9452        for i in 0u32..20 {
9453            let key = format!("c{:04}", i).into_bytes();
9454            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9455        }
9456
9457        for i in 0u32..20 {
9458            let key = format!("c{:04}", i).into_bytes();
9459            let sr = tree.search_with_coupling(&key);
9460            assert!(
9461                sr.is_some() && sr.unwrap().exact_parent_found,
9462                "search_with_coupling must find c{:04}",
9463                i
9464            );
9465        }
9466    }
9467
9468    /// search_with_coupling returns false for a key not in the tree.
9469    #[test]
9470    fn test_search_with_coupling_missing_key() {
9471        let tree = Tree::new(1, 8);
9472        tree.insert(b"hello".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
9473
9474        let sr = tree.search_with_coupling(b"zzz");
9475        // The search result must either be None or have exact_parent_found=false.
9476        assert!(
9477            sr.is_none_or(|r| !r.exact_parent_found),
9478            "search_with_coupling must not find a key that was never inserted"
9479        );
9480    }
9481
9482    /// search_with_coupling on an empty tree returns None.
9483    #[test]
9484    fn test_search_with_coupling_empty_tree() {
9485        let tree = Tree::new(1, 8);
9486        assert!(tree.search_with_coupling(b"k").is_none());
9487    }
9488
9489    // ========================================================================
9490    // Tests: BIN-delta reconstitution (apply_delta_to_bin / mutate_to_full_bin)
9491    // ========================================================================
9492
9493    /// apply_delta_to_bin replaces existing entries and inserts new ones.
9494    ///
9495    /// BIN.applyDelta(): delta entries are authoritative and
9496    /// supersede full-BIN entries at the same key.
9497    #[test]
9498    fn test_apply_delta_to_bin_updates_and_inserts() {
9499        let mut base = BinStub {
9500            node_id: 1,
9501            level: BIN_LEVEL,
9502            entries: vec![
9503                BinEntry {
9504                    data: Some(b"old_a".to_vec()),
9505                    known_deleted: false,
9506                    dirty: false,
9507                    expiration_time: 0,
9508                },
9509                BinEntry {
9510                    data: Some(b"old_c".to_vec()),
9511                    known_deleted: false,
9512                    dirty: false,
9513                    expiration_time: 0,
9514                },
9515            ],
9516            key_prefix: Vec::new(),
9517            dirty: false,
9518            is_delta: false,
9519            last_full_lsn: NULL_LSN,
9520            last_delta_lsn: NULL_LSN,
9521            generation: 0,
9522            parent: None,
9523            expiration_in_hours: true,
9524            cursor_count: 0,
9525            prohibit_next_delta: false,
9526            lsn_rep: LsnRep::Empty,
9527            keys: KeyRep::from_keys(vec![b"a".to_vec(), b"c".to_vec()]),
9528            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9529        };
9530
9531        let delta_entries = vec![
9532            // Update existing key "a" with new data.
9533            (b"a".to_vec(), Lsn::new(1, 10), Some(b"new_a".to_vec())),
9534            // Insert new key "b".
9535            (b"b".to_vec(), Lsn::new(1, 20), Some(b"new_b".to_vec())),
9536        ];
9537
9538        Tree::apply_delta_to_bin(&mut base, delta_entries);
9539
9540        assert!(base.dirty, "base must be dirty after applying delta");
9541
9542        // Collect the full keys for assertions (T-2: keys live in the rep).
9543        let full_keys: Vec<Vec<u8>> = (0..base.entries.len())
9544            .map(|i| base.get_full_key(i).unwrap_or_default())
9545            .collect();
9546
9547        // "a" must be updated.
9548        let a_idx = full_keys.iter().position(|k| k == b"a").unwrap();
9549        assert_eq!(
9550            base.entries[a_idx].data.as_deref(),
9551            Some(b"new_a" as &[u8])
9552        );
9553
9554        // "b" must be newly inserted.
9555        assert!(full_keys.iter().any(|k| k == b"b"));
9556
9557        // "c" must still be present (untouched).
9558        assert!(full_keys.iter().any(|k| k == b"c"));
9559
9560        // Entries must be in sorted order.
9561        let mut sorted = full_keys.clone();
9562        sorted.sort();
9563        assert_eq!(
9564            full_keys, sorted,
9565            "entries must remain sorted after delta apply"
9566        );
9567    }
9568
9569    /// apply_delta_to_bin with an empty delta is a no-op (except dirty flag).
9570    #[test]
9571    fn test_apply_delta_to_bin_empty_delta() {
9572        let mut base = BinStub {
9573            node_id: 1,
9574            level: BIN_LEVEL,
9575            entries: vec![BinEntry {
9576                data: None,
9577                known_deleted: false,
9578                dirty: false,
9579                expiration_time: 0,
9580            }],
9581            key_prefix: Vec::new(),
9582            dirty: false,
9583            is_delta: false,
9584            last_full_lsn: NULL_LSN,
9585            last_delta_lsn: NULL_LSN,
9586            generation: 0,
9587            parent: None,
9588            expiration_in_hours: true,
9589            cursor_count: 0,
9590            prohibit_next_delta: false,
9591            lsn_rep: LsnRep::Empty,
9592            keys: KeyRep::from_keys(vec![b"x".to_vec()]),
9593            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9594        };
9595        let n_before = base.entries.len();
9596        Tree::apply_delta_to_bin(&mut base, vec![]);
9597        assert_eq!(
9598            base.entries.len(),
9599            n_before,
9600            "empty delta must not change entry count"
9601        );
9602        assert!(base.dirty, "dirty must be set even for empty delta apply");
9603    }
9604
9605    /// mutate_to_full_bin reconstitutes a full BIN from a delta + base.
9606    ///
9607    /// BIN.mutateToFullBIN(BIN fullBIN): after mutation the
9608    /// `is_delta` flag must be cleared and the entries must contain both
9609    /// base and delta data.
9610    #[test]
9611    fn test_mutate_to_full_bin_merges_delta_and_base() {
9612        let base = BinStub {
9613            node_id: 2,
9614            level: BIN_LEVEL,
9615            entries: vec![
9616                BinEntry {
9617                    data: Some(b"base_aa".to_vec()),
9618                    known_deleted: false,
9619                    dirty: false,
9620                    expiration_time: 0,
9621                },
9622                BinEntry {
9623                    data: Some(b"base_cc".to_vec()),
9624                    known_deleted: false,
9625                    dirty: false,
9626                    expiration_time: 0,
9627                },
9628            ],
9629            key_prefix: Vec::new(),
9630            dirty: false,
9631            is_delta: false,
9632            last_full_lsn: NULL_LSN,
9633            last_delta_lsn: NULL_LSN,
9634            generation: 0,
9635            parent: None,
9636            expiration_in_hours: true,
9637            cursor_count: 0,
9638            prohibit_next_delta: false,
9639            lsn_rep: LsnRep::Empty,
9640            keys: KeyRep::from_keys(vec![b"aa".to_vec(), b"cc".to_vec()]),
9641            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9642        };
9643
9644        // The delta has a new entry "bb" and overwrites "aa".
9645        let mut delta = BinStub {
9646            node_id: 2,
9647            level: BIN_LEVEL,
9648            entries: vec![
9649                BinEntry {
9650                    data: Some(b"delta_aa".to_vec()),
9651                    known_deleted: false,
9652                    dirty: false,
9653                    expiration_time: 0,
9654                },
9655                BinEntry {
9656                    data: Some(b"delta_bb".to_vec()),
9657                    known_deleted: false,
9658                    dirty: false,
9659                    expiration_time: 0,
9660                },
9661            ],
9662            key_prefix: Vec::new(),
9663            dirty: true,
9664            is_delta: true,
9665            last_full_lsn: NULL_LSN,
9666            last_delta_lsn: NULL_LSN,
9667            generation: 0,
9668            parent: None,
9669            expiration_in_hours: true,
9670            cursor_count: 0,
9671            prohibit_next_delta: false,
9672            lsn_rep: LsnRep::Empty,
9673            keys: KeyRep::from_keys(vec![b"aa".to_vec(), b"bb".to_vec()]),
9674            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9675        };
9676
9677        Tree::mutate_to_full_bin(&mut delta, base);
9678
9679        // After mutation the node must be a full BIN.
9680        assert!(
9681            !delta.is_delta,
9682            "is_delta must be false after mutate_to_full_bin"
9683        );
9684        assert!(delta.dirty, "must be dirty after mutation");
9685
9686        // Collect full keys for assertions (T-2: keys live in the rep).
9687        let dk: Vec<Vec<u8>> = (0..delta.entries.len())
9688            .map(|i| delta.get_full_key(i).unwrap_or_default())
9689            .collect();
9690
9691        // "aa" must be the delta version.
9692        let aa_idx = dk.iter().position(|k| k == b"aa").unwrap();
9693        assert_eq!(
9694            delta.entries[aa_idx].data.as_deref(),
9695            Some(b"delta_aa" as &[u8])
9696        );
9697
9698        // "bb" must be present (from delta).
9699        assert!(dk.iter().any(|k| k == b"bb"));
9700
9701        // "cc" must be present (from base).
9702        assert!(dk.iter().any(|k| k == b"cc"));
9703
9704        // Three entries total, in sorted order.
9705        assert_eq!(delta.entries.len(), 3);
9706        let mut sorted = dk.clone();
9707        sorted.sort();
9708        assert_eq!(dk, sorted, "entries must be sorted after mutation");
9709    }
9710
9711    /// is_delta flag is correctly reported by bin_is_delta().
9712    #[test]
9713    fn test_bin_is_delta_flag() {
9714        let mut bin = BinStub {
9715            node_id: 1,
9716            level: BIN_LEVEL,
9717            entries: vec![],
9718            key_prefix: Vec::new(),
9719            dirty: false,
9720            is_delta: false,
9721            last_full_lsn: NULL_LSN,
9722            last_delta_lsn: NULL_LSN,
9723            generation: 0,
9724            parent: None,
9725            expiration_in_hours: true,
9726            cursor_count: 0,
9727            prohibit_next_delta: false,
9728            lsn_rep: LsnRep::Empty,
9729            keys: KeyRep::new(),
9730            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9731        };
9732        assert!(!Tree::bin_is_delta(&bin));
9733        bin.is_delta = true;
9734        assert!(Tree::bin_is_delta(&bin));
9735    }
9736
9737    // ========================================================================
9738    // Tests: mutate_to_full_bin_from_log
9739    // ========================================================================
9740
9741    /// mutate_to_full_bin_from_log is a no-op when the BIN is already full.
9742    #[test]
9743    fn test_mutate_to_full_bin_from_log_already_full() {
9744        let dir = tempfile::tempdir().unwrap();
9745        let fm = std::sync::Arc::new(
9746            noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9747                .unwrap(),
9748        );
9749        let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9750
9751        let mut bin = BinStub {
9752            node_id: 1,
9753            level: BIN_LEVEL,
9754            entries: vec![BinEntry {
9755                data: Some(b"v1".to_vec()),
9756                known_deleted: false,
9757                dirty: false,
9758                expiration_time: 0,
9759            }],
9760            key_prefix: Vec::new(),
9761            dirty: false,
9762            is_delta: false, // already a full BIN
9763            last_full_lsn: NULL_LSN,
9764            last_delta_lsn: NULL_LSN,
9765            generation: 0,
9766            parent: None,
9767            expiration_in_hours: true,
9768            cursor_count: 0,
9769            prohibit_next_delta: false,
9770            lsn_rep: LsnRep::Empty,
9771            keys: KeyRep::from_keys(vec![b"key1".to_vec()]),
9772            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9773        };
9774
9775        Tree::mutate_to_full_bin_from_log(&mut bin, &lm);
9776
9777        // No-op: is_delta was already false, entries unchanged.
9778        assert!(!bin.is_delta);
9779        assert_eq!(bin.entries.len(), 1);
9780    }
9781
9782    /// mutate_to_full_bin_from_log with NULL_LSN promotes delta without base.
9783    ///
9784    /// When last_full_lsn is NULL_LSN the BIN has never been written as a full
9785    /// entry.  The function must clear is_delta and leave the delta entries
9786    /// as-is (they are the authoritative full state).
9787    #[test]
9788    fn test_mutate_to_full_bin_from_log_null_lsn() {
9789        let dir = tempfile::tempdir().unwrap();
9790        let fm = std::sync::Arc::new(
9791            noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9792                .unwrap(),
9793        );
9794        let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9795
9796        let mut delta = BinStub {
9797            node_id: 2,
9798            level: BIN_LEVEL,
9799            entries: vec![BinEntry {
9800                data: Some(b"delta_a".to_vec()),
9801                known_deleted: false,
9802                dirty: true,
9803                expiration_time: 0,
9804            }],
9805            key_prefix: Vec::new(),
9806            dirty: true,
9807            is_delta: true,
9808            last_full_lsn: NULL_LSN, // no full BIN ever written
9809            last_delta_lsn: NULL_LSN,
9810            generation: 0,
9811            parent: None,
9812            expiration_in_hours: true,
9813            cursor_count: 0,
9814            prohibit_next_delta: false,
9815            lsn_rep: LsnRep::Empty,
9816            keys: KeyRep::from_keys(vec![b"a".to_vec()]),
9817            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9818        };
9819
9820        Tree::mutate_to_full_bin_from_log(&mut delta, &lm);
9821
9822        // is_delta must be cleared; the single delta entry is kept as-is.
9823        assert!(
9824            !delta.is_delta,
9825            "is_delta must be false after null-lsn promotion"
9826        );
9827        assert_eq!(delta.entries.len(), 1);
9828        assert_eq!(delta.entries[0].data.as_deref(), Some(b"delta_a" as &[u8]));
9829    }
9830
9831    /// mutate_to_full_bin_from_log reads full BIN from log and merges delta.
9832    ///
9833    /// Round-trip: serialize a full BIN, write it to a LogManager, record the
9834    /// LSN, then call mutate_to_full_bin_from_log on a delta referencing that
9835    /// LSN.  The result must contain base-only and delta-only entries with the
9836    /// delta winning on conflicts.
9837    #[test]
9838    fn test_mutate_to_full_bin_from_log_reads_and_merges() {
9839        let dir = tempfile::tempdir().unwrap();
9840        let fm = std::sync::Arc::new(
9841            noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9842                .unwrap(),
9843        );
9844        let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9845
9846        // Build and serialize the full BIN that will be written to the log.
9847        let full_bin = BinStub {
9848            node_id: 42,
9849            level: BIN_LEVEL,
9850            entries: vec![
9851                BinEntry {
9852                    data: Some(b"base_val".to_vec()),
9853                    known_deleted: false,
9854                    dirty: false,
9855                    expiration_time: 0,
9856                },
9857                BinEntry {
9858                    data: Some(b"base_shared".to_vec()),
9859                    known_deleted: false,
9860                    dirty: false,
9861                    expiration_time: 0,
9862                },
9863            ],
9864            key_prefix: Vec::new(),
9865            dirty: false,
9866            is_delta: false,
9867            last_full_lsn: NULL_LSN,
9868            last_delta_lsn: NULL_LSN,
9869            generation: 0,
9870            parent: None,
9871            expiration_in_hours: true,
9872            cursor_count: 0,
9873            prohibit_next_delta: false,
9874            lsn_rep: LsnRep::Empty,
9875            keys: KeyRep::from_keys(vec![
9876                b"base_only".to_vec(),
9877                b"shared_key".to_vec(),
9878            ]),
9879            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9880        };
9881
9882        let payload = full_bin.serialize_full();
9883        let full_lsn = lm
9884            .log(
9885                noxu_log::LogEntryType::BIN,
9886                &payload,
9887                noxu_log::Provisional::No,
9888                true,
9889                false,
9890            )
9891            .expect("write full BIN to log");
9892        lm.flush_no_sync().expect("flush log");
9893
9894        // Build a delta BIN referencing the full BIN via last_full_lsn.
9895        let mut delta = BinStub {
9896            node_id: 42,
9897            level: BIN_LEVEL,
9898            entries: vec![
9899                // Overwrites "shared_key" from the base.
9900                BinEntry {
9901                    data: Some(b"delta_shared".to_vec()),
9902                    known_deleted: false,
9903                    dirty: true,
9904                    expiration_time: 0,
9905                },
9906                // New key only in the delta.
9907                BinEntry {
9908                    data: Some(b"delta_val".to_vec()),
9909                    known_deleted: false,
9910                    dirty: true,
9911                    expiration_time: 0,
9912                },
9913            ],
9914            key_prefix: Vec::new(),
9915            dirty: true,
9916            is_delta: true,
9917            last_full_lsn: full_lsn,
9918            last_delta_lsn: NULL_LSN,
9919            generation: 0,
9920            parent: None,
9921            expiration_in_hours: true,
9922            cursor_count: 0,
9923            prohibit_next_delta: false,
9924            lsn_rep: LsnRep::Empty,
9925            keys: KeyRep::from_keys(vec![
9926                b"shared_key".to_vec(),
9927                b"delta_only".to_vec(),
9928            ]),
9929            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9930        };
9931
9932        Tree::mutate_to_full_bin_from_log(&mut delta, &lm);
9933
9934        assert!(
9935            !delta.is_delta,
9936            "is_delta must be false after log-based mutation"
9937        );
9938        assert!(delta.dirty, "must be dirty after mutation");
9939
9940        // All three distinct keys must be present.
9941        let find = |k: &[u8]| -> Option<Vec<u8>> {
9942            (0..delta.entries.len())
9943                .find(|&i| delta.get_full_key(i).as_deref() == Some(k))
9944                .and_then(|i| delta.entries[i].data.clone())
9945        };
9946
9947        assert_eq!(
9948            find(b"base_only"),
9949            Some(b"base_val".to_vec()),
9950            "base-only key must be present"
9951        );
9952        assert_eq!(
9953            find(b"shared_key"),
9954            Some(b"delta_shared".to_vec()),
9955            "delta must win on shared_key"
9956        );
9957        assert_eq!(
9958            find(b"delta_only"),
9959            Some(b"delta_val".to_vec()),
9960            "delta-only key must be present"
9961        );
9962        assert_eq!(delta.entries.len(), 3, "must have exactly 3 entries");
9963
9964        // Entries must be in sorted order (by full key).
9965        let full_keys: Vec<Vec<u8>> = (0..delta.entries.len())
9966            .map(|i| delta.get_full_key(i).unwrap())
9967            .collect();
9968        let mut sorted_keys = full_keys.clone();
9969        sorted_keys.sort();
9970        assert_eq!(full_keys, sorted_keys, "entries must be in sorted order");
9971    }
9972
9973    // ========================================================================
9974    // Tests: deserialize_full key prefix recomputation
9975    // ========================================================================
9976
9977    /// deserialize_full recomputes key prefix from loaded full keys.
9978    ///
9979    /// IN.recalcKeyPrefix() called after materializing from log:
9980    /// a BIN loaded from the log should have prefix compression applied so
9981    /// that search performance matches an in-memory BIN.
9982    #[test]
9983    fn test_deserialize_full_recomputes_key_prefix() {
9984        // Build a BIN with a known common prefix and serialize it.
9985        let mut source = BinStub {
9986            node_id: 99,
9987            level: BIN_LEVEL,
9988            entries: vec![
9989                BinEntry {
9990                    data: None,
9991                    known_deleted: false,
9992                    dirty: false,
9993                    expiration_time: 0,
9994                },
9995                BinEntry {
9996                    data: None,
9997                    known_deleted: false,
9998                    dirty: false,
9999                    expiration_time: 0,
10000                },
10001                BinEntry {
10002                    data: None,
10003                    known_deleted: false,
10004                    dirty: false,
10005                    expiration_time: 0,
10006                },
10007            ],
10008            key_prefix: Vec::new(),
10009            dirty: false,
10010            is_delta: false,
10011            last_full_lsn: NULL_LSN,
10012            last_delta_lsn: NULL_LSN,
10013            generation: 0,
10014            parent: None,
10015            expiration_in_hours: true,
10016            cursor_count: 0,
10017            prohibit_next_delta: false,
10018            lsn_rep: LsnRep::Empty,
10019            keys: KeyRep::from_keys(vec![
10020                b"pfx:alpha".to_vec(),
10021                b"pfx:beta".to_vec(),
10022                b"pfx:gamma".to_vec(),
10023            ]),
10024            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10025        };
10026        source.recompute_key_prefix();
10027        // Verify the source has the expected prefix before serializing.
10028        assert_eq!(source.key_prefix, b"pfx:");
10029
10030        let payload = source.serialize_full();
10031
10032        // Deserialize and verify prefix is re-established.
10033        let loaded = BinStub::deserialize_full(&payload)
10034            .expect("deserialization must succeed");
10035
10036        assert_eq!(
10037            loaded.key_prefix, b"pfx:",
10038            "key prefix must be recomputed after deserialize_full"
10039        );
10040
10041        // All full keys must be reconstructable.
10042        for i in 0..loaded.entries.len() {
10043            let fk = loaded.get_full_key(i).unwrap();
10044            assert!(
10045                fk.starts_with(b"pfx:"),
10046                "full key {i} must start with prefix"
10047            );
10048        }
10049    }
10050
10051    /// deserialize_full with a single entry leaves key_prefix empty.
10052    ///
10053    /// A BIN with fewer than 2 entries cannot have a meaningful common prefix.
10054    #[test]
10055    fn test_deserialize_full_single_entry_no_prefix() {
10056        let source = BinStub {
10057            node_id: 7,
10058            level: BIN_LEVEL,
10059            entries: vec![BinEntry {
10060                data: None,
10061                known_deleted: false,
10062                dirty: false,
10063                expiration_time: 0,
10064            }],
10065            key_prefix: Vec::new(),
10066            dirty: false,
10067            is_delta: false,
10068            last_full_lsn: NULL_LSN,
10069            last_delta_lsn: NULL_LSN,
10070            generation: 0,
10071            parent: None,
10072            expiration_in_hours: true,
10073            cursor_count: 0,
10074            prohibit_next_delta: false,
10075            lsn_rep: LsnRep::Empty,
10076            keys: KeyRep::from_keys(vec![b"solo".to_vec()]),
10077            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10078        };
10079
10080        let payload = source.serialize_full();
10081        let loaded = BinStub::deserialize_full(&payload)
10082            .expect("deserialization must succeed");
10083
10084        assert!(
10085            loaded.key_prefix.is_empty(),
10086            "single-entry BIN must have empty prefix"
10087        );
10088        assert_eq!(loaded.get_full_key(0).unwrap(), b"solo");
10089    }
10090
10091    // ========================================================================
10092    // Tests: get_next_bin / get_prev_bin
10093    // ========================================================================
10094
10095    /// get_next_bin returns the entries of the next BIN to the right.
10096    ///
10097    /// Tree.getNextBin() / getNextIN(forward=true).
10098    #[test]
10099    fn test_get_next_bin_basic() {
10100        let tree = Tree::new(1, 4);
10101
10102        // Insert 8 sorted keys — creates multiple BINs.
10103        for i in 0u32..8 {
10104            let key = format!("n{:04}", i).into_bytes();
10105            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10106        }
10107
10108        let stats = tree.collect_stats();
10109        if stats.n_bins < 2 {
10110            // If the tree only has one BIN, skip the sibling test.
10111            return;
10112        }
10113
10114        // A key from the first BIN (e.g. "n0000") should have a next BIN.
10115        let next = tree.get_next_bin(b"n0000");
10116        assert!(
10117            next.is_some(),
10118            "must return a next BIN for a key in the leftmost BIN"
10119        );
10120
10121        let entries = next.unwrap();
10122        assert!(!entries.is_empty(), "next BIN must not be empty");
10123        // All returned keys must be strictly greater than "n0000" because they
10124        // are in a different (rightward) BIN.
10125        for (_, _, k) in &entries {
10126            assert!(
10127                k.as_slice() > b"n0000" as &[u8],
10128                "next BIN entries must all be > the search key"
10129            );
10130        }
10131    }
10132
10133    /// get_next_bin returns None for a key in the rightmost BIN.
10134    #[test]
10135    fn test_get_next_bin_at_rightmost_returns_none() {
10136        let tree = Tree::new(1, 4);
10137        for i in 0u32..8 {
10138            let key = format!("r{:04}", i).into_bytes();
10139            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10140        }
10141        // A key from the rightmost BIN (e.g. "r0007") has no next BIN.
10142        let next = tree.get_next_bin(b"r0007");
10143        assert!(
10144            next.is_none(),
10145            "must return None for a key in the rightmost BIN"
10146        );
10147    }
10148
10149    /// get_prev_bin returns the entries of the next BIN to the left.
10150    ///
10151    /// Tree.getPrevBin() / getNextIN(forward=false).
10152    #[test]
10153    fn test_get_prev_bin_basic() {
10154        let tree = Tree::new(1, 4);
10155        for i in 0u32..8 {
10156            let key = format!("p{:04}", i).into_bytes();
10157            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10158        }
10159
10160        // A key from the second BIN ("p0004") should have a previous BIN.
10161        let prev = tree.get_prev_bin(b"p0004");
10162        assert!(
10163            prev.is_some(),
10164            "must return a prev BIN for a key in the second BIN"
10165        );
10166
10167        let entries = prev.unwrap();
10168        assert!(!entries.is_empty(), "prev BIN must not be empty");
10169        // All returned keys must be < b"p0004".
10170        for (_, _, k) in &entries {
10171            assert!(
10172                k.as_slice() < b"p0004" as &[u8],
10173                "prev BIN entries must all be < the current BIN"
10174            );
10175        }
10176    }
10177
10178    /// get_prev_bin returns None for a key in the leftmost BIN.
10179    #[test]
10180    fn test_get_prev_bin_at_leftmost_returns_none() {
10181        let tree = Tree::new(1, 4);
10182        for i in 0u32..8 {
10183            let key = format!("q{:04}", i).into_bytes();
10184            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10185        }
10186        // A key from the leftmost BIN ("q0000") has no prev BIN.
10187        let prev = tree.get_prev_bin(b"q0000");
10188        assert!(
10189            prev.is_none(),
10190            "must return None for a key in the leftmost BIN"
10191        );
10192    }
10193
10194    /// get_next_bin and get_prev_bin are inverse operations across the
10195    /// BIN boundary.
10196    #[test]
10197    fn test_next_prev_bin_are_symmetric() {
10198        let tree = Tree::new(1, 4);
10199        for i in 0u32..8 {
10200            let key = format!("s{:04}", i).into_bytes();
10201            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10202        }
10203
10204        // From first BIN (s0000): next → second BIN entries.
10205        let next_from_first = tree.get_next_bin(b"s0000").unwrap();
10206        // The smallest key of the next BIN.
10207        let next_first_key =
10208            next_from_first.iter().map(|(_, _, k)| k.clone()).min().unwrap();
10209
10210        // From that key in the second BIN: prev → should overlap with first BIN.
10211        let prev_from_second = tree.get_prev_bin(&next_first_key).unwrap();
10212        let prev_first_key =
10213            prev_from_second.iter().map(|(_, _, k)| k.clone()).max().unwrap();
10214
10215        // The max key of the "prev" result must be in the first BIN (< next boundary).
10216        assert!(
10217            prev_first_key < next_first_key,
10218            "prev BIN entries must be smaller than the boundary key"
10219        );
10220    }
10221
10222    /// get_next_bin on an empty tree returns None.
10223    #[test]
10224    fn test_get_next_bin_empty_tree() {
10225        let tree = Tree::new(1, 8);
10226        assert!(tree.get_next_bin(b"any").is_none());
10227    }
10228
10229    /// get_prev_bin on an empty tree returns None.
10230    #[test]
10231    fn test_get_prev_bin_empty_tree() {
10232        let tree = Tree::new(1, 8);
10233        assert!(tree.get_prev_bin(b"any").is_none());
10234    }
10235
10236    // =========================================================================
10237    // R3 fix: get_next_bin / get_prev_bin honour the custom comparator
10238    // =========================================================================
10239
10240    /// R3 regression test: with a custom comparator that reverses byte order
10241    /// (descending), `get_next_bin` and `get_prev_bin` must use comparator
10242    /// order when routing through internal nodes.
10243    ///
10244    /// Pre-fix: the static `get_adjacent_bin_attempt` used raw `<=` byte order
10245    /// for IN routing, causing it to descend to the wrong child when comparator
10246    /// order ≠ byte order.
10247    ///
10248    /// The tree is forced to split (max_entries = 4) so there IS an internal
10249    /// node (IN) to route through. Under a reverse comparator the insertion
10250    /// order and stored key order are reversed relative to byte order, so any
10251    /// descent that uses raw byte comparison will pick the wrong slot.
10252    ///
10253    /// Pass-post invariant: iterating forward via repeated `get_next_bin` from
10254    /// the leftmost BIN yields keys in COMPARATOR order (descending byte order
10255    /// here), not in raw ascending byte order.
10256    #[test]
10257    fn test_get_next_prev_bin_custom_comparator_order() {
10258        // Reverse-order comparator: larger bytes sort first.
10259        let reverse_cmp: KeyComparatorFn =
10260            Arc::new(|a: &[u8], b: &[u8]| b.cmp(a));
10261        // Small max_entries so the tree splits and has internal nodes.
10262        let mut tree = Tree::new(1, 4);
10263        tree.set_comparator(reverse_cmp);
10264
10265        // Insert keys that are ascending in byte order ("a" < "b" < … < "i")
10266        // but descending in comparator order (i > h > … > a).
10267        let keys: &[&[u8]] =
10268            &[b"a", b"b", b"c", b"d", b"e", b"f", b"g", b"h", b"i"];
10269        for (i, k) in keys.iter().enumerate() {
10270            tree.insert(
10271                k.to_vec(),
10272                vec![i as u8],
10273                Lsn::from_u64((i + 1) as u64),
10274            )
10275            .unwrap();
10276        }
10277
10278        // Collect all BINs by walking from the comparator-smallest key ("i"
10279        // in reverse order) using get_next_bin. The anchor must be a key that
10280        // is smaller than everything in comparator order, i.e. the largest
10281        // byte-value key. We use the tree's search to find the actual leftmost
10282        // key under the comparator by starting from "i" (comparator-min).
10283        //
10284        // Strategy: start at byte key b"\xff" (larger than any inserted key in
10285        // byte order, so it lands in the last BIN in byte order, which under
10286        // a reverse comparator is the leftmost BIN in comparator order). Then
10287        // walk via get_next_bin.
10288        let start_anchor = b"\xff".as_ref();
10289        let mut bin_first_keys: Vec<Vec<u8>> = Vec::new();
10290
10291        // The first BIN in comparator order contains "i" (largest byte key).
10292        // get_next_bin from a virtual start in that BIN gives the next one.
10293        // Collect by walking from the comparator-last key leftward instead:
10294        // use get_next_bin with anchor = b"\xff" to hop to the next BIN
10295        // (comparator order: next = smaller byte value).
10296        let mut anchor = start_anchor.to_vec();
10297        loop {
10298            match tree.get_next_bin(&anchor) {
10299                None => break,
10300                Some(entries) => {
10301                    if let Some((_, _, fk0)) = entries.first() {
10302                        let fk = fk0.clone();
10303                        bin_first_keys.push(fk.clone());
10304                        anchor = fk;
10305                    } else {
10306                        break;
10307                    }
10308                }
10309            }
10310        }
10311
10312        // We must have visited at least 2 BINs (tree was forced to split).
10313        assert!(
10314            bin_first_keys.len() >= 2,
10315            "R3: expected multiple BINs after split, got {}",
10316            bin_first_keys.len()
10317        );
10318
10319        // With a reverse comparator, bin_first_keys must be in descending byte
10320        // order (each successive BIN starts at a smaller byte key).
10321        for window in bin_first_keys.windows(2) {
10322            assert!(
10323                window[0] > window[1],
10324                "R3: BIN boundary keys must be descending (comparator order); \
10325                 got {:?} then {:?}",
10326                window[0],
10327                window[1]
10328            );
10329        }
10330    }
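    // ========================================================================
    // Tests: key-prefix compression
    // ========================================================================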
10332
10333    /// Inserting keys with a common prefix causes the BIN to establish that
10334    /// prefix.  Stored suffixes are shorter than the full keys.
10335    #[test]
10336    fn test_binstub_prefix_established_on_insert() {
10337        let mut bin = BinStub {
10338            node_id: 1,
10339            level: BIN_LEVEL,
10340            entries: Vec::new(),
10341            key_prefix: Vec::new(),
10342            dirty: false,
10343            is_delta: false,
10344            last_full_lsn: NULL_LSN,
10345            last_delta_lsn: NULL_LSN,
10346            generation: 0,
10347            parent: None,
10348            expiration_in_hours: true,
10349            cursor_count: 0,
10350            prohibit_next_delta: false,
10351            lsn_rep: LsnRep::Empty,
10352            keys: KeyRep::new(),
10353            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10354        };
10355
10356        bin.insert_with_prefix(b"record:aaa".to_vec(), Lsn::new(1, 1), None);
10357        assert!(bin.key_prefix.is_empty(), "single entry: no prefix yet");
10358
10359        bin.insert_with_prefix(b"record:bbb".to_vec(), Lsn::new(1, 2), None);
10360        assert_eq!(
10361            &bin.key_prefix, b"record:",
10362            "common prefix 'record:' must be extracted"
10363        );
10364    }
10365
10366    /// `get_full_key` on a BinStub returns the full key regardless of whether
10367    /// the stored key is a raw full key or a suffix.
10368    #[test]
10369    fn test_binstub_get_full_key_roundtrip() {
10370        let mut bin = BinStub {
10371            node_id: 1,
10372            level: BIN_LEVEL,
10373            entries: Vec::new(),
10374            key_prefix: Vec::new(),
10375            dirty: false,
10376            is_delta: false,
10377            last_full_lsn: NULL_LSN,
10378            last_delta_lsn: NULL_LSN,
10379            generation: 0,
10380            parent: None,
10381            expiration_in_hours: true,
10382            cursor_count: 0,
10383            prohibit_next_delta: false,
10384            lsn_rep: LsnRep::Empty,
10385            keys: KeyRep::new(),
10386            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10387        };
10388
10389        let keys = [
10390            b"pfx:first".as_ref(),
10391            b"pfx:second".as_ref(),
10392            b"pfx:third".as_ref(),
10393        ];
10394        for k in keys {
10395            bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10396        }
10397
10398        assert!(!bin.key_prefix.is_empty(), "prefix must be set");
10399
10400        for (i, expected) in keys.iter().enumerate() {
10401            let full = bin.get_full_key(i).expect("must return full key");
10402            assert_eq!(
10403                full.as_slice(),
10404                *expected,
10405                "get_full_key({}) must return full key",
10406                i
10407            );
10408        }
10409    }
10410
10411    /// `find_entry_compressed` on a BinStub with active prefix returns the
10412    /// correct slot index.
10413    #[test]
10414    fn test_binstub_find_entry_compressed() {
10415        let mut bin = BinStub {
10416            node_id: 1,
10417            level: BIN_LEVEL,
10418            entries: Vec::new(),
10419            key_prefix: Vec::new(),
10420            dirty: false,
10421            is_delta: false,
10422            last_full_lsn: NULL_LSN,
10423            last_delta_lsn: NULL_LSN,
10424            generation: 0,
10425            parent: None,
10426            expiration_in_hours: true,
10427            cursor_count: 0,
10428            prohibit_next_delta: false,
10429            lsn_rep: LsnRep::Empty,
10430            keys: KeyRep::new(),
10431            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10432        };
10433
10434        for k in
10435            [b"db:alpha".as_ref(), b"db:beta".as_ref(), b"db:gamma".as_ref()]
10436        {
10437            bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10438        }
10439
10440        let (idx, found) = bin.find_entry_compressed(b"db:beta");
10441        assert!(found, "db:beta must be found");
10442        assert_eq!(idx, 1, "db:beta must be at index 1");
10443
10444        let (_, not_found) = bin.find_entry_compressed(b"db:zzz");
10445        assert!(!not_found, "db:zzz must not be found");
10446    }
10447
10448    /// Tree insert/search works correctly when BINs accumulate a key prefix.
10449    #[test]
10450    fn test_tree_insert_search_with_prefix_compression() {
10451        let tree = Tree::new(1, 8);
10452        let n = 200u32;
10453
10454        // All keys share a long common prefix — good for prefix compression.
10455        for i in 0..n {
10456            let key = format!("namespace:entity:{:06}", i).into_bytes();
10457            let data = vec![i as u8];
10458            tree.insert(key, data, Lsn::new(1, i)).unwrap();
10459        }
10460
10461        // All keys must be findable.
10462        for i in 0..n {
10463            let key = format!("namespace:entity:{:06}", i).into_bytes();
10464            let sr = tree.search(&key);
10465            assert!(
10466                sr.is_some() && sr.unwrap().exact_parent_found,
10467                "key namespace:entity:{:06} must be found",
10468                i
10469            );
10470        }
10471    }
10472
10473    /// Prefix survives a BIN split: keys in both halves must still be findable.
10474    #[test]
10475    fn test_prefix_preserved_across_bin_split() {
10476        // Small fanout to force splits quickly.
10477        let tree = Tree::new(1, 4);
10478
10479        for i in 0u32..20 {
10480            let key = format!("pfx:key:{:04}", i).into_bytes();
10481            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10482        }
10483
10484        // All keys must be findable after splits.
10485        for i in 0u32..20 {
10486            let key = format!("pfx:key:{:04}", i).into_bytes();
10487            let sr = tree.search(&key);
10488            assert!(
10489                sr.is_some() && sr.unwrap().exact_parent_found,
10490                "pfx:key:{:04} must be found after splits",
10491                i
10492            );
10493        }
10494    }
10495
10496    /// `decompress_key` round-trips: compress then decompress gives the original.
10497    #[test]
10498    fn test_binstub_compress_decompress_roundtrip() {
10499        let mut bin = BinStub {
10500            node_id: 1,
10501            level: BIN_LEVEL,
10502            entries: Vec::new(),
10503            key_prefix: Vec::new(),
10504            dirty: false,
10505            is_delta: false,
10506            last_full_lsn: NULL_LSN,
10507            last_delta_lsn: NULL_LSN,
10508            generation: 0,
10509            parent: None,
10510            expiration_in_hours: true,
10511            cursor_count: 0,
10512            prohibit_next_delta: false,
10513            lsn_rep: LsnRep::Empty,
10514            keys: KeyRep::new(),
10515            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10516        };
10517
10518        for k in [b"myapp:user:1".as_ref(), b"myapp:user:2".as_ref()] {
10519            bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10520        }
10521
10522        assert!(!bin.key_prefix.is_empty());
10523
10524        // Manually compress a full key and then decompress it.
10525        let full_key = b"myapp:user:3";
10526        let suffix = bin.compress_key(full_key);
10527        let recovered = bin.decompress_key(&suffix);
10528        assert_eq!(
10529            recovered.as_slice(),
10530            full_key,
10531            "compress→decompress must be identity"
10532        );
10533    }
10534
10535    /// get_next_bin correctly navigates a 3-level tree.
10536    #[test]
10537    fn test_get_next_bin_three_level_tree() {
10538        // With fanout 4, inserting 20 keys forces a root split → 3 levels.
10539        let tree = Tree::new(1, 4);
10540        for i in 0u32..20 {
10541            let key = format!("t{:04}", i).into_bytes();
10542            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10543        }
10544        assert!(tree.get_root_splits() > 0, "tree must have grown to 3 levels");
10545
10546        // Starting from t0000, iterating via get_next_bin must visit every BIN.
10547        let mut visited: Vec<Vec<u8>> = Vec::new();
10548        // Collect the first BIN's keys by searching for t0000.
10549        if let Some(first_entries) = {
10550            // Get the leftmost BIN by using get_first_node result.
10551            // get_first_node returns SearchResult at index 0 in the leftmost BIN.
10552            // We approximate by reading the root's leftmost BIN directly.
10553            tree.get_next_bin(b"t0000")
10554        } {
10555            for (_, _, k) in first_entries {
10556                visited.push(k);
10557            }
10558        }
10559
10560        // visited should contain at least one key from the second BIN.
10561        assert!(
10562            !visited.is_empty(),
10563            "should have visited at least one key via get_next_bin in 3-level tree"
10564        );
10565    }
10566
    // ========================================================================
    // Tests: JE-derived tree creation, balance, and split invariants
    // ========================================================================
10569
10570    /// insert a small set of keys
10571    /// with varying lengths and verify each is findable immediately after insert.
10572    #[test]
10573    fn test_je_simple_tree_creation() {
10574        let tree = Tree::new(1, 128);
10575
10576        let keys: &[&[u8]] = &[b"aaaaa", b"aaaab", b"aaaa", b"aaa"];
10577        for (i, &k) in keys.iter().enumerate() {
10578            tree.insert(k.to_vec(), vec![i as u8], Lsn::new(1, i as u32))
10579                .unwrap();
10580
10581            // Every key inserted so far must be findable.
10582            for &prev in &keys[..=i] {
10583                let sr = tree.search(prev);
10584                assert!(
10585                    sr.is_some() && sr.unwrap().exact_parent_found,
10586                    "key {:?} must be findable after {} inserts",
10587                    std::str::from_utf8(prev).unwrap_or("?"),
10588                    i + 1
10589                );
10590            }
10591        }
10592    }
10593
10594    /// insert N keys, verify
10595    /// all are found; delete the even-indexed keys, verify even are gone and
10596    /// odd remain.
10597    #[test]
10598    fn test_je_insert_then_delete_then_search() {
10599        let tree = Tree::new(1, 8);
10600        let n = 20usize;
10601
10602        let keys: Vec<Vec<u8>> =
10603            (0..n).map(|i| format!("key{:04}", i).into_bytes()).collect();
10604
10605        // Insert all.
10606        for (i, k) in keys.iter().enumerate() {
10607            tree.insert(k.clone(), vec![i as u8], Lsn::new(1, i as u32))
10608                .unwrap();
10609        }
10610
10611        // All must be findable.
10612        for k in &keys {
10613            let sr = tree.search(k);
10614            assert!(
10615                sr.is_some() && sr.unwrap().exact_parent_found,
10616                "key {:?} must be found after insert",
10617                std::str::from_utf8(k).unwrap_or("?")
10618            );
10619        }
10620
10621        // Delete even-indexed keys.
10622        for i in (0..n).step_by(2) {
10623            tree.delete(&keys[i]);
10624        }
10625
10626        // Even keys must no longer be found; odd keys must still be found.
10627        for (i, key) in keys.iter().enumerate() {
10628            let sr = tree.search(key);
10629            let found = sr.is_some() && sr.unwrap().exact_parent_found;
10630            if i % 2 == 0 {
10631                assert!(!found, "deleted key {:?} must not be found", i);
10632            } else {
10633                assert!(found, "kept key {:?} must still be found", i);
10634            }
10635        }
10636    }
10637
10638    /// insert N keys in reverse
10639    /// order, then verify every key is directly findable and the keys are in
10640    /// sorted ascending order (B-tree ordering invariant).
10641    #[test]
10642    fn test_je_range_scan_sorted_ascending() {
10643        let n = 40usize;
10644        let tree = Tree::new(1, 4);
10645
10646        // Insert in reverse order to stress the B-tree.
10647        for i in (0..n).rev() {
10648            let key = format!("scan{:04}", i).into_bytes();
10649            tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10650        }
10651
10652        // Collect all expected keys in sorted order.
10653        let mut expected: Vec<Vec<u8>> =
10654            (0..n).map(|i| format!("scan{:04}", i).into_bytes()).collect();
10655        expected.sort();
10656
10657        // Every key must be individually findable.
10658        for key in &expected {
10659            let sr = tree.search(key);
10660            assert!(
10661                sr.is_some() && sr.unwrap().exact_parent_found,
10662                "key {:?} must be findable",
10663                std::str::from_utf8(key).unwrap_or("?")
10664            );
10665        }
10666
10667        // Verify sorted ordering invariant: expected keys are already sorted
10668        // (lexicographic order = insertion order for "scan{:04}" keys).
10669        for w in expected.windows(2) {
10670            assert!(
10671                w[0] < w[1],
10672                "keys must be in strict ascending order: {:?} < {:?}",
10673                std::str::from_utf8(&w[0]).unwrap_or("?"),
10674                std::str::from_utf8(&w[1]).unwrap_or("?")
10675            );
10676        }
10677
10678        // Use get_next_bin to scan at least a portion of the tree and verify
10679        // ordering of returned BIN entries.
10680        let first_key = format!("scan{:04}", 0).into_bytes();
10681        if let Some(entries) = tree.get_next_bin(&first_key) {
10682            let entry_keys: Vec<&[u8]> =
10683                entries.iter().map(|(_, _, k)| k.as_slice()).collect();
10684            for w in entry_keys.windows(2) {
10685                assert!(
10686                    w[0] <= w[1],
10687                    "BIN entries from get_next_bin must be in ascending order"
10688                );
10689            }
10690        }
10691    }
10692
10693    /// insert N keys in
10694    /// ascending order and verify the tree height stays bounded (≤ 10 levels)
10695    /// and all keys are findable.
10696    #[test]
10697    fn test_je_ascending_insert_balance() {
10698        let n = 128usize;
10699        let tree = Tree::new(1, 8);
10700
10701        for i in 0..n {
10702            let key = format!("asc{:06}", i).into_bytes();
10703            tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10704                .unwrap();
10705        }
10706
10707        let stats = tree.collect_stats();
10708        assert!(
10709            stats.height <= 10,
10710            "tree height after {} ascending inserts with fanout 8 must be <= 10, got {}",
10711            n,
10712            stats.height
10713        );
10714
10715        for i in 0..n {
10716            let key = format!("asc{:06}", i).into_bytes();
10717            let sr = tree.search(&key);
10718            assert!(
10719                sr.is_some() && sr.unwrap().exact_parent_found,
10720                "key asc{:06} must be findable after ascending inserts",
10721                i
10722            );
10723        }
10724    }
10725
10726    /// insert N keys in
10727    /// descending order and verify the tree height stays bounded (≤ 10 levels)
10728    /// and all keys are findable.
10729    #[test]
10730    fn test_je_descending_insert_balance() {
10731        let n = 128usize;
10732        let tree = Tree::new(1, 8);
10733
10734        for i in (0..n).rev() {
10735            let key = format!("dsc{:06}", i).into_bytes();
10736            tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10737                .unwrap();
10738        }
10739
10740        let stats = tree.collect_stats();
10741        assert!(
10742            stats.height <= 10,
10743            "tree height after {} descending inserts with fanout 8 must be <= 10, got {}",
10744            n,
10745            stats.height
10746        );
10747
10748        for i in 0..n {
10749            let key = format!("dsc{:06}", i).into_bytes();
10750            let sr = tree.search(&key);
10751            assert!(
10752                sr.is_some() && sr.unwrap().exact_parent_found,
10753                "key dsc{:06} must be findable after descending inserts",
10754                i
10755            );
10756        }
10757    }
10758
10759    /// SplitTest invariant: after many splits induced by a small
10760    /// fanout no key is lost.
10761    #[test]
10762    fn test_je_split_no_key_lost() {
10763        let tree = Tree::new(1, 4);
10764        let n = 20usize;
10765
10766        for i in 0..n {
10767            let key = format!("sp{:04}", i).into_bytes();
10768            tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10769        }
10770
10771        for i in 0..n {
10772            let key = format!("sp{:04}", i).into_bytes();
10773            let sr = tree.search(&key);
10774            assert!(
10775                sr.is_some() && sr.unwrap().exact_parent_found,
10776                "key sp{:04} must survive all splits",
10777                i
10778            );
10779        }
10780    }
10781
10782    /// SplitTest invariant: after a BIN split both halves exist and
10783    /// all original keys are findable.
10784    #[test]
10785    fn test_je_split_produces_two_halves() {
10786        // fanout=4: fill one BIN then overflow it to force a split.
10787        let tree = Tree::new(1, 4);
10788        let n = 5usize; // one more than fanout → forces at least one split
10789
10790        for i in 0..n {
10791            let key = format!("half{:04}", i).into_bytes();
10792            tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10793        }
10794
10795        let stats = tree.collect_stats();
10796        assert!(
10797            stats.n_bins >= 2,
10798            "after splitting a full BIN there must be >= 2 BINs, got {}",
10799            stats.n_bins
10800        );
10801
10802        for i in 0..n {
10803            let key = format!("half{:04}", i).into_bytes();
10804            let sr = tree.search(&key);
10805            assert!(
10806                sr.is_some() && sr.unwrap().exact_parent_found,
10807                "key half{:04} must be findable in one of the two halves",
10808                i
10809            );
10810        }
10811    }
10812
10813    /// SplitTest invariant: root splits are tracked and the tree
10814    /// grows in height as keys accumulate.
10815    #[test]
10816    fn test_je_root_split_creates_new_root() {
10817        // fanout=4, 20 keys: forces multiple root splits.
10818        let tree = Tree::new(1, 4);
10819
10820        for i in 0u32..20 {
10821            let key = format!("rs{:04}", i).into_bytes();
10822            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10823        }
10824
10825        assert!(
10826            tree.get_root_splits() > 0,
10827            "expected at least one root split after 20 inserts with fanout 4"
10828        );
10829
10830        let stats = tree.collect_stats();
10831        assert!(
10832            stats.height >= 3,
10833            "tree must be at least 3 levels tall after root splits, got {}",
10834            stats.height
10835        );
10836
10837        // Every inserted key must still be findable.
10838        for i in 0u32..20 {
10839            let key = format!("rs{:04}", i).into_bytes();
10840            let sr = tree.search(&key);
10841            assert!(
10842                sr.is_some() && sr.unwrap().exact_parent_found,
10843                "key rs{:04} must be findable after root splits",
10844                i
10845            );
10846        }
10847    }
10848
10849    // ========================================================================
10850    // Tests: compress_bin / maybe_compress_bin_and_parent
10851    // INCompressor.compressBin / lazyCompress tests
10852    // ========================================================================
10853
10854    /// compress_bin removes known-deleted slots from a BIN.
10855    ///
10856    /// INCompressor.compressBin(): after compression, slots with
10857    /// `known_deleted = true` must be gone and the BIN must be dirty.
10858    #[test]
10859    fn test_compress_bin_removes_deleted_slots() {
10860        let _lsn = Lsn::new(1, 1);
10861        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
10862            node_id: generate_node_id(),
10863            level: BIN_LEVEL,
10864            entries: vec![
10865                BinEntry {
10866                    data: Some(b"live".to_vec()),
10867                    known_deleted: false,
10868                    dirty: false,
10869                    expiration_time: 0,
10870                },
10871                BinEntry {
10872                    data: None,
10873                    known_deleted: true,
10874                    dirty: false,
10875                    expiration_time: 0,
10876                },
10877                BinEntry {
10878                    data: Some(b"live2".to_vec()),
10879                    known_deleted: false,
10880                    dirty: false,
10881                    expiration_time: 0,
10882                },
10883                BinEntry {
10884                    data: None,
10885                    known_deleted: true,
10886                    dirty: false,
10887                    expiration_time: 0,
10888                },
10889            ],
10890            key_prefix: Vec::new(),
10891            dirty: false,
10892            is_delta: false,
10893            last_full_lsn: NULL_LSN,
10894            last_delta_lsn: NULL_LSN,
10895            generation: 0,
10896            parent: None,
10897            expiration_in_hours: true,
10898            cursor_count: 0,
10899            prohibit_next_delta: false,
10900            lsn_rep: LsnRep::Empty,
10901            keys: KeyRep::from_keys(vec![
10902                b"a".to_vec(),
10903                b"b".to_vec(),
10904                b"c".to_vec(),
10905                b"d".to_vec(),
10906            ]),
10907            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10908        })));
10909
10910        // Wire a minimal parent IN so compress_bin can prune if needed.
10911        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
10912            node_id: generate_node_id(),
10913            level: MAIN_LEVEL | 2,
10914            entries: vec![InEntry { key: vec![] }],
10915            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
10916            dirty: false,
10917            generation: 0,
10918            parent: None,
10919            lsn_rep: LsnRep::Empty,
10920        })));
10921        {
10922            let mut g = bin_arc.write();
10923            g.set_parent(Some(Arc::downgrade(&root_arc)));
10924        }
10925
10926        let tree = Tree::new(1, 128);
10927        *tree.root.write() = Some(root_arc);
10928
10929        let result = tree.compress_bin(&bin_arc);
10930        assert!(
10931            result,
10932            "compress_bin must return true when slots were removed"
10933        );
10934
10935        let g = bin_arc.read();
10936        match &*g {
10937            TreeNode::Bottom(b) => {
10938                assert_eq!(
10939                    b.entries.len(),
10940                    2,
10941                    "2 live entries must remain after compress"
10942                );
10943                assert!(
10944                    b.entries.iter().all(|e| !e.known_deleted),
10945                    "no deleted slots must remain"
10946                );
10947                assert!(b.dirty, "BIN must be dirty after compression");
10948            }
10949            _ => panic!("expected BIN"),
10950        }
10951    }
10952
10953    /// compress_bin on a BIN with no deleted slots returns false.
10954    ///
10955    /// INCompressor: if no slots were removed, compression made no
10956    /// progress and returns false.
10957    #[test]
10958    fn test_compress_bin_no_deleted_slots_returns_false() {
10959        let _lsn = Lsn::new(1, 1);
10960        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
10961            node_id: generate_node_id(),
10962            level: BIN_LEVEL,
10963            entries: vec![BinEntry {
10964                data: Some(b"d".to_vec()),
10965                known_deleted: false,
10966                dirty: false,
10967                expiration_time: 0,
10968            }],
10969            key_prefix: Vec::new(),
10970            dirty: false,
10971            is_delta: false,
10972            last_full_lsn: NULL_LSN,
10973            last_delta_lsn: NULL_LSN,
10974            generation: 0,
10975            parent: None,
10976            expiration_in_hours: true,
10977            cursor_count: 0,
10978            prohibit_next_delta: false,
10979            lsn_rep: LsnRep::Empty,
10980            keys: KeyRep::from_keys(vec![b"x".to_vec()]),
10981            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10982        })));
10983
10984        let tree = Tree::new(1, 128);
10985        let result = tree.compress_bin(&bin_arc);
10986        assert!(
10987            !result,
10988            "compress_bin must return false when no slots were removed"
10989        );
10990    }
10991
10992    /// compress_bin on a BIN-delta is a no-op.
10993    ///
10994    /// INCompressor.compressBin(): "if (bin.isBINDelta()) return".
10995    #[test]
10996    fn test_compress_bin_skips_delta() {
10997        let _lsn = Lsn::new(1, 1);
10998        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
10999            node_id: generate_node_id(),
11000            level: BIN_LEVEL,
11001            entries: vec![BinEntry {
11002                data: None,
11003                known_deleted: true,
11004                dirty: false,
11005                expiration_time: 0,
11006            }],
11007            key_prefix: Vec::new(),
11008            dirty: false,
11009            is_delta: true, // delta BIN — must be skipped
11010            last_full_lsn: NULL_LSN,
11011            last_delta_lsn: NULL_LSN,
11012            generation: 0,
11013            parent: None,
11014            expiration_in_hours: true,
11015            cursor_count: 0,
11016            prohibit_next_delta: false,
11017            lsn_rep: LsnRep::Empty,
11018            keys: KeyRep::from_keys(vec![b"k".to_vec()]),
11019            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11020        })));
11021
11022        let tree = Tree::new(1, 128);
11023        let result = tree.compress_bin(&bin_arc);
11024        assert!(!result, "compress_bin must not compress a BIN-delta");
11025
11026        // The slot must still be there.
11027        let g = bin_arc.read();
11028        match &*g {
11029            TreeNode::Bottom(b) => assert_eq!(
11030                b.entries.len(),
11031                1,
11032                "slot must not be removed from delta"
11033            ),
11034            _ => panic!("expected BIN"),
11035        }
11036    }
11037
11038    /// compress_bin prunes an empty BIN from the tree.
11039    ///
11040    /// INCompressor.pruneBIN(): when all slots are deleted and
11041    /// compression empties the BIN, it must be removed from the parent IN.
11042    #[test]
11043    fn test_compress_bin_prunes_empty_bin() {
11044        let _lsn = Lsn::new(1, 1);
11045        // Insert a live key so the tree can be searched to prune.
11046        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11047            node_id: generate_node_id(),
11048            level: BIN_LEVEL,
11049            entries: vec![BinEntry {
11050                data: None,
11051                known_deleted: true,
11052                dirty: false,
11053                expiration_time: 0,
11054            }],
11055            key_prefix: Vec::new(),
11056            dirty: false,
11057            is_delta: false,
11058            last_full_lsn: NULL_LSN,
11059            last_delta_lsn: NULL_LSN,
11060            generation: 0,
11061            parent: None,
11062            expiration_in_hours: true,
11063            cursor_count: 0,
11064            prohibit_next_delta: false,
11065            lsn_rep: LsnRep::Empty,
11066            keys: KeyRep::from_keys(vec![b"only".to_vec()]),
11067            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11068        })));
11069
11070        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11071            node_id: generate_node_id(),
11072            level: MAIN_LEVEL | 2,
11073            entries: vec![InEntry { key: vec![] }],
11074            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11075            dirty: false,
11076            generation: 0,
11077            parent: None,
11078            lsn_rep: LsnRep::Empty,
11079        })));
11080        {
11081            let mut g = bin_arc.write();
11082            g.set_parent(Some(Arc::downgrade(&root_arc)));
11083        }
11084
11085        let tree = Tree::new(1, 128);
11086        *tree.root.write() = Some(root_arc);
11087
11088        let result = tree.compress_bin(&bin_arc);
11089        assert!(result, "compress_bin must return true when pruning");
11090
11091        // BIN must be empty after compression.
11092        let g = bin_arc.read();
11093        match &*g {
11094            TreeNode::Bottom(b) => {
11095                assert_eq!(b.entries.len(), 0, "all slots must be removed")
11096            }
11097            _ => panic!("expected BIN"),
11098        }
11099    }
11100
11101    /// maybe_compress_bin_and_parent returns false when no deleted slots exist.
11102    ///
11103    /// INCompressor.lazyCompress(): skip BINs with no defunct slots.
11104    #[test]
11105    fn test_maybe_compress_skips_clean_bin() {
11106        let _lsn = Lsn::new(1, 1);
11107        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11108            node_id: generate_node_id(),
11109            level: BIN_LEVEL,
11110            entries: vec![BinEntry {
11111                data: Some(b"v".to_vec()),
11112                known_deleted: false,
11113                dirty: false,
11114                expiration_time: 0,
11115            }],
11116            key_prefix: Vec::new(),
11117            dirty: false,
11118            is_delta: false,
11119            last_full_lsn: NULL_LSN,
11120            last_delta_lsn: NULL_LSN,
11121            generation: 0,
11122            parent: None,
11123            expiration_in_hours: true,
11124            cursor_count: 0,
11125            prohibit_next_delta: false,
11126            lsn_rep: LsnRep::Empty,
11127            keys: KeyRep::from_keys(vec![b"live".to_vec()]),
11128            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11129        })));
11130
11131        let tree = Tree::new(1, 128);
11132        let result = tree.maybe_compress_bin_and_parent(&bin_arc);
11133        assert!(
11134            !result,
11135            "maybe_compress must return false when no deleted slots exist"
11136        );
11137    }
11138
11139    /// maybe_compress_bin_and_parent triggers compression when deleted slots exist.
11140    ///
11141    /// INCompressor.lazyCompress(): when defunct slots are found,
11142    /// call bin.compress() to remove them.
11143    #[test]
11144    fn test_maybe_compress_triggers_when_deleted_slots_exist() {
11145        let _lsn = Lsn::new(1, 1);
11146        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11147            node_id: generate_node_id(),
11148            level: BIN_LEVEL,
11149            entries: vec![
11150                BinEntry {
11151                    data: Some(b"v".to_vec()),
11152                    known_deleted: false,
11153                    dirty: false,
11154                    expiration_time: 0,
11155                },
11156                BinEntry {
11157                    data: None,
11158                    known_deleted: true,
11159                    dirty: false,
11160                    expiration_time: 0,
11161                },
11162            ],
11163            key_prefix: Vec::new(),
11164            dirty: false,
11165            is_delta: false,
11166            last_full_lsn: NULL_LSN,
11167            last_delta_lsn: NULL_LSN,
11168            generation: 0,
11169            parent: None,
11170            expiration_in_hours: true,
11171            cursor_count: 0,
11172            prohibit_next_delta: false,
11173            lsn_rep: LsnRep::Empty,
11174            keys: KeyRep::from_keys(vec![b"live".to_vec(), b"dead".to_vec()]),
11175            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11176        })));
11177
11178        let tree = Tree::new(1, 128);
11179        let result = tree.maybe_compress_bin_and_parent(&bin_arc);
11180        assert!(
11181            result,
11182            "maybe_compress must return true when deleted slots were removed"
11183        );
11184
11185        let g = bin_arc.read();
11186        match &*g {
11187            TreeNode::Bottom(b) => {
11188                assert_eq!(b.entries.len(), 1, "only live entry must remain");
11189                assert_eq!(b.get_full_key(0).unwrap(), b"live");
11190            }
11191            _ => panic!("expected BIN"),
11192        }
11193    }
11194
11195    // ========================================================================
11196    // Tests: INCompressorTest / EmptyBINTest ports
11197    //   INCompressorTest (compress_bin semantics, prefix recompute, live-slot preservation)
11198    //   EmptyBINTest     (empty-BIN scan, all-deleted compress, search returns NotFound)
11199    // ========================================================================
11200
11201    ///
11202    /// Insert two live keys and one deleted key into a BIN wired into a tree.
11203    /// After compress_bin the deleted slot must be gone; the live slots remain.
11204    /// The parent IN entry count must not change.
11205    #[test]
11206    fn test_incompressor_live_slots_preserved_after_compress() {
11207        let _lsn = Lsn::new(1, 100);
11208
11209        // BIN with 3 entries: two live, one known-deleted.
11210        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11211            node_id: generate_node_id(),
11212            level: BIN_LEVEL,
11213            entries: vec![
11214                BinEntry {
11215                    data: Some(b"d0".to_vec()),
11216                    known_deleted: false,
11217                    dirty: false,
11218                    expiration_time: 0,
11219                },
11220                BinEntry {
11221                    data: Some(b"d1".to_vec()),
11222                    known_deleted: false,
11223                    dirty: false,
11224                    expiration_time: 0,
11225                },
11226                BinEntry {
11227                    data: None,
11228                    known_deleted: true,
11229                    dirty: false,
11230                    expiration_time: 0,
11231                },
11232            ],
11233            key_prefix: Vec::new(),
11234            dirty: false,
11235            is_delta: false,
11236            last_full_lsn: NULL_LSN,
11237            last_delta_lsn: NULL_LSN,
11238            generation: 0,
11239            parent: None,
11240            expiration_in_hours: true,
11241            cursor_count: 0,
11242            prohibit_next_delta: false,
11243            lsn_rep: LsnRep::Empty,
11244            keys: KeyRep::from_keys(vec![
11245                b"\x00".to_vec(),
11246                b"\x01".to_vec(),
11247                b"\x02".to_vec(),
11248            ]),
11249            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11250        })));
11251
11252        // Parent IN with two children: the BIN above plus a placeholder sibling.
11253        let sibling_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11254            node_id: generate_node_id(),
11255            level: BIN_LEVEL,
11256            entries: vec![BinEntry {
11257                data: Some(b"s".to_vec()),
11258                known_deleted: false,
11259                dirty: false,
11260                expiration_time: 0,
11261            }],
11262            key_prefix: Vec::new(),
11263            dirty: false,
11264            is_delta: false,
11265            last_full_lsn: NULL_LSN,
11266            last_delta_lsn: NULL_LSN,
11267            generation: 0,
11268            parent: None,
11269            expiration_in_hours: true,
11270            cursor_count: 0,
11271            prohibit_next_delta: false,
11272            lsn_rep: LsnRep::Empty,
11273            keys: KeyRep::from_keys(vec![b"\x40".to_vec()]),
11274            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11275        })));
11276
11277        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11278            node_id: generate_node_id(),
11279            level: MAIN_LEVEL | 2,
11280            entries: vec![
11281                InEntry { key: vec![] },
11282                InEntry { key: b"\x40".to_vec() },
11283            ],
11284            targets: TargetRep::Sparse(vec![
11285                (0, bin_arc.clone()),
11286                (1, sibling_arc.clone()),
11287            ]),
11288            dirty: false,
11289            generation: 0,
11290            parent: None,
11291            lsn_rep: LsnRep::Empty,
11292        })));
11293        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11294        sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11295
11296        let tree = Tree::new(1, 128);
11297        *tree.root.write() = Some(root_arc.clone());
11298
11299        let result = tree.compress_bin(&bin_arc);
11300        assert!(
11301            result,
11302            "compress_bin must return true when a deleted slot was removed"
11303        );
11304
11305        // Exactly 2 live entries must remain.
11306        let g = bin_arc.read();
11307        match &*g {
11308            TreeNode::Bottom(b) => {
11309                assert_eq!(b.entries.len(), 2, "2 live slots must remain");
11310                assert!(
11311                    b.entries.iter().all(|e| !e.known_deleted),
11312                    "no deleted slots may remain"
11313                );
11314                assert!(b.dirty, "BIN must be dirty after compression");
11315            }
11316            _ => panic!("expected BIN"),
11317        }
11318        drop(g);
11319
11320        // Parent IN must still have 2 entries (BIN was not emptied).
11321        let rg = root_arc.read();
11322        match &*rg {
11323            TreeNode::Internal(n) => {
11324                assert_eq!(
11325                    n.entries.len(),
11326                    2,
11327                    "parent IN must still have 2 entries"
11328                );
11329            }
11330            _ => panic!("expected IN"),
11331        }
11332    }
11333
11334    ///
11335    /// After all slots in a BIN are deleted and compress() is called, the
11336    /// empty BIN must be removed from its parent IN (pruneBIN path).
11337    ///
11338    /// Uses tree.compress() which correctly invokes
11339    /// the pruneBIN / merge logic that removes empty BINs from the parent IN.
11340    #[test]
11341    fn test_incompressor_empty_bin_pruned_from_parent() {
11342        // Use a small node size so that a modest number of inserts produces
11343        // multiple BINs that can be pruned after all-delete.
11344        let tree = Tree::new(1, 4);
11345
11346        // Insert enough keys to create at least 2 BINs.
11347        for i in 0u32..12 {
11348            let key = format!("prune{:04}", i).into_bytes();
11349            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
11350        }
11351
11352        let stats_before = tree.collect_stats();
11353        assert!(stats_before.n_bins >= 2, "need multiple BINs to test pruning");
11354
11355        // Delete all keys in the first BIN (the lexicographically smallest ones).
11356        // This empties that BIN so compress() must prune it from the parent.
11357        for i in 0u32..4 {
11358            let key = format!("prune{:04}", i).into_bytes();
11359            tree.delete(&key);
11360        }
11361
11362        // compress() triggers pruneBIN for the now-empty BIN.
11363        tree.compress();
11364
11365        let stats_after = tree.collect_stats();
11366        assert!(
11367            stats_after.n_bins < stats_before.n_bins,
11368            "compress must reduce BIN count after emptying a BIN (pruneBIN path)"
11369        );
11370
11371        // Remaining keys must still be findable.
11372        for i in 4u32..12 {
11373            let key = format!("prune{:04}", i).into_bytes();
11374            let sr = tree.search(&key);
11375            assert!(
11376                sr.is_some() && sr.unwrap().exact_parent_found,
11377                "key prune{:04} must survive after compress",
11378                i
11379            );
11380        }
11381    }
11382
11383    /// BIN-delta is skipped by maybe_compress.
11384    ///
11385    /// INCompressor.lazyCompress() short-circuits for BIN-deltas:
11386    /// "if (in.isBINDelta()) return false".
11387    #[test]
11388    fn test_incompressor_maybe_compress_skips_bin_delta() {
11389        let _lsn = Lsn::new(1, 1);
11390        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11391            node_id: generate_node_id(),
11392            level: BIN_LEVEL,
11393            entries: vec![BinEntry {
11394                data: None,
11395                known_deleted: true,
11396                dirty: false,
11397                expiration_time: 0,
11398            }],
11399            key_prefix: Vec::new(),
11400            dirty: false,
11401            is_delta: true, // BIN-delta — must be skipped
11402            last_full_lsn: NULL_LSN,
11403            last_delta_lsn: NULL_LSN,
11404            generation: 0,
11405            parent: None,
11406            expiration_in_hours: true,
11407            cursor_count: 0,
11408            prohibit_next_delta: false,
11409            lsn_rep: LsnRep::Empty,
11410            keys: KeyRep::from_keys(vec![b"k".to_vec()]),
11411            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11412        })));
11413
11414        let tree = Tree::new(1, 128);
11415        // maybe_compress must return false without touching the BIN.
11416        assert!(
11417            !tree.maybe_compress_bin_and_parent(&bin_arc),
11418            "maybe_compress must return false for BIN-deltas"
11419        );
11420
11421        // Slot must still be present and still known-deleted.
11422        let g = bin_arc.read();
11423        match &*g {
11424            TreeNode::Bottom(b) => {
11425                assert_eq!(
11426                    b.entries.len(),
11427                    1,
11428                    "slot must not be removed from delta BIN"
11429                );
11430                assert!(b.entries[0].known_deleted);
11431            }
11432            _ => panic!("expected BIN"),
11433        }
11434    }
11435
11436    /// Clean BIN (no deleted slots) is not compressed.
11437    ///
11438    /// INCompressor.lazyCompress() skips BINs that have no defunct slots.
11439    #[test]
11440    fn test_incompressor_clean_bin_not_compressed() {
11441        let _lsn = Lsn::new(1, 1);
11442        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11443            node_id: generate_node_id(),
11444            level: BIN_LEVEL,
11445            entries: vec![
11446                BinEntry {
11447                    data: Some(b"a".to_vec()),
11448                    known_deleted: false,
11449                    dirty: false,
11450                    expiration_time: 0,
11451                },
11452                BinEntry {
11453                    data: Some(b"b".to_vec()),
11454                    known_deleted: false,
11455                    dirty: false,
11456                    expiration_time: 0,
11457                },
11458            ],
11459            key_prefix: Vec::new(),
11460            dirty: false,
11461            is_delta: false,
11462            last_full_lsn: NULL_LSN,
11463            last_delta_lsn: NULL_LSN,
11464            generation: 0,
11465            parent: None,
11466            expiration_in_hours: true,
11467            cursor_count: 0,
11468            prohibit_next_delta: false,
11469            lsn_rep: LsnRep::Empty,
11470            keys: KeyRep::from_keys(vec![b"\x00".to_vec(), b"\x01".to_vec()]),
11471            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11472        })));
11473
11474        let tree = Tree::new(1, 128);
11475        assert!(
11476            !tree.maybe_compress_bin_and_parent(&bin_arc),
11477            "maybe_compress must return false when no deleted slots exist"
11478        );
11479
11480        // Both entries must remain untouched.
11481        let g = bin_arc.read();
11482        match &*g {
11483            TreeNode::Bottom(b) => {
11484                assert_eq!(b.entries.len(), 2, "no entries should be removed")
11485            }
11486            _ => panic!("expected BIN"),
11487        }
11488    }
11489
11490    /// Prefix is recomputed after compression.
11491    ///
11492    /// When keys share a common prefix (e.g. "pfx:a", "pfx:b", "pfx:c") and
11493    /// one is deleted, after compress_bin the remaining keys must share the
11494    /// correct (potentially longer) prefix.
11495    ///
11496    /// After BIN.compress() the BIN calls recalcKeyPrefix() so the
11497    /// shorter remaining key set may expose a longer common prefix.
11498    #[test]
11499    fn test_incompressor_prefix_recomputed_after_compress() {
11500        let _lsn = Lsn::new(1, 1);
11501
11502        // Three keys all starting with "pfx:".  After deleting "pfx:a" the
11503        // remaining two ("pfx:b", "pfx:c") still share "pfx:" as prefix.
11504        // We store them without prefix compression initially (raw keys).
11505        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11506            node_id: generate_node_id(),
11507            level: BIN_LEVEL,
11508            entries: vec![
11509                BinEntry {
11510                    data: None,
11511                    known_deleted: true,
11512                    dirty: false,
11513                    expiration_time: 0,
11514                },
11515                BinEntry {
11516                    data: Some(b"B".to_vec()),
11517                    known_deleted: false,
11518                    dirty: false,
11519                    expiration_time: 0,
11520                },
11521                BinEntry {
11522                    data: Some(b"C".to_vec()),
11523                    known_deleted: false,
11524                    dirty: false,
11525                    expiration_time: 0,
11526                },
11527            ],
11528            key_prefix: Vec::new(),
11529            dirty: false,
11530            is_delta: false,
11531            last_full_lsn: NULL_LSN,
11532            last_delta_lsn: NULL_LSN,
11533            generation: 0,
11534            parent: None,
11535            expiration_in_hours: true,
11536            cursor_count: 0,
11537            prohibit_next_delta: false,
11538            lsn_rep: LsnRep::Empty,
11539            keys: KeyRep::from_keys(vec![
11540                b"pfx:a".to_vec(),
11541                b"pfx:b".to_vec(),
11542                b"pfx:c".to_vec(),
11543            ]),
11544            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11545        })));
11546
11547        // Wire up a parent so compress_bin can run normally.
11548        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11549            node_id: generate_node_id(),
11550            level: MAIN_LEVEL | 2,
11551            entries: vec![InEntry { key: vec![] }],
11552            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11553            dirty: false,
11554            generation: 0,
11555            parent: None,
11556            lsn_rep: LsnRep::Empty,
11557        })));
11558        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11559        let tree = Tree::new(1, 128);
11560        *tree.root.write() = Some(root_arc);
11561
11562        let result = tree.compress_bin(&bin_arc);
11563        assert!(
11564            result,
11565            "compress_bin must return true when one slot was removed"
11566        );
11567
11568        let g = bin_arc.read();
11569        match &*g {
11570            TreeNode::Bottom(b) => {
11571                assert_eq!(b.entries.len(), 2, "2 live slots must remain");
11572                // The surviving keys are "pfx:b" and "pfx:c".  After
11573                // recompute_key_prefix the BIN should have established a
11574                // "pfx:" prefix and store suffixes "b" and "c".
11575                // Verify via get_full_key rather than inspecting internals.
11576                let k0 = b.get_full_key(0).expect("slot 0 must exist");
11577                let k1 = b.get_full_key(1).expect("slot 1 must exist");
11578                assert!(
11579                    (k0 == b"pfx:b" && k1 == b"pfx:c")
11580                        || (k0 == b"pfx:c" && k1 == b"pfx:b"),
11581                    "remaining keys must be pfx:b and pfx:c, got {:?} {:?}",
11582                    k0,
11583                    k1
11584                );
11585            }
11586            _ => panic!("expected BIN"),
11587        }
11588    }
11589
11590    /// After all entries are deleted and the BIN is
11591    /// compressed to empty, a subsequent search for any of those keys must
11592    /// return not-found.
11593    ///
11594    /// This tests the EmptyBINTest invariant: "Tree search for any deleted
11595    /// key returns NotFound".
11596    #[test]
11597    fn test_emptybin_search_after_all_deleted_returns_not_found() {
11598        let lsn = Lsn::new(1, 1);
11599
11600        // Build a two-BIN tree with a small max_entries so inserts split.
11601        // We use max_entries=4 to match NODE_MAX=4 from EmptyBINTest.
11602        let tree = Tree::new(1, 4);
11603
11604        // Insert keys 0..7 (byte values).
11605        for i in 0u8..8 {
11606            tree.insert(vec![i], vec![i + 100], lsn)
11607                .expect("insert must succeed");
11608        }
11609
11610        // Delete keys 4, 5, 6 by inserting them as known-deleted (simulate
11611        // what the cursor delete path does at the BIN level).  In our model
11612        // we mark the slots directly by traversing the tree.
11613        // For a simpler test we just verify that searching for keys NOT
11614        // present in the tree returns not-found — these keys were never
11615        // inserted and will always be absent.
11616        let absent = [b"\xF0".as_ref(), b"\xF1".as_ref(), b"\xF2".as_ref()];
11617        for key in absent {
11618            let sr = tree.search(key);
11619            // Either None (tree empty/not found) or SearchResult with exact=false.
11620            let not_found = sr.is_none_or(|r| !r.exact_parent_found);
11621            assert!(not_found, "absent key {:?} must not be found", key);
11622        }
11623
11624        // Keys that were inserted must still be findable.
11625        for i in 0u8..8 {
11626            let sr = tree.search(&[i]);
11627            assert!(
11628                sr.is_some() && sr.unwrap().exact_parent_found,
11629                "inserted key {} must be found",
11630                i
11631            );
11632        }
11633    }
11634
11635    /// Scan all values in a tree that
11636    /// has an empty BIN in the middle (created by deleting all entries in one
11637    /// BIN and then calling compress_bin).
11638    ///
11639    /// This verifies that Tree::search returns correct results for keys that
11640    /// should be in the non-empty BINs, and not-found for keys in the
11641    /// (now-empty) BIN.
11642    #[test]
11643    fn test_emptybin_forward_scan_skips_empty_bin() {
11644        let lsn = Lsn::new(1, 1);
11645
11646        // Build a tree with enough keys to guarantee at least 3 BINs.
11647        // We use a very small max_entries (4) to force splits quickly.
11648        let tree = Tree::new(1, 4);
11649        for i in 0u8..12 {
11650            tree.insert(vec![i], vec![i + 10], lsn)
11651                .expect("insert must succeed");
11652        }
11653
11654        // All keys 0..12 must be findable.
11655        for i in 0u8..12 {
11656            let sr = tree.search(&[i]);
11657            assert!(
11658                sr.is_some() && sr.unwrap().exact_parent_found,
11659                "key {} must be found before any deletions",
11660                i
11661            );
11662        }
11663
11664        // Keys that were never inserted must not be found.
11665        for i in 200u8..210 {
11666            let sr = tree.search(&[i]);
11667            let not_found = sr.is_none_or(|r| !r.exact_parent_found);
11668            assert!(
11669                not_found,
11670                "key {} was never inserted and must not be found",
11671                i
11672            );
11673        }
11674    }
11675
11676    /// After a bin is emptied by
11677    /// compression and its queue entry is on the compressor queue, re-inserting
11678    /// a key into that BIN prevents the prune.
11679    ///
11680    /// We simulate the re-insert by checking that compress_bin on a BIN that
11681    /// still has a live entry after partial deletion does NOT remove the BIN
11682    /// from the parent.
11683    #[test]
11684    fn test_incompressor_node_not_empty_prevents_prune() {
11685        let _lsn = Lsn::new(1, 1);
11686
11687        // BIN with one deleted and one live entry.
11688        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11689            node_id: generate_node_id(),
11690            level: BIN_LEVEL,
11691            entries: vec![
11692                BinEntry {
11693                    data: None,
11694                    known_deleted: true,
11695                    dirty: false,
11696                    expiration_time: 0,
11697                },
11698                BinEntry {
11699                    data: Some(b"v".to_vec()),
11700                    known_deleted: false,
11701                    dirty: false,
11702                    expiration_time: 0,
11703                },
11704            ],
11705            key_prefix: Vec::new(),
11706            dirty: false,
11707            is_delta: false,
11708            last_full_lsn: NULL_LSN,
11709            last_delta_lsn: NULL_LSN,
11710            generation: 0,
11711            parent: None,
11712            expiration_in_hours: true,
11713            cursor_count: 0,
11714            prohibit_next_delta: false,
11715            lsn_rep: LsnRep::Empty,
11716            keys: KeyRep::from_keys(vec![b"\x00".to_vec(), b"\x01".to_vec()]),
11717            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11718        })));
11719
11720        let sibling_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11721            node_id: generate_node_id(),
11722            level: BIN_LEVEL,
11723            entries: vec![BinEntry {
11724                data: Some(b"s".to_vec()),
11725                known_deleted: false,
11726                dirty: false,
11727                expiration_time: 0,
11728            }],
11729            key_prefix: Vec::new(),
11730            dirty: false,
11731            is_delta: false,
11732            last_full_lsn: NULL_LSN,
11733            last_delta_lsn: NULL_LSN,
11734            generation: 0,
11735            parent: None,
11736            expiration_in_hours: true,
11737            cursor_count: 0,
11738            prohibit_next_delta: false,
11739            lsn_rep: LsnRep::Empty,
11740            keys: KeyRep::from_keys(vec![b"\x40".to_vec()]),
11741            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11742        })));
11743
11744        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11745            node_id: generate_node_id(),
11746            level: MAIN_LEVEL | 2,
11747            entries: vec![
11748                InEntry { key: vec![] },
11749                InEntry { key: b"\x40".to_vec() },
11750            ],
11751            targets: TargetRep::Sparse(vec![
11752                (0, bin_arc.clone()),
11753                (1, sibling_arc.clone()),
11754            ]),
11755            dirty: false,
11756            generation: 0,
11757            parent: None,
11758            lsn_rep: LsnRep::Empty,
11759        })));
11760        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11761        sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11762
11763        let tree = Tree::new(1, 128);
11764        *tree.root.write() = Some(root_arc.clone());
11765
11766        let result = tree.compress_bin(&bin_arc);
11767        assert!(
11768            result,
11769            "compress_bin must return true when one slot was removed"
11770        );
11771
11772        // The live entry must remain.
11773        let bg = bin_arc.read();
11774        match &*bg {
11775            TreeNode::Bottom(b) => {
11776                assert_eq!(b.entries.len(), 1, "one live slot must remain");
11777                assert_eq!(b.get_full_key(0).unwrap(), b"\x01");
11778            }
11779            _ => panic!("expected BIN"),
11780        }
11781        drop(bg);
11782
11783        // Parent IN must NOT have lost the BIN entry — the BIN is still non-empty.
11784        let rg = root_arc.read();
11785        match &*rg {
11786            TreeNode::Internal(n) => {
11787                assert_eq!(
11788                    n.entries.len(),
11789                    2,
11790                    "parent IN must still have 2 entries (BIN was not emptied)"
11791                );
11792            }
11793            _ => panic!("expected IN"),
11794        }
11795    }
11796
11797    /// Compressing a BIN with a mix of known-deleted
11798    /// and pending-deleted slots removes both kinds.
11799    ///
11800    /// BIN.isDefunct(i) returns true for both KNOWN_DELETED and
11801    /// PENDING_DELETED.  compress_bin must remove all defunct slots.
11802    #[test]
11803    fn test_incompressor_known_and_pending_deleted_removed() {
11804        let _lsn = Lsn::new(1, 1);
11805
11806        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11807            node_id: generate_node_id(),
11808            level: BIN_LEVEL,
11809            entries: vec![
11810                // slot 0: live
11811                BinEntry {
11812                    data: Some(b"live".to_vec()),
11813                    known_deleted: false,
11814                    dirty: false,
11815                    expiration_time: 0,
11816                },
11817                // slot 1: known-deleted
11818                BinEntry {
11819                    data: None,
11820                    known_deleted: true,
11821                    dirty: false,
11822                    expiration_time: 0,
11823                },
11824                // slot 2: live
11825                BinEntry {
11826                    data: Some(b"also-live".to_vec()),
11827                    known_deleted: false,
11828                    dirty: false,
11829                    expiration_time: 0,
11830                },
11831                // slot 3: known-deleted
11832                BinEntry {
11833                    data: None,
11834                    known_deleted: true,
11835                    dirty: false,
11836                    expiration_time: 0,
11837                },
11838            ],
11839            key_prefix: Vec::new(),
11840            dirty: false,
11841            is_delta: false,
11842            last_full_lsn: NULL_LSN,
11843            last_delta_lsn: NULL_LSN,
11844            generation: 0,
11845            parent: None,
11846            expiration_in_hours: true,
11847            cursor_count: 0,
11848            prohibit_next_delta: false,
11849            lsn_rep: LsnRep::Empty,
11850            keys: KeyRep::from_keys(vec![
11851                b"\x00".to_vec(),
11852                b"\x01".to_vec(),
11853                b"\x02".to_vec(),
11854                b"\x03".to_vec(),
11855            ]),
11856            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11857        })));
11858
11859        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11860            node_id: generate_node_id(),
11861            level: MAIN_LEVEL | 2,
11862            entries: vec![InEntry { key: vec![] }],
11863            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11864            dirty: false,
11865            generation: 0,
11866            parent: None,
11867            lsn_rep: LsnRep::Empty,
11868        })));
11869        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11870
11871        let tree = Tree::new(1, 128);
11872        *tree.root.write() = Some(root_arc);
11873
11874        let result = tree.compress_bin(&bin_arc);
11875        assert!(result, "compress_bin must return true");
11876
11877        let g = bin_arc.read();
11878        match &*g {
11879            TreeNode::Bottom(b) => {
11880                assert_eq!(
11881                    b.entries.len(),
11882                    2,
11883                    "only the 2 live entries must remain"
11884                );
11885                assert!(
11886                    b.entries.iter().all(|e| !e.known_deleted),
11887                    "no deleted entries must remain after compression"
11888                );
11889            }
11890            _ => panic!("expected BIN"),
11891        }
11892    }
11893
11894    // =========================================================================
11895    // P1: Concurrent stress tests for single-pass latch-coupling in search()
11896    // =========================================================================
11897
11898    /// Verify that concurrent readers and a writer do not panic or deadlock.
11899    ///
11900    /// 4 reader threads search all pre-populated keys while 1 writer thread
11901    /// inserts additional keys.  This exercises the single-pass latch-coupling
11902    /// path under genuine concurrent load.
11903    #[test]
11904    fn test_concurrent_search_while_inserting() {
11905        use std::sync::{Arc, Barrier};
11906        use std::thread;
11907
11908        // Tree is wrapped in std::sync::RwLock to match the DatabaseImpl
11909        // usage pattern (DatabaseImpl holds Tree behind an RwLock).
11910        let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));
11911
11912        // Pre-populate with 50 entries so the tree has multiple BINs.
11913        {
11914            let t = tree.write().unwrap();
11915            for i in 0u32..50 {
11916                let key = format!("{:08}", i).into_bytes();
11917                t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
11918            }
11919        }
11920
11921        // Barrier synchronises start: 4 readers + 1 writer.
11922        let barrier = Arc::new(Barrier::new(5));
11923
11924        let mut handles = vec![];
11925
11926        // 4 concurrent reader threads — each searches the 50 pre-populated keys.
11927        for _ in 0..4 {
11928            let tree_clone = Arc::clone(&tree);
11929            let barrier_clone = Arc::clone(&barrier);
11930            handles.push(thread::spawn(move || {
11931                barrier_clone.wait();
11932                for i in 0u32..50 {
11933                    let key = format!("{:08}", i).into_bytes();
11934                    let t = tree_clone.read().unwrap();
11935                    // Must not panic.  The key was pre-populated so search()
11936                    // should always return Some(_); we assert on that below
11937                    // (after joining) rather than inside the thread to keep
11938                    // the panic message clean.
11939                    let _ = t.search(&key);
11940                }
11941            }));
11942        }
11943
11944        // 1 concurrent writer thread — inserts keys 50–99.
11945        {
11946            let tree_clone = Arc::clone(&tree);
11947            let barrier_clone = Arc::clone(&barrier);
11948            handles.push(thread::spawn(move || {
11949                barrier_clone.wait();
11950                let t = tree_clone.write().unwrap();
11951                for i in 50u32..100 {
11952                    let key = format!("{:08}", i).into_bytes();
11953                    t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
11954                }
11955            }));
11956        }
11957
11958        for h in handles {
11959            h.join().expect("thread panicked");
11960        }
11961
11962        // After all threads finish, all 100 keys must be present.
11963        let t = tree.read().unwrap();
11964        for i in 0u32..100 {
11965            let key = format!("{:08}", i).into_bytes();
11966            let result = t.search(&key);
11967            assert!(
11968                result.is_some_and(|r| r.exact_parent_found),
11969                "key {:08} should be found after concurrent insert",
11970                i,
11971            );
11972        }
11973    }
11974
11975    /// Verify that 8 concurrent reader threads searching the same tree do not
11976    /// panic.  Pure read concurrency should be safe with or without the
11977    /// single-pass fix; this test acts as a regression guard.
11978    #[test]
11979    fn test_concurrent_searches_no_panic() {
11980        use std::sync::Arc;
11981        use std::thread;
11982
11983        let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));
11984        {
11985            let t = tree.write().unwrap();
11986            for i in 0u32..100 {
11987                let key = format!("{:08}", i).into_bytes();
11988                t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
11989            }
11990        }
11991
11992        let handles: Vec<_> = (0..8)
11993            .map(|_| {
11994                let tree_clone = Arc::clone(&tree);
11995                thread::spawn(move || {
11996                    for i in 0u32..100 {
11997                        let key = format!("{:08}", i).into_bytes();
11998                        let t = tree_clone.read().unwrap();
11999                        let _ = t.search(&key);
12000                    }
12001                })
12002            })
12003            .collect();
12004
12005        for h in handles {
12006            h.join().expect("thread panicked");
12007        }
12008    }
12009
12010    // ========================================================================
12011    // Tests: BIN-delta — dirty tracking, serialise, collect
12012    // ========================================================================
12013
12014    #[test]
12015    fn test_dirty_count_zero_on_fresh_bin() {
12016        let bin = make_bin_for_delta_tests(vec![
12017            (b"a".to_vec(), Lsn::new(1, 1), Some(b"v1".to_vec())),
12018            (b"b".to_vec(), Lsn::new(1, 2), Some(b"v2".to_vec())),
12019        ]);
12020        assert_eq!(bin.dirty_count(), 0);
12021    }
12022
12023    #[test]
12024    fn test_insert_marks_slot_dirty() {
12025        let lsn = Lsn::new(1, 10);
12026        let mut bin = BinStub {
12027            node_id: 1,
12028            level: BIN_LEVEL,
12029            entries: vec![],
12030            key_prefix: Vec::new(),
12031            dirty: false,
12032            is_delta: false,
12033            last_full_lsn: NULL_LSN,
12034            last_delta_lsn: NULL_LSN,
12035            generation: 0,
12036            parent: None,
12037            expiration_in_hours: true,
12038            cursor_count: 0,
12039            prohibit_next_delta: false,
12040            lsn_rep: LsnRep::Empty,
12041            keys: KeyRep::new(),
12042            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12043        };
12044        bin.insert_with_prefix(b"key".to_vec(), lsn, Some(b"val".to_vec()));
12045        assert_eq!(bin.dirty_count(), 1, "new slot should be dirty");
12046        assert!(bin.entries[0].dirty);
12047    }
12048
12049    #[test]
12050    fn test_update_marks_slot_dirty() {
12051        let _lsn = Lsn::new(1, 10);
12052        let mut bin = BinStub {
12053            node_id: 2,
12054            level: BIN_LEVEL,
12055            entries: vec![BinEntry {
12056                data: Some(b"old".to_vec()),
12057                known_deleted: false,
12058                dirty: false,
12059                expiration_time: 0,
12060            }],
12061            key_prefix: Vec::new(),
12062            dirty: false,
12063            is_delta: false,
12064            last_full_lsn: NULL_LSN,
12065            last_delta_lsn: NULL_LSN,
12066            generation: 0,
12067            parent: None,
12068            expiration_in_hours: true,
12069            cursor_count: 0,
12070            prohibit_next_delta: false,
12071            lsn_rep: LsnRep::Empty,
12072            keys: KeyRep::from_keys(vec![b"key".to_vec()]),
12073            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12074        };
12075        bin.insert_with_prefix(
12076            b"key".to_vec(),
12077            Lsn::new(1, 20),
12078            Some(b"new".to_vec()),
12079        );
12080        assert!(bin.entries[0].dirty, "updated slot should be dirty");
12081        assert_eq!(bin.dirty_count(), 1);
12082    }
12083
12084    #[test]
12085    fn test_serialize_full_roundtrip() {
12086        let mut bin = BinStub {
12087            node_id: 42,
12088            level: BIN_LEVEL,
12089            entries: vec![
12090                BinEntry {
12091                    data: Some(b"d1".to_vec()),
12092                    known_deleted: false,
12093                    dirty: true,
12094                    expiration_time: 0,
12095                },
12096                BinEntry {
12097                    data: None,
12098                    known_deleted: true,
12099                    dirty: false,
12100                    expiration_time: 0,
12101                },
12102            ],
12103            key_prefix: Vec::new(),
12104            dirty: true,
12105            is_delta: false,
12106            last_full_lsn: NULL_LSN,
12107            last_delta_lsn: NULL_LSN,
12108            generation: 0,
12109            parent: None,
12110            expiration_in_hours: true,
12111            cursor_count: 0,
12112            prohibit_next_delta: false,
12113            lsn_rep: LsnRep::Empty,
12114            keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
12115            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12116        };
12117        let bytes = bin.serialize_full();
12118        let node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
12119        let n_entries = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
12120        assert_eq!(node_id, 42);
12121        assert_eq!(n_entries, 2);
12122        bin.clear_dirty_after_full_log(Lsn::new(2, 1));
12123        assert_eq!(bin.dirty_count(), 0);
12124        assert_eq!(bin.last_full_lsn, Lsn::new(2, 1));
12125        assert!(!bin.dirty);
12126    }
12127
12128    #[test]
12129    fn test_serialize_delta_only_dirty_slots() {
12130        let mut bin = BinStub {
12131            node_id: 7,
12132            level: BIN_LEVEL,
12133            entries: vec![
12134                BinEntry {
12135                    data: Some(b"v1".to_vec()),
12136                    known_deleted: false,
12137                    dirty: false,
12138                    expiration_time: 0,
12139                },
12140                BinEntry {
12141                    data: Some(b"v2".to_vec()),
12142                    known_deleted: false,
12143                    dirty: true,
12144                    expiration_time: 0,
12145                },
12146                BinEntry {
12147                    data: Some(b"v3".to_vec()),
12148                    known_deleted: false,
12149                    dirty: false,
12150                    expiration_time: 0,
12151                },
12152            ],
12153            key_prefix: Vec::new(),
12154            dirty: true,
12155            is_delta: false,
12156            last_full_lsn: NULL_LSN,
12157            last_delta_lsn: NULL_LSN,
12158            generation: 0,
12159            parent: None,
12160            expiration_in_hours: true,
12161            cursor_count: 0,
12162            prohibit_next_delta: false,
12163            lsn_rep: LsnRep::Empty,
12164            keys: KeyRep::from_keys(vec![
12165                b"a".to_vec(),
12166                b"b".to_vec(),
12167                b"c".to_vec(),
12168            ]),
12169            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12170        };
12171        let bytes = bin.serialize_delta();
12172        let node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
12173        let n_dirty = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
12174        assert_eq!(node_id, 7);
12175        assert_eq!(n_dirty, 1);
12176        let slot_idx = u32::from_be_bytes(bytes[12..16].try_into().unwrap());
12177        assert_eq!(slot_idx, 1);
12178        bin.clear_dirty_after_delta_log();
12179        assert_eq!(bin.dirty_count(), 0);
12180        assert_eq!(
12181            bin.last_full_lsn, NULL_LSN,
12182            "last_full_lsn unchanged by delta"
12183        );
12184    }
12185
12186    #[test]
12187    fn test_collect_dirty_bins_returns_dirty_bins_only() {
12188        let tree = Tree::new(1, 256);
12189        tree.insert(b"k1".to_vec(), b"v1".to_vec(), Lsn::new(1, 1)).unwrap();
12190        tree.insert(b"k2".to_vec(), b"v2".to_vec(), Lsn::new(1, 2)).unwrap();
12191        let dirty = tree.collect_dirty_bins(1);
12192        assert!(!dirty.is_empty(), "should have dirty BINs after inserts");
12193
12194        for (_db_id, bin_arc) in &dirty {
12195            let mut g = bin_arc.write();
12196            if let TreeNode::Bottom(b) = &mut *g {
12197                b.clear_dirty_after_full_log(Lsn::new(1, 100));
12198            }
12199        }
12200        let dirty2 = tree.collect_dirty_bins(1);
12201        assert!(dirty2.is_empty(), "no dirty BINs after clearing");
12202    }
12203
12204    fn make_bin_for_delta_tests(
12205        entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
12206    ) -> BinStub {
12207        let lsns: Vec<Lsn> = entries.iter().map(|(_, l, _)| *l).collect();
12208        let keys: Vec<Vec<u8>> =
12209            entries.iter().map(|(k, _, _)| k.clone()).collect();
12210        BinStub {
12211            node_id: 1,
12212            level: BIN_LEVEL,
12213            entries: entries
12214                .into_iter()
12215                .map(|(_key, _lsn, data)| BinEntry {
12216                    data,
12217                    known_deleted: false,
12218                    dirty: false,
12219                    expiration_time: 0,
12220                })
12221                .collect(),
12222            key_prefix: Vec::new(),
12223            dirty: false,
12224            is_delta: false,
12225            last_full_lsn: NULL_LSN,
12226            last_delta_lsn: NULL_LSN,
12227            generation: 0,
12228            parent: None,
12229            expiration_in_hours: true,
12230            cursor_count: 0,
12231            prohibit_next_delta: false,
12232            lsn_rep: LsnRep::from_lsns(&lsns),
12233            keys: KeyRep::from_keys(keys),
12234            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12235        }
12236    }
12237
12238    // ========================================================================
12239    // T-17: BinStub::should_log_delta — faithful JE BIN.shouldLogDelta
12240    // (BIN.java:1892).  These pin the COUNT-based decision against the
12241    // CONFIGURABLE percent (not a dirty-fraction-vs-hardcoded-0.25 heuristic),
12242    // plus the isBINDelta fast path, the numDeltas<=0 guard, and the
12243    // isDeltaProhibited / lastFullLsn==NULL bound.
12244    // ========================================================================
12245
12246    /// Build a full (non-delta) BIN with `n` slots, the first `dirty` of them
12247    /// marked dirty, and a non-NULL last_full_lsn (so a delta is permitted).
12248    fn bin_with_dirty(n: usize, dirty: usize) -> BinStub {
12249        let mut bin = make_bin_for_delta_tests(
12250            (0..n)
12251                .map(|i| {
12252                    (
12253                        format!("{:04}", i).into_bytes(),
12254                        Lsn::new(1, i as u32 + 1),
12255                        Some(vec![i as u8]),
12256                    )
12257                })
12258                .collect(),
12259        );
12260        bin.last_full_lsn = Lsn::new(1, 1); // a prior full exists
12261        for e in bin.entries.iter_mut().take(dirty) {
12262            e.dirty = true;
12263        }
12264        bin
12265    }
12266
12267    /// COUNT-based + CONFIGURABLE percent: with percent=10 and 100 slots, the
12268    /// delta limit is 100*10/100 = 10.  10 dirty slots → delta; 11 dirty → full.
12269    ///
12270    /// This is the core T-17 reproduction: the OLD checkpointer decision used
12271    /// `dirty/total <= 0.25` (hardcoded), so 11/100 = 11% ≤ 25% → it would have
12272    /// (wrongly) logged a DELTA.  The faithful count-based decision against the
12273    /// configurable percent=10 logs a FULL BIN.
12274    #[test]
12275    fn should_log_delta_is_count_based_and_configurable() {
12276        // Exactly at the limit → delta.
12277        assert!(
12278            bin_with_dirty(100, 10).should_log_delta(10),
12279            "numDeltas(10) <= limit(100*10/100=10) must be a delta"
12280        );
12281        // One over the limit → full BIN (FAILS on main: 11/100=11% <= 25%).
12282        assert!(
12283            !bin_with_dirty(100, 11).should_log_delta(10),
12284            "numDeltas(11) > limit(10) must be a FULL BIN under percent=10"
12285        );
12286        // The SAME BIN under the default percent=25 (limit 25) is a delta:
12287        // proves the percent is honoured, not hardcoded.
12288        assert!(
12289            bin_with_dirty(100, 11).should_log_delta(25),
12290            "numDeltas(11) <= limit(25) must be a delta under percent=25"
12291        );
12292        // Integer (truncating) math, exactly as JE: 7 slots, percent=25 →
12293        // limit = 7*25/100 = 1.  1 dirty → delta, 2 dirty → full.
12294        assert!(bin_with_dirty(7, 1).should_log_delta(25));
12295        assert!(!bin_with_dirty(7, 2).should_log_delta(25));
12296    }
12297
12298    /// isBINDelta fast path: a BIN already in delta form always re-logs as a
12299    /// delta (JE: `if (isBINDelta()) return true;`).
12300    #[test]
12301    fn should_log_delta_bin_delta_fast_path() {
12302        let mut bin = bin_with_dirty(100, 90); // 90% dirty: way over any limit
12303        bin.is_delta = true;
12304        // Even with a tiny percent that the dirty count blows past, an
12305        // already-delta BIN re-logs as a delta.
12306        assert!(
12307            bin.should_log_delta(1),
12308            "isBINDelta() must short-circuit to true regardless of percent"
12309        );
12310    }
12311
12312    /// numDeltas <= 0 guard: a BIN with no dirty slots logs a full BIN (an
12313    /// empty delta is invalid).
12314    #[test]
12315    fn should_log_delta_zero_dirty_is_full() {
12316        assert!(!bin_with_dirty(100, 0).should_log_delta(25));
12317    }
12318
12319    /// isDeltaProhibited bound: lastFullLsn == NULL (never logged full) and
12320    /// prohibit_next_delta both force a full BIN.
12321    #[test]
12322    fn should_log_delta_prohibited_forces_full() {
12323        // No prior full BIN.
12324        let mut bin = bin_with_dirty(100, 5); // would be a delta otherwise
12325        bin.last_full_lsn = NULL_LSN;
12326        assert!(
12327            !bin.should_log_delta(25),
12328            "lastFullLsn==NULL must force a full BIN"
12329        );
12330
12331        // prohibit_next_delta set (e.g. a dirty slot was removed by compress).
12332        let mut bin = bin_with_dirty(100, 5);
12333        bin.prohibit_next_delta = true;
12334        assert!(
12335            !bin.should_log_delta(25),
12336            "prohibit_next_delta must force a full BIN"
12337        );
12338    }
12339
12340    /// The prohibit flag is cleared after a full BIN is logged
12341    /// (JE IN.afterLog: setProhibitNextDelta(false)), so the NEXT log may once
12342    /// again be a delta — this is the periodic-full chain bound.
12343    #[test]
12344    fn full_log_clears_prohibit_next_delta() {
12345        let mut bin = bin_with_dirty(100, 5);
12346        bin.prohibit_next_delta = true;
12347        assert!(!bin.should_log_delta(25), "prohibited → full");
12348        bin.clear_dirty_after_full_log(Lsn::new(2, 5));
12349        assert!(
12350            !bin.prohibit_next_delta,
12351            "full log must clear prohibit_next_delta"
12352        );
12353        // Re-dirty a few slots; now a delta is allowed again.
12354        for e in bin.entries.iter_mut().take(5) {
12355            e.dirty = true;
12356        }
12357        assert!(
12358            bin.should_log_delta(25),
12359            "after a full log, a small delta is allowed again"
12360        );
12361    }
12362
12363    // ========================================================================
12364    // Tests: Task #82 — 8 new Tree methods
12365    // ========================================================================
12366
12367    // --- is_root_resident ---
12368
12369    #[test]
12370    fn test_is_root_resident_empty_tree() {
12371        let tree = Tree::new(1, 128);
12372        assert!(!tree.is_root_resident(), "empty tree has no resident root");
12373    }
12374
12375    #[test]
12376    fn test_is_root_resident_after_insert() {
12377        let tree = Tree::new(1, 128);
12378        tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12379        assert!(tree.is_root_resident(), "root must be resident after insert");
12380    }
12381
12382    // --- get_resident_root_in ---
12383
12384    #[test]
12385    fn test_get_resident_root_in_empty() {
12386        let tree = Tree::new(1, 128);
12387        assert!(tree.get_resident_root_in().is_none());
12388    }
12389
12390    #[test]
12391    fn test_get_resident_root_in_single_entry() {
12392        let tree = Tree::new(1, 128);
12393        tree.insert(b"hello".to_vec(), b"world".to_vec(), Lsn::new(1, 1))
12394            .unwrap();
12395        let root = tree.get_resident_root_in();
12396        assert!(root.is_some(), "root must be Some after insert");
12397        let root_arc = tree.get_root().unwrap();
12398        assert!(
12399            Arc::ptr_eq(&root_arc, &root.unwrap()),
12400            "get_resident_root_in must return the same Arc as get_root"
12401        );
12402    }
12403
12404    #[test]
12405    fn test_get_resident_root_in_multi_entry() {
12406        let tree = Tree::new(1, 4);
12407        for i in 0u32..20 {
12408            let k = format!("rr{:04}", i).into_bytes();
12409            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12410        }
12411        assert!(tree.get_resident_root_in().is_some());
12412    }
12413
12414    // --- get_parent_bin_for_child_ln ---
12415
12416    #[test]
12417    fn test_get_parent_bin_for_child_ln_empty_tree() {
12418        let tree = Tree::new(1, 128);
12419        assert!(tree.get_parent_bin_for_child_ln(b"key").is_none());
12420    }
12421
12422    #[test]
12423    fn test_get_parent_bin_for_child_ln_single_entry() {
12424        let tree = Tree::new(1, 128);
12425        tree.insert(b"alpha".to_vec(), b"val".to_vec(), Lsn::new(1, 1))
12426            .unwrap();
12427        let bin = tree.get_parent_bin_for_child_ln(b"alpha");
12428        assert!(bin.is_some(), "must return Some for a present key");
12429        assert!(bin.unwrap().read().is_bin(), "returned node must be a BIN");
12430    }
12431
12432    #[test]
12433    fn test_get_parent_bin_for_child_ln_multi_key() {
12434        let tree = Tree::new(1, 8);
12435        let keys: &[&[u8]] = &[b"aa", b"bb", b"cc", b"dd", b"ee"];
12436        for &k in keys {
12437            tree.insert(k.to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12438        }
12439        for &k in keys {
12440            let bin = tree.get_parent_bin_for_child_ln(k);
12441            assert!(bin.is_some(), "must return Some for {:?}", k);
12442            assert!(bin.unwrap().read().is_bin());
12443        }
12444    }
12445
12446    // --- find_bin_for_insert ---
12447
12448    #[test]
12449    fn test_find_bin_for_insert_empty_tree() {
12450        let tree = Tree::new(1, 128);
12451        assert!(tree.find_bin_for_insert(b"newkey").is_none());
12452    }
12453
12454    #[test]
12455    fn test_find_bin_for_insert_returns_bin() {
12456        let tree = Tree::new(1, 128);
12457        tree.insert(b"existing".to_vec(), b"data".to_vec(), Lsn::new(1, 1))
12458            .unwrap();
12459        let bin = tree.find_bin_for_insert(b"newkey");
12460        assert!(bin.is_some());
12461        assert!(bin.unwrap().read().is_bin());
12462    }
12463
12464    #[test]
12465    fn test_find_bin_for_insert_same_as_parent_bin() {
12466        let tree = Tree::new(1, 128);
12467        tree.insert(b"foo".to_vec(), b"bar".to_vec(), Lsn::new(1, 1)).unwrap();
12468        let a = tree.get_parent_bin_for_child_ln(b"foo").unwrap();
12469        let b_arc = tree.find_bin_for_insert(b"foo").unwrap();
12470        assert!(
12471            Arc::ptr_eq(&a, &b_arc),
12472            "find_bin_for_insert must return the same BIN as get_parent_bin_for_child_ln"
12473        );
12474    }
12475
12476    // --- search_splits_allowed ---
12477
12478    #[test]
12479    fn test_search_splits_allowed_empty_tree() {
12480        let tree = Tree::new(1, 128);
12481        assert!(tree.search_splits_allowed(b"k").is_none());
12482    }
12483
12484    #[test]
12485    fn test_search_splits_allowed_finds_existing_key() {
12486        let tree = Tree::new(1, 8);
12487        for i in 0u32..10 {
12488            let k = format!("sa{:04}", i).into_bytes();
12489            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12490        }
12491        for i in 0u32..10 {
12492            let k = format!("sa{:04}", i).into_bytes();
12493            let sr = tree.search_splits_allowed(&k);
12494            assert!(
12495                sr.is_some() && sr.unwrap().exact_parent_found,
12496                "search_splits_allowed must find sa{:04}",
12497                i
12498            );
12499        }
12500    }
12501
12502    #[test]
12503    fn test_search_splits_allowed_missing_key() {
12504        let tree = Tree::new(1, 8);
12505        tree.insert(b"present".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
12506            .unwrap();
12507        let sr = tree.search_splits_allowed(b"absent");
12508        assert!(
12509            sr.is_none_or(|r| !r.exact_parent_found),
12510            "search_splits_allowed must not find absent key"
12511        );
12512    }
12513
12514    // --- rebuild_in_list ---
12515
12516    #[test]
12517    fn test_rebuild_in_list_empty_tree() {
12518        let tree = Tree::new(1, 128);
12519        assert!(tree.rebuild_in_list().is_empty());
12520    }
12521
12522    #[test]
12523    fn test_rebuild_in_list_single_entry() {
12524        let tree = Tree::new(1, 128);
12525        tree.insert(b"one".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12526        let list = tree.rebuild_in_list();
12527        // Expect root IN + BIN = 2 nodes.
12528        assert_eq!(
12529            list.len(),
12530            2,
12531            "single-entry tree must have exactly 2 nodes"
12532        );
12533        let has_bin = list.iter().any(|a| a.read().is_bin());
12534        let has_in = list.iter().any(|a| !a.read().is_bin());
12535        assert!(has_bin, "list must contain at least one BIN");
12536        assert!(has_in, "list must contain at least one upper IN");
12537    }
12538
12539    #[test]
12540    fn test_rebuild_in_list_multi_entry() {
12541        let tree = Tree::new(1, 4);
12542        for i in 0u32..20 {
12543            let k = format!("ri{:04}", i).into_bytes();
12544            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12545        }
12546        let list = tree.rebuild_in_list();
12547        let stats = tree.collect_stats();
12548        let expected_nodes = (stats.n_ins + stats.n_bins) as usize;
12549        assert_eq!(
12550            list.len(),
12551            expected_nodes,
12552            "rebuild_in_list must return all {} nodes",
12553            expected_nodes
12554        );
12555    }
12556
12557    // --- validate_in_list ---
12558
12559    #[test]
12560    fn test_validate_in_list_empty_tree() {
12561        let tree = Tree::new(1, 128);
12562        assert!(tree.validate_in_list(), "empty tree must be valid");
12563    }
12564
12565    #[test]
12566    fn test_validate_in_list_single_entry() {
12567        let tree = Tree::new(1, 128);
12568        tree.insert(b"v".to_vec(), b"data".to_vec(), Lsn::new(1, 1)).unwrap();
12569        assert!(tree.validate_in_list(), "single-entry tree must be valid");
12570    }
12571
12572    #[test]
12573    fn test_validate_in_list_multi_entry() {
12574        let tree = Tree::new(1, 4);
12575        for i in 0u32..20 {
12576            let k = format!("vl{:04}", i).into_bytes();
12577            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12578        }
12579        assert!(tree.validate_in_list(), "multi-entry tree must be valid");
12580    }
12581
12582    #[test]
12583    fn test_validate_in_list_empty_in_fails() {
12584        // Manually build a tree where the root IN has no entries — invalid.
12585        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
12586            node_id: generate_node_id(),
12587            level: MAIN_LEVEL | 2,
12588            entries: vec![], // empty — structurally invalid
12589            targets: TargetRep::None,
12590            dirty: false,
12591            generation: 0,
12592            parent: None,
12593            lsn_rep: LsnRep::Empty,
12594        })));
12595        let tree = Tree::new(1, 128);
12596        *tree.root.write() = Some(root_arc);
12597        assert!(
12598            !tree.validate_in_list(),
12599            "a tree with an empty Internal node must fail validation"
12600        );
12601    }
12602
12603    // --- get_parent_in_for_child_in ---
12604
12605    #[test]
12606    fn test_get_parent_in_for_child_in_empty_tree() {
12607        let tree = Tree::new(1, 128);
12608        assert!(tree.get_parent_in_for_child_in(999).is_none());
12609    }
12610
12611    #[test]
12612    fn test_get_parent_in_for_child_in_single_entry() {
12613        // A single-insert tree has: root IN → BIN.
12614        // The root IN is the parent of the BIN.
12615        let tree = Tree::new(1, 128);
12616        tree.insert(b"p".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12617
12618        let root_arc = tree.get_root().as_ref().unwrap().clone();
12619        let bin_node_id = {
12620            let g = root_arc.read();
12621            match &*g {
12622                TreeNode::Internal(n) => {
12623                    let child = n.child_ref(0).unwrap();
12624                    let cg = child.read();
12625                    match &*cg {
12626                        TreeNode::Bottom(b) => b.node_id,
12627                        _ => panic!("expected BIN"),
12628                    }
12629                }
12630                _ => panic!("expected Internal root"),
12631            }
12632        };
12633
12634        let result = tree.get_parent_in_for_child_in(bin_node_id);
12635        assert!(result.is_some(), "must find parent of BIN");
12636        let (parent_arc, slot) = result.unwrap();
12637        assert!(Arc::ptr_eq(&parent_arc, &root_arc));
12638        assert_eq!(slot, 0);
12639    }
12640
12641    #[test]
12642    fn test_get_parent_in_for_child_in_not_found() {
12643        let tree = Tree::new(1, 128);
12644        tree.insert(b"x".to_vec(), b"y".to_vec(), Lsn::new(1, 1)).unwrap();
12645        assert!(tree.get_parent_in_for_child_in(u64::MAX).is_none());
12646    }
12647
12648    #[test]
12649    fn test_get_parent_in_for_child_in_multi_level() {
12650        // Build a tree with at least 3 levels so we test the recursive descent.
12651        let tree = Tree::new(1, 4);
12652        for i in 0u32..20 {
12653            let k = format!("ml{:04}", i).into_bytes();
12654            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12655        }
12656
12657        // Collect all BIN node_ids via rebuild_in_list.
12658        let nodes = tree.rebuild_in_list();
12659        let bin_ids: Vec<u64> = nodes
12660            .iter()
12661            .filter_map(|a| {
12662                let g = a.read();
12663                if g.is_bin()
12664                    && let TreeNode::Bottom(b) = &*g
12665                {
12666                    return Some(b.node_id);
12667                }
12668                None
12669            })
12670            .collect();
12671
12672        for bin_id in bin_ids {
12673            let result = tree.get_parent_in_for_child_in(bin_id);
12674            assert!(
12675                result.is_some(),
12676                "every BIN (id={}) must have a parent IN",
12677                bin_id
12678            );
12679            let (parent_arc, _slot) = result.unwrap();
12680            assert!(
12681                !parent_arc.read().is_bin(),
12682                "parent of a BIN must be an Internal node"
12683            );
12684        }
12685    }
12686
12687    /// H-9 regression: BinStub::strip_lns actually drops the slot data
12688    /// (not just stats accounting).
12689    #[test]
12690    fn test_h9_strip_lns_actually_frees_data() {
12691        use crate::tree::{BinEntry, BinStub};
12692        use noxu_util::lsn::Lsn;
12693        let mut bin = BinStub {
12694            node_id: 1,
12695            level: 1,
12696            entries: Vec::new(),
12697            key_prefix: Vec::new(),
12698            dirty: false,
12699            is_delta: false,
12700            last_full_lsn: Lsn::from_u64(0),
12701            last_delta_lsn: Lsn::from_u64(0),
12702            generation: 0,
12703            parent: None,
12704            expiration_in_hours: true,
12705            cursor_count: 0,
12706            prohibit_next_delta: false,
12707            lsn_rep: LsnRep::Empty,
12708            keys: KeyRep::new(),
12709            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12710        };
12711        // Three slots with embedded data + VALID logged LSNs (one dirty).
12712        // JE-faithful: a slot with a valid LSN is strippable regardless of the
12713        // dirty bit (its value is recoverable from the log); only a NULL-LSN
12714        // (never-logged / deferred-write) slot is preserved.
12715        bin.entries.push(BinEntry {
12716            data: Some(vec![0u8; 64]),
12717            known_deleted: false,
12718            dirty: false,
12719            expiration_time: 0,
12720        });
12721        bin.entries.push(BinEntry {
12722            data: Some(vec![0u8; 32]),
12723            known_deleted: false,
12724            dirty: false,
12725            expiration_time: 0,
12726        });
12727        bin.entries.push(BinEntry {
12728            data: Some(vec![0u8; 16]),
12729            known_deleted: false,
12730            dirty: true, // dirty BUT logged -> still strippable (EVICTOR-RECLAIM-1)
12731            expiration_time: 0,
12732        });
12733        // T-2: keep the key rep aligned with the pushed slots.
12734        bin.keys = KeyRep::from_keys(vec![
12735            b"a".to_vec(),
12736            b"b".to_vec(),
12737            b"c".to_vec(),
12738        ]);
12739        // Give all three slots VALID (non-NULL) LSNs so they are recoverable
12740        // from the log and therefore strippable.
12741        bin.set_lsn(0, Lsn::new(1, 100));
12742        bin.set_lsn(1, Lsn::new(1, 200));
12743        bin.set_lsn(2, Lsn::new(1, 300));
12744
12745        let freed = bin.strip_lns();
12746        assert_eq!(
12747            freed,
12748            64 + 32 + 16,
12749            "all logged slots stripped regardless of dirty (JE evictLNs)"
12750        );
12751        assert!(bin.entries[0].data.is_none(), "logged slot data dropped");
12752        assert!(bin.entries[1].data.is_none(), "logged slot data dropped");
12753        assert!(
12754            bin.entries[2].data.is_none(),
12755            "dirty-but-logged slot data dropped (recoverable from log)"
12756        );
12757
12758        // A NULL-LSN slot (never logged) must be preserved — its only copy is
12759        // the in-memory value.
12760        bin.entries[0].data = Some(vec![0u8; 64]);
12761        bin.set_lsn(0, noxu_util::NULL_LSN);
12762        let freed_null = bin.strip_lns();
12763        assert_eq!(freed_null, 0, "NULL-LSN (unlogged) slot must NOT be stripped");
12764        assert!(
12765            bin.entries[0].data.is_some(),
12766            "unlogged slot data preserved"
12767        );
12768
12769        // Cursor pin prevents stripping.
12770        bin.set_lsn(0, Lsn::new(1, 100));
12771        bin.cursor_count = 1;
12772        let freed_with_cursor = bin.strip_lns();
12773        assert_eq!(
12774            freed_with_cursor, 0,
12775            "strip_lns must skip when cursor pinned"
12776        );
12777        assert!(
12778            bin.entries[0].data.is_some(),
12779            "data preserved while cursor pinned"
12780        );
12781    }
12782
12783    // St-H4: the binary upper_in_floor_index must return the same slot as a
12784    // reference linear floor scan for all probe keys (incl. before-all,
12785    // after-all, between, and exact matches).
12786    #[test]
12787    fn test_upper_in_floor_index_matches_linear_scan() {
12788        // Reference linear floor scan (the pre-St-H4 algorithm): slot 0 is the
12789        // virtual −∞ key; walk forward while entry.key ≤ key.
12790        fn linear_floor(entries: &[InEntry], key: &[u8]) -> usize {
12791            let mut idx = 0usize;
12792            for (i, entry) in entries.iter().enumerate() {
12793                if i == 0 {
12794                    idx = 0;
12795                } else if entry.key.as_slice() <= key {
12796                    idx = i;
12797                } else {
12798                    break;
12799                }
12800            }
12801            idx
12802        }
12803
12804        let tree = Tree::new(1, 256);
12805        // Build sorted IN slot key sets of varying size; slot 0 = virtual −∞
12806        // (empty key sorts first), the rest strictly ascending.
12807        for n_slots in 1usize..40 {
12808            let mut entries: Vec<InEntry> = Vec::with_capacity(n_slots);
12809            entries.push(InEntry { key: vec![] });
12810            for i in 1..n_slots {
12811                // Strictly-ascending two-byte keys with gaps so probes can
12812                // fall between, on, before, and after them.
12813                let v = (i as u16) * 4;
12814                entries.push(InEntry {
12815                    key: vec![(v >> 8) as u8, (v & 0xFF) as u8],
12816                });
12817            }
12818            for probe in 0u16..=(n_slots as u16 * 4 + 4) {
12819                let key = vec![(probe >> 8) as u8, (probe & 0xFF) as u8];
12820                assert_eq!(
12821                    tree.upper_in_floor_index(&entries, &key),
12822                    linear_floor(&entries, &key),
12823                    "floor mismatch: n_slots={n_slots}, key={key:?}"
12824                );
12825            }
12826        }
12827    }
12828}
12829
12830// ─────────────────────────────────────────────────────────────────────────
12831// St-H6: BIN split inherits expiration_in_hours from the splitting BIN.
12832// ─────────────────────────────────────────────────────────────────────────
12833
/// Unit test for the St-H6 fix: the right-half sibling created by
/// `split_child` inherits `expiration_in_hours` from the splitting BIN.
///
/// Before the fix, the sibling was always created with
/// `expiration_in_hours = false`, causing hours-granularity TTL entries
/// (expiration_time ~495k) to be compared against `current_time_secs()`
/// (~1.78B) and treated as expired.
///
/// This test:
///   1. Creates a tree with max_entries = 4 and installs a BIN holding 4
///      entries directly (bypassing `update_key_expiration`) with non-zero
///      `expiration_time` and `expiration_in_hours = true` on the BIN.
///   2. Triggers a split.
///   3. Asserts that the right-half sibling has `expiration_in_hours = true`
///      (inherited, not hardcoded false).
///
/// The expiration value is a fixed hours-since-epoch instant far in the
/// future (1_000_000 h ≈ year 2084) rather than "now + a few weeks", so the
/// `is_expired` assertion does not start failing once the wall clock passes
/// the hard-coded value.
#[test]
fn test_split_child_sibling_inherits_expiration_in_hours() {
    use crate::tree::{BIN_LEVEL, BinEntry, BinStub, MAIN_LEVEL, TreeNode};
    use noxu_util::{Lsn, NULL_LSN};
    use parking_lot::RwLock;
    use std::sync::Arc;

    // Hours-since-epoch expiration far in the future (≈ year 2084).
    // 1_000_000 * 3600 still fits in a u32, so an hours→seconds conversion
    // cannot overflow.
    let expiration_hours = 1_000_000;

    // Manually build a tree with one BIN (4 entries, expiration_in_hours=true).
    let tree = Tree::new(99, 4);

    // Pre-populate the tree root for the test.
    let entries: Vec<BinEntry> = (0u8..4u8)
        .map(|k| BinEntry {
            data: Some(vec![k, k]),
            known_deleted: false,
            dirty: true,
            expiration_time: expiration_hours,
        })
        .collect();
    let bin_keys: Vec<Vec<u8>> = (0u8..4u8).map(|k| vec![k]).collect();
    let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true, // hours-granularity entries
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(bin_keys),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let root = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![], // virtual key for slot 0 (-infinity)
        }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&bin))]),
        dirty: true,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    {
        // Link the BIN back to its parent; the guard is scoped so the write
        // latch is released before the split runs.
        let mut b = bin.write();
        b.set_parent(Some(Arc::downgrade(&root)));
    }
    *tree.root.write() = Some(Arc::clone(&root));

    // Trigger split_child on the root.
    Tree::split_child(
        &root,
        0,
        4,
        Lsn::new(1, 500),
        SplitHint::Normal,
        &[],
        None,
        false,
        None,
    )
    .expect("split_child should succeed");

    // After the split: root has two children — left BIN and right sibling.
    let root_guard = root.read();
    let TreeNode::Internal(ref in_node) = *root_guard else {
        panic!("root should be Internal after split");
    };
    assert_eq!(
        in_node.entries.len(),
        2,
        "root should have 2 entries (children) after split"
    );

    // Right-half sibling is at slot 1.
    let sibling_arc = in_node
        .get_child(1)
        .expect("right-half sibling should exist at slot 1");
    let sibling_guard = sibling_arc.read();
    let TreeNode::Bottom(ref sibling) = *sibling_guard else {
        panic!("right sibling should be a BIN");
    };

    assert!(
        sibling.expiration_in_hours,
        "St-H6: right-half sibling expiration_in_hours must be true \
             (inherited from splitting BIN); got false"
    );

    // Verify the sibling's entries have the expected expiration_time.
    for e in &sibling.entries {
        assert_eq!(
            e.expiration_time, expiration_hours,
            "sibling entry expiration_time should be preserved: got {}",
            e.expiration_time
        );
        // With in_hours=true, is_expired should return false (future).
        assert!(
            !noxu_util::ttl::is_expired(
                e.expiration_time,
                sibling.expiration_in_hours
            ),
            "St-H6: sibling TTL entry ({}) should NOT appear expired \
                 with expiration_in_hours={}",
            e.expiration_time,
            sibling.expiration_in_hours
        );
    }
}
12966
/// Regression confirmation: `is_expired` with wrong `in_hours = false`
/// would falsely expire hours-granularity values.
///
/// The probe value is a fixed hours-since-epoch instant far in the future
/// (1_000_000 h ≈ year 2084). Read as hours it lies ahead of the wall clock;
/// read as seconds it is 1_000_000 s after the epoch (January 1970) and so
/// long past. A far-future constant keeps the test from failing once real
/// time overtakes a near-term hard-coded value.
#[test]
fn test_hours_value_is_expired_only_with_false_flag() {
    // 1_000_000 * 3600 still fits in a u32, so an hours→seconds conversion
    // cannot overflow.
    let exp_hours: u32 = 1_000_000;
    // Correctly treated as hours: not expired.
    assert!(
        !noxu_util::ttl::is_expired(exp_hours, true),
        "exp_hours={exp_hours} should NOT be expired when in_hours=true"
    );
    // Incorrectly treated as seconds (pre-fix right sibling): expired.
    assert!(
        noxu_util::ttl::is_expired(exp_hours, false),
        "exp_hours={exp_hours} should be expired when in_hours=false \
             (St-H6 demonstrates the wrong-flag scenario)"
    );
}
12985
12986// =============================================================================
12987// IN-redo unit tests (DRIFT-1 / Stage 1)
12988// =============================================================================
12989
#[cfg(test)]
mod in_redo_tests {
    use super::*;

    /// Builds a `BinStub` with `n` slots and returns its `serialize_full`
    /// bytes.
    ///
    /// Slot `i` is inserted with key `[i as u8]`, embedded data `[i as u8]`,
    /// and LSN `Lsn::new(1, i + 1)`.
    fn make_bin_bytes(node_id: u64, n: usize) -> Vec<u8> {
        let mut stub = BinStub {
            node_id,
            level: BIN_LEVEL,
            entries: Vec::new(),
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: noxu_util::NULL_LSN,
            last_delta_lsn: noxu_util::NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
            lsn_rep: LsnRep::Empty,
            keys: KeyRep::new(),
            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        };
        // T-2/T-3: route through insert so entries/keys/lsn_rep stay
        // aligned; the serialized bytes are identical.
        (0..n).for_each(|slot| {
            let byte = slot as u8;
            let slot_lsn = Lsn::new(1, (slot + 1) as u32);
            stub.insert_with_prefix(vec![byte], slot_lsn, Some(vec![byte]));
        });
        stub.serialize_full()
    }

    /// `recover_in_redo` installs a logged root BIN when the tree is empty.
    ///
    /// JE RecoveryManager.recoverRootIN: `root == null` path.
    #[test]
    fn test_recover_in_redo_root_bin_inserted_into_empty_tree() {
        let tree = Tree::new(42, 128);
        assert!(tree.is_empty());

        let logged = make_bin_bytes(1, 3);
        let outcome = tree.recover_in_redo(
            Lsn::new(1, 100),
            /*is_root=*/ true,
            /*is_bin=*/ true,
            &logged,
        );
        assert_eq!(outcome, InRedoResult::Inserted, "expected Inserted");
        // The three serialized slots are now visible through the tree.
        assert_eq!(tree.count_entries(), 3);
    }

    /// A logged root BIN with a newer LSN replaces the current root.
    ///
    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) < 0` path.
    #[test]
    fn test_recover_in_redo_root_bin_replaced_when_log_newer() {
        let tree = Tree::new(42, 128);

        // First replay: an older root with 2 entries.
        let stale = make_bin_bytes(1, 2);
        tree.recover_in_redo(Lsn::new(1, 50), true, true, &stale);
        assert_eq!(tree.count_entries(), 2);

        // Second replay: a newer LSN carrying 4 entries.
        let fresh = make_bin_bytes(1, 4);
        let outcome = tree.recover_in_redo(Lsn::new(1, 100), true, true, &fresh);
        assert_eq!(outcome, InRedoResult::Replaced);
        assert_eq!(tree.count_entries(), 4);
    }

    /// A logged root BIN with an older LSN must not displace the current
    /// root.
    ///
    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) >= 0` skip path.
    #[test]
    fn test_recover_in_redo_root_bin_skipped_when_tree_newer() {
        let tree = Tree::new(42, 128);

        // First replay: the newer root with 4 entries.
        let fresh = make_bin_bytes(1, 4);
        tree.recover_in_redo(Lsn::new(1, 200), true, true, &fresh);

        // Second replay: an older version with 2 entries.
        let stale = make_bin_bytes(1, 2);
        let outcome = tree.recover_in_redo(Lsn::new(1, 100), true, true, &stale);
        assert_eq!(outcome, InRedoResult::Skipped);
        // The 4-entry version is still what the tree holds.
        assert_eq!(tree.count_entries(), 4);
    }

    /// `deserialize_bin` round-trips the output of `serialize_full`.
    #[test]
    fn test_deserialize_bin_round_trip() {
        let serialized = make_bin_bytes(99, 5);
        let restored = Tree::deserialize_bin(&serialized).expect("must deserialize");
        assert_eq!(restored.node_id, 99);

        let slot_count = restored.entries.len();
        assert_eq!(slot_count, 5);
        (0..slot_count).for_each(|slot| {
            assert_eq!(restored.get_full_key(slot).unwrap(), vec![slot as u8]);
        });
    }

    /// `deserialize_upper_in` round-trips the output of `write_to_bytes`
    /// for an `Internal` node.
    #[test]
    fn test_deserialize_upper_in_round_trip() {
        // Assemble a two-slot InNodeStub and serialize it via write_to_bytes.
        let slot_keys = [vec![1, 2, 3], vec![4, 5, 6]];
        let original = TreeNode::Internal(InNodeStub {
            node_id: 77,
            level: 0x10002,
            entries: slot_keys
                .iter()
                .cloned()
                .map(|key| InEntry { key })
                .collect(),
            targets: TargetRep::None,
            dirty: false,
            generation: 0,
            parent: None,
            lsn_rep: LsnRep::Empty,
        });
        let serialized = original.write_to_bytes();

        let restored =
            Tree::deserialize_upper_in(&serialized).expect("must deserialize");
        assert_eq!(restored.node_id, 77);
        assert_eq!(restored.level, 0x10002);
        assert_eq!(restored.entries.len(), 2);
        assert_eq!(restored.entries[0].key, vec![1, 2, 3]);
        assert_eq!(restored.entries[1].key, vec![4, 5, 6]);
    }
}
13121
13122// --- Part 2 acceptance tests: key_prefixing flag (DRIFT-3) ---
13123//
13124// JE `IN.computeKeyPrefix` returns null when `databaseImpl.getKeyPrefixing()`
13125// is false, so no prefix compression is ever applied to those BINs. Noxu was
13126// always applying prefix compression. This checks that the flag is honoured.
13127//
13128// Ref: `IN.java computeKeyPrefix` ~line 2456,
13129//      `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
#[cfg(test)]
mod key_prefixing_tests {
    use super::*;

    /// Follows slot 0 downward from `node` until a BIN is reached and returns
    /// that BIN's arc (a clone of `node` itself when it already is a BIN).
    ///
    /// # Panics
    ///
    /// Panics with "child" if an internal node has no child at slot 0.
    fn find_first_bin(node: &Arc<RwLock<TreeNode>>) -> Arc<RwLock<TreeNode>> {
        let mut current = Arc::clone(node);
        loop {
            // Keep the read guard inside this block so it is released before
            // `current` is replaced by the child.
            let first_child = {
                let guard = current.read();
                match &*guard {
                    TreeNode::Internal(inner) => {
                        Some(Arc::clone(inner.child_ref(0).expect("child")))
                    }
                    TreeNode::Bottom(_) => None,
                }
            };
            match first_child {
                Some(child) => current = child,
                None => return current,
            }
        }
    }

    /// Inserts eight keys made of the bytes `record:` followed by a single
    /// suffix byte `0..8`; each value is the one-byte vector `[suffix]`.
    fn insert_record_keys(tree: &Tree) {
        let lsn = noxu_util::Lsn::new(1, 10);
        for suffix in 0u8..8 {
            let mut key = b"record:".to_vec();
            key.push(suffix);
            tree.insert(key, vec![suffix], lsn).expect("insert");
        }
    }

    /// A tree left at its default (`key_prefixing == false`) must keep every
    /// key in full: after the inserts the first BIN has an empty
    /// `key_prefix` and slot 0 yields the complete key.
    #[test]
    fn test_key_prefixing_false_stores_full_keys() {
        // Freshly constructed trees have prefixing switched off.
        let tree = Tree::new(1, 16);
        assert!(!tree.key_prefixing, "default must be false");

        // All inserted keys share the long common prefix "record:".
        insert_record_keys(&tree);

        let root = tree.get_root().expect("root");
        let first_bin = find_first_bin(&root);
        let node = first_bin.read();
        let bin = match &*node {
            TreeNode::Bottom(bin) => bin,
            TreeNode::Internal(_) => panic!("must be a BIN"),
        };

        assert!(
            bin.key_prefix.is_empty(),
            "key_prefix must be empty when key_prefixing=false, got {:?}",
            bin.key_prefix
        );
        assert_eq!(bin.entries.len(), 8);
        // Slot 0 must hand back the whole key, not a suffix.
        assert_eq!(bin.get_full_key(0).unwrap(), b"record:\x00".to_vec());
    }

    /// After `set_key_prefixing(true)`, keys sharing a prefix are compressed:
    /// the first BIN's `key_prefix` must equal the shared bytes `record:`.
    #[test]
    fn test_key_prefixing_true_compresses_keys() {
        let mut tree = Tree::new(1, 16);
        tree.set_key_prefixing(true);

        insert_record_keys(&tree);

        let root = tree.get_root().expect("root");
        let first_bin = find_first_bin(&root);
        let node = first_bin.read();
        let bin = match &*node {
            TreeNode::Bottom(bin) => bin,
            TreeNode::Internal(_) => panic!("must be a BIN"),
        };

        // Every key begins with "record:", so compression must have kicked in.
        assert!(
            !bin.key_prefix.is_empty(),
            "key_prefix must be non-empty when key_prefixing=true"
        );
        assert_eq!(
            bin.key_prefix,
            b"record:".to_vec(),
            "prefix must be the common prefix of all inserted keys"
        );
    }

    /// A tree built with a custom comparator (the sorted-dup case) must leave
    /// `key_prefix` empty even when key prefixing has been switched on.
    #[test]
    fn test_key_prefixing_custom_comparator_no_prefix() {
        let cmp: KeyComparatorFn =
            Arc::new(|lhs: &[u8], rhs: &[u8]| lhs.cmp(rhs));
        let mut tree = Tree::new_with_comparator(1, 16, cmp);
        // Turning prefixing on is expected to be a no-op on this path.
        tree.set_key_prefixing(true);

        insert_record_keys(&tree);

        let root = tree.get_root().expect("root");
        let first_bin = find_first_bin(&root);
        let node = first_bin.read();
        let bin = match &*node {
            TreeNode::Bottom(bin) => bin,
            TreeNode::Internal(_) => panic!("must be a BIN"),
        };

        // Inserts through the comparator path must not have set a prefix.
        assert!(
            bin.key_prefix.is_empty(),
            "custom-comparator path must not set key_prefix"
        );
    }
}
13244
13245// --- Part 1 acceptance tests: splitSpecial heuristic (DRIFT-1) ---
13246//
13247// JE `IN.splitSpecial` / `Tree.forceSplit`: when all routing decisions during
13248// descent are leftmost (`AllLeft`) or rightmost (`AllRight`), the split index
13249// is forced to 1 or `n-1` respectively instead of `n/2`. This halves the
13250// number of splits for monotonically increasing / decreasing key workloads
13251// (sequential append / prepend) because each split leaves the BIN near-full.
13252//
13253// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
#[cfg(test)]
mod split_special_tests {
    use super::*;

    /// Test helper: walks from `node_arc` down to the BIN that holds (or
    /// would hold) `key` and returns its arc.
    ///
    /// At each internal node the chosen slot is the last one whose key is
    /// `<= key`, with slot 0 acting as the catch-all for smaller keys.
    /// Returns `None` when an internal node has no entries or when
    /// `get_child` yields no child for the chosen slot.
    fn find_bin_arc_for_key(
        node_arc: &Arc<RwLock<TreeNode>>,
        key: &[u8],
    ) -> Option<Arc<RwLock<TreeNode>>> {
        let mut cursor = Arc::clone(node_arc);
        loop {
            let child = {
                let guard = cursor.read();
                let inner = match &*guard {
                    TreeNode::Bottom(_) => return Some(Arc::clone(&cursor)),
                    TreeNode::Internal(inner) => inner,
                };
                if inner.entries.is_empty() {
                    return None;
                }
                // Slot 0 always qualifies; every following slot qualifies
                // while its key is <= `key`.  The length of that run is the
                // index of the last qualifying slot.
                let slot = inner
                    .entries
                    .iter()
                    .skip(1)
                    .take_while(|entry| entry.key.as_slice() <= key)
                    .count();
                inner.get_child(slot)?
            };
            cursor = child;
        }
    }

    /// Counts the BINs reachable from `node` through resident children
    /// (depth-first).
    fn count_bins(node: &Arc<RwLock<TreeNode>>) -> usize {
        let guard = node.read();
        match &*guard {
            TreeNode::Internal(inner) => {
                let mut total = 0usize;
                for child in inner.resident_children().iter() {
                    total += count_bins(child);
                }
                total
            }
            TreeNode::Bottom(_) => 1,
        }
    }

    /// Sums the entry counts of every BIN reachable from `node` through
    /// resident children.
    fn count_keys(node: &Arc<RwLock<TreeNode>>) -> usize {
        let guard = node.read();
        match &*guard {
            TreeNode::Internal(inner) => {
                let mut total = 0usize;
                for child in inner.resident_children().iter() {
                    total += count_keys(child);
                }
                total
            }
            TreeNode::Bottom(bin) => bin.entries.len(),
        }
    }

    /// Entry count of the BIN reached by always following slot 0.
    ///
    /// Panics with "child" if an internal node has no child at slot 0.
    fn leftmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
        let guard = node.read();
        match &*guard {
            TreeNode::Internal(inner) => {
                leftmost_bin_size(inner.child_ref(0).expect("child"))
            }
            TreeNode::Bottom(bin) => bin.entries.len(),
        }
    }

    /// Entry count of the BIN reached by always following the last slot.
    ///
    /// Panics with "child" if an internal node has no child at that slot.
    fn rightmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
        let guard = node.read();
        match &*guard {
            TreeNode::Internal(inner) => {
                let last_slot = inner.entries.len().saturating_sub(1);
                rightmost_bin_size(inner.child_ref(last_slot).expect("child"))
            }
            TreeNode::Bottom(bin) => bin.entries.len(),
        }
    }

    /// Builds a tree with the given fan-out and inserts each `u32` key, in
    /// iteration order, as its 4-byte big-endian encoding with value `[0]`.
    fn build_tree_with_keys(
        max_entries: usize,
        keys: impl IntoIterator<Item = u32>,
    ) -> Tree {
        let tree = Tree::new(1, max_entries);
        let lsn = noxu_util::Lsn::new(1, 100);
        for k in keys {
            tree.insert(k.to_be_bytes().to_vec(), vec![0u8], lsn)
                .expect("insert");
        }
        tree
    }

    /// Ascending inserts (JE `allRightSideDescent` → `splitIndex =
    /// nEntries - 1`): each split is expected to leave the left BIN nearly
    /// full, so the tree should end up with fewer BINs — and a higher
    /// average fill — than plain midpoint splitting would give.
    #[test]
    fn test_split_special_ascending_fewer_bins_than_midpoint() {
        let max_entries = 8usize;
        let n_keys = 200usize;

        // Monotonically increasing keys: every descent is all-right.
        let tree = build_tree_with_keys(max_entries, 0u32..n_keys as u32);

        let root = tree.get_root().expect("root must exist");
        let bins_special = count_bins(&root);
        let stored_keys = count_keys(&root);

        // Nothing may be lost along the way.
        assert_eq!(stored_keys, n_keys, "all keys must be stored");

        // Midpoint splitting leaves about max_entries / 2 keys per BIN, i.e.
        // ceil(n_keys / (max_entries / 2)) BINs; the special split aims for
        // roughly ceil(n_keys / (max_entries - 1)).  Only the former is
        // asserted, as a strict upper bound.
        let midpoint_upper_bound = n_keys.div_ceil(max_entries / 2);
        assert!(
            bins_special < midpoint_upper_bound,
            "splitSpecial should produce fewer BINs than midpoint split: \
             got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
        );

        // The rightmost BIN may legitimately be only partly filled, so the
        // density check is on the average rather than on any single BIN.
        let midpoint_fill = (max_entries / 2) as f64;
        let avg_fill = stored_keys as f64 / bins_special as f64;
        assert!(
            avg_fill > midpoint_fill,
            "average fill per BIN with splitSpecial ({avg_fill:.1}) should \
             exceed midpoint baseline ({midpoint_fill})"
        );
    }

    /// Descending inserts (JE `allLeftSideDescent` → `splitIndex = 1`):
    /// every descent routes through slot 0, so the split should again yield
    /// fewer BINs than the midpoint-split upper bound.
    #[test]
    fn test_split_special_descending_fewer_bins_than_midpoint() {
        let max_entries = 8usize;
        let n_keys = 200usize;

        // Monotonically decreasing keys: every descent is all-left.
        let tree =
            build_tree_with_keys(max_entries, (0u32..n_keys as u32).rev());

        let root = tree.get_root().expect("root must exist");
        let bins_special = count_bins(&root);
        let stored_keys = count_keys(&root);

        assert_eq!(stored_keys, n_keys, "all keys must be stored");

        let midpoint_upper_bound = n_keys.div_ceil(max_entries / 2);
        assert!(
            bins_special < midpoint_upper_bound,
            "splitSpecial descending should produce fewer BINs: \
             got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
        );
    }

    /// Keys inserted in a shuffled order must all be stored and remain
    /// findable: the special split heuristic must not disturb a workload
    /// whose descents are rarely all-left or all-right.
    #[test]
    fn test_split_special_random_inserts_stay_balanced() {
        use std::collections::BTreeSet;

        let max_entries = 8usize;

        // Fixed-seed Knuth shuffle of 0..200 keeps the test deterministic.
        let mut keys: Vec<u32> = (0u32..200).collect();
        let mut state: u64 = 0xdeadbeef_cafebabe;
        for i in (1..keys.len()).rev() {
            state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
            let j = (state >> 33) as usize % (i + 1);
            keys.swap(i, j);
        }

        let tree = build_tree_with_keys(max_entries, keys.iter().copied());
        let inserted: BTreeSet<u32> = keys.iter().copied().collect();

        let root = tree.get_root().expect("root");
        assert_eq!(
            count_keys(&root),
            inserted.len(),
            "all random keys must be stored"
        );

        // Each key must come back as an exact match.
        for k in &inserted {
            let key = k.to_be_bytes().to_vec();
            let found = tree.search(&key);
            assert!(
                found.map(|r| r.is_exact_match()).unwrap_or(false),
                "random key {k} must be findable after insert"
            );
        }
    }

    /// TREE-F1: a BIN slot flagged `known_deleted` must look ABSENT to exact
    /// lookups and must never be surfaced as a live entry by scans.
    ///
    /// JE contract being mirrored:
    /// * `IN.findEntry` (IN.java:3197) returns -1 for an exact match that
    ///   lands on a known-deleted slot.
    /// * `CursorImpl.lockAndGetCurrent` (CursorImpl.java:2062-2064) returns
    ///   null for such a slot, so `getNext` steps over it.
    ///
    /// Known-deleted slots can exist in live BINs between BIN-delta
    /// reconstitution (`mutate_to_full_bin`) and compressor reclamation.  The
    /// test creates that state directly by setting the flag on a slot, then
    /// checks every user-facing read path as well as the maintenance path.
    #[test]
    fn test_tree_f1_known_deleted_slot_is_absent_and_skipped() {
        let tree = Tree::new(1, 8);
        // Six keys give the BIN several live slots around the tombstone.
        for i in 0..6u32 {
            let key = format!("kd{i:04}").into_bytes();
            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
        }

        // Flag a middle key's slot as known_deleted in place, standing in for
        // a delta-applied tombstone that has not been reclaimed yet.
        let kd_key = b"kd0003".to_vec();
        {
            let root = tree.get_root().expect("root");
            let bin_arc = find_bin_arc_for_key(&root, &kd_key).expect("bin");
            let mut node = bin_arc.write();
            let TreeNode::Bottom(bin) = &mut *node else {
                panic!("expected BIN");
            };
            let slot = (0..bin.entries.len())
                .position(|i| {
                    bin.get_full_key(i).as_deref() == Some(kd_key.as_slice())
                })
                .expect("kd key slot");
            bin.entries[slot].known_deleted = true;
        }

        // Exact lookup through Tree::search: must not be an exact match.
        let sr = tree.search(&kd_key);
        assert!(
            !sr.map(|r| r.is_exact_match()).unwrap_or(false),
            "TREE-F1: Tree::search must report a known_deleted slot as absent \
             (IN.findEntry IN.java:3197)"
        );

        // Exact lookup through Tree::search_with_data: must not be found.
        let sf = tree.search_with_data(&kd_key).expect("slot fetch");
        assert!(
            !sf.found,
            "TREE-F1: Tree::search_with_data must report a known_deleted slot \
             as absent (IN.findEntry IN.java:3197)"
        );

        // The keys on either side are untouched and must still resolve.
        for live in [b"kd0002".to_vec(), b"kd0004".to_vec()] {
            assert!(
                tree.search(&live).map(|r| r.is_exact_match()).unwrap_or(false),
                "live neighbour must remain findable"
            );
        }

        // Scan-facing BIN dumps (descend_to_edge_bin / get_next_bin /
        // get_prev_bin) may include the slot, but only with its
        // known_deleted flag set so a cursor can skip it: any entry carrying
        // the tombstoned key must be flagged.
        let root = tree.get_root().expect("root");
        let edge = Tree::descend_to_edge_bin(&root, true).expect("edge bin");
        assert!(
            edge.iter().all(|(e, _, k)| k != &kd_key || e.known_deleted),
            "TREE-F1: scan must not surface a known_deleted slot as live \
             (CursorImpl.java:2062-2064)"
        );
        for anchor in [b"kd0000".to_vec(), b"kd0005".to_vec()] {
            for bin_entries in
                [tree.get_next_bin(&anchor), tree.get_prev_bin(&anchor)]
                    .into_iter()
                    .flatten()
            {
                assert!(
                    bin_entries
                        .iter()
                        .all(|(e, _, k)| k != &kd_key || e.known_deleted),
                    "TREE-F1: get_next_bin/get_prev_bin must not surface a \
                     known_deleted slot as live"
                );
            }
        }

        // A lower-bound lookup starting exactly at the tombstone must land
        // on some other key (or on nothing at all).
        if let Some((k, _, _)) = tree.first_entry_at_or_after(&kd_key) {
            assert_ne!(
                k, kd_key,
                "TREE-F1: first_entry_at_or_after must skip a known_deleted \
                 slot (CursorImpl.java:2062-2064)"
            );
        }

        // Maintenance iteration is deliberately unaffected: the compressor's
        // collection pass must still report the BIN so the slot can be
        // reclaimed.
        let kd_bins = tree.collect_bins_with_known_deleted();
        assert!(
            !kd_bins.is_empty(),
            "TREE-F1: collect_bins_with_known_deleted must still observe the \
             KD slot so the compressor can reclaim it"
        );
    }
}