noxu_tree/
tree.rs

1//! B+tree implementation.
2//!
3//!
4//! Tree implements the B+tree. It provides search, insert, and delete
5//! operations on the tree structure. The tree uses latch-coupling for
6//! concurrent access: when traversing down the tree, the parent latch
7//! is released after the child latch is acquired.
8//!
9//! # Architecture
10//!
11//! The tree has a hierarchical structure:
12//! - Internal Nodes (IN) at levels 2 and above
13//! - Bottom Internal Nodes (BIN) at level 1
14//! - Leaf Nodes (LN) containing actual data
15//!
16//! # Locking Strategy
17//!
18//! - Root latch protects the root pointer itself
19//! - Each node has its own latch for concurrent access
20//! - Search uses latch-coupling: acquire child, release parent
21//! - Modifications may require exclusive latches
22
23use crate::error::TreeError;
24use crate::key::{create_key_prefix, get_key_prefix_length};
25use crate::search_result::SearchResult;
26use noxu_latch::{LatchContext, SharedLatch};
27use noxu_util::{Lsn, NULL_LSN};
28use parking_lot::RwLock;
29use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
30use std::sync::{Arc, Weak};
31
/// Observer that mirrors JE's `INList` feeding the evictor's `LRUList`s.
///
/// The tree owns no eviction policy of its own; instead it notifies a
/// registered listener whenever an IN/BIN node enters the resident cache, is
/// accessed, or is removed.  The `Evictor` (in `noxu-evictor`) implements this
/// trait, but the dependency is one-way (`noxu-evictor` → `noxu-tree`), so the
/// tree refers to the listener only through this trait object — avoiding a
/// circular crate dependency.
///
/// Every callback takes `&self` and the trait requires `Send + Sync`, so a
/// single listener can be shared behind an `Arc<dyn InListListener>` (the form
/// in which `Tree::in_list_listener` stores it).
///
/// JE reference: `IN.fetchTarget` / split / `rebuildINList` call
/// `Evictor.addBack`; node access calls `Evictor.moveBack`; node removal
/// calls `Evictor.remove`.
pub trait InListListener: Send + Sync {
    /// A node has just become resident in the cache (JE `Evictor.addBack`).
    ///
    /// `node_id` identifies the node that was added.
    fn note_ins_added(&self, node_id: u64);
    /// A resident node was accessed (JE `Evictor.moveBack` — LRU touch).
    ///
    /// `node_id` identifies the node that was touched.
    fn note_ins_accessed(&self, node_id: u64);
    /// A node was removed from the cache (JE `Evictor.remove`).
    ///
    /// `node_id` identifies the node that was removed.
    fn note_ins_removed(&self, node_id: u64);
}
52
// Level and flag constants re-exported here for tree-internal use.
/// Level tag bit `0x20000` (bit 17).
/// NOTE(review): by its name this marks levels of the database-mapping tree —
/// confirm at the use sites.
pub const DBMAP_LEVEL: i32 = 0x20000;
/// Level tag bit `0x10000` (bit 16); OR-ed into [`BIN_LEVEL`].
/// NOTE(review): by its name this marks levels of an ordinary (main) tree —
/// confirm at the use sites.
pub const MAIN_LEVEL: i32 = 0x10000;
/// Mask selecting the low 16 bits of a level value.  It does not overlap
/// [`MAIN_LEVEL`] or [`DBMAP_LEVEL`], so masking strips those tag bits.
pub const LEVEL_MASK: i32 = 0x0ffff;
/// The value `-1`.
/// NOTE(review): presumably a sentinel below every real level — confirm.
pub const MIN_LEVEL: i32 = -1;
/// `MAIN_LEVEL | 1`: level 1 carrying the main-tree tag bit (the module docs
/// place BINs at level 1).
pub const BIN_LEVEL: i32 = MAIN_LEVEL | 1;
/// Flag bit 16 (`1 << 16`).  Note that this is numerically equal to
/// [`MAIN_LEVEL`], so the two must never be mixed in one integer.
/// NOTE(review): presumably OR-ed into a slot-search result to signal an exact
/// key match — confirm at the use sites.
pub const EXACT_MATCH: i32 = 1 << 16;
/// Flag bit 17 (`1 << 17`).  Note that this is numerically equal to
/// [`DBMAP_LEVEL`].
/// NOTE(review): presumably OR-ed into an insert result to signal that a new
/// slot was created — confirm at the use sites.
pub const INSERT_SUCCESS: i32 = 1 << 17;
61
/// Per-slot fixed memory overhead for a BIN entry, in bytes (DBI-23).
///
/// Computed as `size_of::<BinEntry>() + LsnRep::BYTES_PER_LSN_ENTRY`.
///
/// The first term is the heap footprint of one `BinEntry` *struct* as it lives
/// inside the BIN's `Vec<BinEntry>` buffer — NOT counting the variable-length
/// key and data bytes, which are separate heap allocations counted on top of
/// this.
///
/// Faithful to JE `IN.getEntryInMemorySize` + the per-slot `entryStates` /
/// LSN-array overhead folded into `IN.computeMemorySize` (IN.java ~4632):
/// JE measures the slot's fixed cost with `Sizeof` on the JVM; Rust has a
/// fixed struct layout so `size_of::<BinEntry>()` is exact.
///
/// T-2/T-3: the per-slot `key` (`Vec<u8>` header) and `lsn` (`u64`) were
/// hoisted out of `BinEntry` into the node-level `KeyRep`/`LsnRep`.  The
/// `size_of::<BinEntry>()` therefore shrank; we add back the packed per-slot
/// LSN-rep cost (`LsnRep::BYTES_PER_LSN_ENTRY`, 4 bytes) so the incremental
/// live counter still approximates the walked heap (the key bytes are charged
/// separately as `key.len()` at the call site, matching the compact key rep).
///
/// Derived (not hard-coded) so a layout change to `BinEntry` is tracked
/// automatically — see `bin_stub_conformance` for the drift guard.
pub const BIN_ENTRY_OVERHEAD: usize =
    std::mem::size_of::<BinEntry>() + LsnRep::BYTES_PER_LSN_ENTRY;
84
/// Per-slot fixed memory overhead for an IN entry, in bytes (DBI-23).
///
/// Equal to `size_of::<InEntry>()`: the heap footprint of one `InEntry`
/// struct inside the IN's `Vec<InEntry>` buffer (key bytes counted
/// separately).  Derived from the type rather than hard-coded, so it follows
/// any layout change to `InEntry`.
///
/// JE reference: `IN.getEntryInMemorySize` for an upper IN plus the per-slot
/// state/LSN/target overhead from `IN.computeMemorySize`.
pub const IN_ENTRY_OVERHEAD: usize = std::mem::size_of::<InEntry>();
92
/// Type alias for the key comparator used by sorted-duplicate databases.
///
/// The comparator takes two full (uncompressed) keys and returns their
/// relative ordering.  For sorted-dup databases this is `DupKeyData::compare`,
/// which splits each key into primary + data parts and applies separate
/// comparators to each.  For normal databases this field is `None` and
/// lexicographic byte comparison is used.
///
/// The closure is reference-counted (`Arc`) and bound by `Send + Sync`, so one
/// comparator can be shared between threads.
///
/// JE reference: `DatabaseImpl.btreeComparator` / `DatabaseImpl.dupComparator`.
pub type KeyComparatorFn =
    Arc<dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering + Send + Sync>;
104
/// Combined search result carrying slot data and the BIN arc, returned by
/// [`Tree::search_with_data`].
///
/// Avoids the double-descent pattern where `Tree::search` checked key
/// existence and a second call re-descended to fetch the actual slot bytes.
/// One descent now serves both purposes (Wave-11-I optimisation).
pub struct SlotFetch {
    /// `true` if an exact key match was found and is not expired.
    pub found: bool,
    /// Data bytes for the slot (`None` when `found` is `false`).
    pub data: Option<Vec<u8>>,
    /// Raw slot LSN as `u64`; zero when `found` is `false`.
    pub lsn: u64,
    /// Slot index within the BIN.  Set to the actual BIN slot index when
    /// `found` is `true`; `0` otherwise.
    ///
    /// Used by `CursorImpl` to set `current_index` correctly so that
    /// `retrieve_next` advances to the right slot after a search.
    pub slot_index: usize,
    /// Arc to the BIN that the descent reached.  The field is not optional:
    /// a `SlotFetch` always carries a BIN, regardless of whether `found` is
    /// `true`.
    pub bin_arc: Arc<RwLock<TreeNode>>,
}
128
/// The B+tree.
///
/// This is the main tree structure that manages the B+tree nodes and
/// provides operations for search, insert, delete, and tree maintenance.
pub struct Tree {
    /// Database ID this tree belongs to.
    database_id: u64,

    /// Maximum entries per node (from config).
    max_entries_per_node: usize,

    /// Root of the tree. None if tree is empty.
    ///
    /// Wrapped in `RwLock` so that `insert`, `delete`, and other mutating
    /// operations can take `&self` (interior mutability), enabling concurrent
    /// access to different BIN nodes without requiring a global `&mut Tree`
    /// borrow.  The root pointer itself is only written during root splits
    /// and initial creation; all other access is read-only.
    ///
    /// JE reference: `Tree.root`, protected by the root latch.
    root: RwLock<Option<Arc<RwLock<TreeNode>>>>,

    /// Latch protecting the root reference itself.
    /// Must be held when changing the root pointer.
    root_latch: SharedLatch,

    /// LSN at which the current root IN/BIN was last logged.
    ///
    /// Used by the IN-redo currency check (`recover_root_bin` /
    /// `recover_root_upper_in`) to decide whether a logged root replaces the
    /// in-memory one.  Updated whenever a new root is installed via
    /// `set_root_with_lsn` or the IN-redo recover-root path.
    ///
    /// JE `RootUpdater.originalLsn` / `ChildReference.getLsn()` for the root.
    root_log_lsn: RwLock<noxu_util::Lsn>,

    /// Statistics: number of times the root has been split.
    root_splits: AtomicU64,

    /// Statistics: number of latch upgrades from shared to exclusive.
    relatches_required: AtomicU64,

    /// Optional custom key comparator for sorted-duplicate databases.
    ///
    /// When `Some`, all key comparisons in tree traversal (upper IN routing
    /// and BIN entry search/insert/delete) use this comparator instead of
    /// lexicographic byte comparison.
    ///
    /// JE reference: `DatabaseImpl.btreeComparator` / `dupComparator`, stored
    /// on the database and consulted at every `IN.findEntry()` call.
    pub key_comparator: Option<KeyComparatorFn>,

    /// Shared memory counter for the evictor / MemoryBudget.
    ///
    /// Updated on every BIN entry insert (+key+data+overhead) and delete
    /// (-key+overhead) so the evictor sees real cache pressure.
    ///
    /// JE equivalent: the `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
    /// call in `IN.updateMemorySize()`.  In Noxu the counter is an
    /// `Arc<AtomicI64>` shared with the `Arbiter` (and later `MemoryBudget`)
    /// to avoid a circular crate dependency (`noxu-tree` → `noxu-dbi`).
    pub memory_counter: Option<Arc<AtomicI64>>,

    /// Optional listener fed on node add/access/remove, mirroring JE's
    /// `INList` feeding the evictor's `LRUList`s.
    ///
    /// When `None` (the default — used by unit tests with no environment),
    /// the notifications are no-ops.  `EnvironmentImpl` installs the
    /// `Evictor` here so production inserts/accesses populate the LRU lists
    /// the evictor drains.
    ///
    /// JE reference: `IN.fetchTarget`/split/`rebuildINList` → `addBack`,
    /// access → `moveBack`, removal → `remove`.
    pub in_list_listener: Option<Arc<dyn InListListener>>,

    /// Optional log manager so an evicted root IN can be re-materialized from
    /// its persisted `root_log_lsn` on the next access (EV-14, piece B).
    ///
    /// JE's `Tree` reaches the log via `database.getEnv().getLogManager()`;
    /// `Tree.getRootINRootAlreadyLatched` calls `root.fetchTarget(...)` which
    /// reads the root IN back from its `ChildReference` LSN when the in-memory
    /// target is null (Tree.java:477-516, ChildReference.fetchTarget).  Noxu
    /// has no env back-reference here, so the log manager is installed
    /// directly (the same one-way wiring as `in_list_listener`).  When `None`
    /// (unit tests with no environment), an evicted root cannot be re-fetched
    /// — but `evict_root` refuses to evict without a log manager, so the root
    /// is never made non-resident in that configuration.
    pub log_manager: Option<Arc<noxu_log::LogManager>>,

    /// Capacity hint for the recovery redo path.
    ///
    /// When non-zero, the first BIN created by `redo_insert` (the first-key
    /// path) pre-allocates its `entries` Vec with this capacity so that
    /// redo insertions proceed without Vec-resize doublings.  The value is
    /// clamped to `max_entries_per_node` at use.
    ///
    /// Set by `hint_redo_capacity` before the redo loop.
    /// Wave 11-K optimisation (Fix 3).
    redo_capacity_hint: usize,

    /// Whether key-prefix compression is enabled for this tree's BINs.
    ///
    /// JE `DatabaseImpl.getKeyPrefixing()` / `DatabaseConfig.setKeyPrefixing()`.
    /// When `false`, `IN.computeKeyPrefix` returns `null` in JE — no prefix
    /// is ever set. Noxu mirrors this: `insert_with_prefix` is skipped in
    /// favour of `insert_raw`, and `recompute_key_prefix` is not called on
    /// BIN halves after a split.
    ///
    /// Default: `false` (matches JE's `DatabaseConfig.KEY_PREFIXING_DEFAULT`).
    ///
    /// Ref: `IN.java computeKeyPrefix` ~line 2456.
    pub key_prefixing: bool,

    /// T-5: maximum post-prefix key length (bytes) for the compact key rep
    /// (`INKeyRep.MaxKeySize`).  A node packs all its keys into one fixed-width
    /// byte array when every post-prefix key is `<=` this length; a longer key
    /// inflates the node to the `Default` rep.  `<= 0` disables the compact
    /// rep entirely.
    ///
    /// Default 16 (`TREE_COMPACT_MAX_KEY_LENGTH` /
    /// `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH`).  Wired from
    /// `EnvironmentConfig` via `Tree::set_compact_max_key_length`
    /// (`IN.getCompactMaxKeyLength`, IN.java:4929).
    pub compact_max_key_length: i32,
}
255
/// A node in the tree.
///
/// TreeNode wraps an upper IN or a BIN. Each variant carries a lightweight
/// stub whose fields mirror the persistent IN/BIN structure. The stubs will
/// be replaced with full InNode/Bin types as the implementation matures; the
/// API surface here is intentionally minimal.
///
/// Nodes are shared as `Arc<RwLock<TreeNode>>` (see the `root` field of
/// `Tree` and the `ChildArc` alias).
#[derive(Debug)]
pub enum TreeNode {
    /// Internal Node (IN) - non-leaf node in the tree (levels 2 and above per
    /// the module docs).
    Internal(InNodeStub),

    /// Bottom Internal Node (BIN) - leaf-level internal node (level 1 per the
    /// module docs).
    Bottom(BinStub),
}
270
/// Type alias for a resident child pointer: a shared, lock-protected
/// [`TreeNode`].  Cloning a `ChildArc` only bumps the reference count; it does
/// not copy the node.
pub type ChildArc = Arc<RwLock<TreeNode>>;
273
/// T-4: per-node representation of the resident-child-pointer array.
///
/// Faithful to JE `INTargetRep` (`INTargetRep.java`), the abstract array of
/// target pointers to an IN's cached children.  These arrays are usually
/// sparse — most upper INs have NO resident children — so JE never stores a
/// full per-slot `Node[]` until many children are actually cached:
///
///   * `None`   — `INTargetRep.None`: a shared singleton, 0 child-pointer
///     bytes, used when no children are cached (the common case for upper
///     INs).  `get` returns null for every slot.
///   * `Sparse` — `INTargetRep.Sparse`: a small parallel `(index, target)[]`
///     for 1..=`MAX_ENTRIES` cached children (JE caps at 4).  `get(j)` is a
///     linear scan of the index array.
///   * `Default`— `INTargetRep.Default`: the full `Vec<Option<Arc>>`, one
///     slot per entry, used once more than `MAX_ENTRIES` children are
///     resident.
///
/// A node starts `None` and grows `None → Sparse → Default`.  JE does not
/// shrink back when entries are nulled (it only compacts on IN-stripping) to
/// avoid transitionary rep churn; we follow the same policy — `set_child` only
/// inflates, and `compact()` (called on eviction/stripping) collapses an
/// empty/small `Default`/`Sparse` back toward `None`.
#[derive(Debug)]
pub enum TargetRep {
    /// `INTargetRep.None` — no children cached (shared-singleton semantics).
    None,
    /// `INTargetRep.Sparse` — a few cached children, `(slot_index, child)`.
    /// The slot index is stored as a `u16`; entries are kept in no particular
    /// order.
    /// Invariant: `len() <= SPARSE_MAX_ENTRIES`.
    Sparse(Vec<(u16, ChildArc)>),
    /// `INTargetRep.Default` — full parallel array, one slot per entry;
    /// `None` marks a slot whose child is not resident.
    Default(Vec<Option<ChildArc>>),
}
306
307impl TargetRep {
308    /// `INTargetRep.Sparse.MAX_ENTRIES` (INTargetRep.java) — the maximum
309    /// number of cached children the `Sparse` rep holds before inflating to
310    /// `Default`.
311    pub const SPARSE_MAX_ENTRIES: usize = 4;
312
313    /// `INTargetRep.get(idx)` — the cached child for slot `idx`, or `None`.
314    #[inline]
315    pub fn get(&self, idx: usize) -> Option<&ChildArc> {
316        match self {
317            TargetRep::None => None,
318            TargetRep::Sparse(v) => {
319                v.iter().find(|(i, _)| *i as usize == idx).map(|(_, c)| c)
320            }
321            TargetRep::Default(v) => v.get(idx).and_then(|o| o.as_ref()),
322        }
323    }
324
325    /// `INTargetRep.set(idx, node, parent)` — set (or clear, when `node` is
326    /// `None`) the cached child for slot `idx`, mutating the representation
327    /// upward (`None → Sparse → Default`) as needed.
328    pub fn set(&mut self, idx: usize, node: Option<ChildArc>) {
329        match self {
330            TargetRep::None => {
331                // INTargetRep.None.set: clearing stays None; setting mutates
332                // to a Sparse rep and sets there.
333                if let Some(child) = node {
334                    *self = TargetRep::Sparse(vec![(idx as u16, child)]);
335                }
336            }
337            TargetRep::Sparse(v) => {
338                // Update existing slot in place.
339                if let Some(pos) =
340                    v.iter().position(|(i, _)| *i as usize == idx)
341                {
342                    match node {
343                        Some(child) => v[pos].1 = child,
344                        None => {
345                            v.swap_remove(pos);
346                        }
347                    }
348                    return;
349                }
350                // New child: clearing a non-present slot is a no-op.
351                let Some(child) = node else { return };
352                if v.len() < Self::SPARSE_MAX_ENTRIES {
353                    v.push((idx as u16, child));
354                    return;
355                }
356                // Full — INTargetRep.Sparse.set mutates to Default.
357                let cap = v.iter().map(|(i, _)| *i as usize).max().unwrap_or(0);
358                let cap = cap.max(idx) + 1;
359                let mut def: Vec<Option<ChildArc>> = vec![None; cap];
360                for (i, c) in v.drain(..) {
361                    def[i as usize] = Some(c);
362                }
363                def[idx] = Some(child);
364                *self = TargetRep::Default(def);
365            }
366            TargetRep::Default(v) => {
367                if idx >= v.len() {
368                    if node.is_none() {
369                        return;
370                    }
371                    v.resize_with(idx + 1, || None);
372                }
373                v[idx] = node;
374            }
375        }
376    }
377
378    /// `INTargetRep.None`-aware take: remove and return the cached child for
379    /// slot `idx`, leaving the slot empty (JE `IN.setTarget(idx, null)` plus
380    /// returning the old target).
381    pub fn take(&mut self, idx: usize) -> Option<ChildArc> {
382        match self {
383            TargetRep::None => None,
384            TargetRep::Sparse(v) => v
385                .iter()
386                .position(|(i, _)| *i as usize == idx)
387                .map(|pos| v.swap_remove(pos).1),
388            TargetRep::Default(v) => v.get_mut(idx).and_then(|o| o.take()),
389        }
390    }
391
392    /// JE `INArrayRep.copy(from, to, n, parent)` adapted to slice ops: shift
393    /// the child mapping when an entry is INSERTED at `idx` (all children at
394    /// slots `>= idx` move up by one).  Mirrors how `Vec::insert` shifts the
395    /// parallel `entries` array.
396    pub fn insert_shift(&mut self, idx: usize) {
397        match self {
398            TargetRep::None => {}
399            TargetRep::Sparse(v) => {
400                for (i, _) in v.iter_mut() {
401                    if (*i as usize) >= idx {
402                        *i += 1;
403                    }
404                }
405            }
406            TargetRep::Default(v) => {
407                if idx <= v.len() {
408                    v.insert(idx, None);
409                }
410            }
411        }
412    }
413
414    /// JE `INArrayRep.copy` adapted: shift the child mapping when the entry at
415    /// `idx` is REMOVED (all children at slots `> idx` move down by one; the
416    /// child at `idx` itself is dropped).  Mirrors `Vec::remove`.
417    pub fn remove_shift(&mut self, idx: usize) {
418        match self {
419            TargetRep::None => {}
420            TargetRep::Sparse(v) => {
421                v.retain(|(i, _)| *i as usize != idx);
422                for (i, _) in v.iter_mut() {
423                    if (*i as usize) > idx {
424                        *i -= 1;
425                    }
426                }
427            }
428            TargetRep::Default(v) => {
429                if idx < v.len() {
430                    v.remove(idx);
431                }
432            }
433        }
434    }
435
436    /// `INTargetRep.compact(parent)` — collapse toward the most compact rep:
437    /// an empty rep becomes `None`; a `Default` with `<= MAX_ENTRIES` children
438    /// becomes `Sparse` (or `None`).  Called when an IN is stripped/evicted.
439    pub fn compact(&mut self) {
440        let count = self.resident_count();
441        if count == 0 {
442            *self = TargetRep::None;
443            return;
444        }
445        if count <= Self::SPARSE_MAX_ENTRIES
446            && let TargetRep::Default(v) = self
447        {
448            let sparse: Vec<(u16, ChildArc)> = v
449                .iter()
450                .enumerate()
451                .filter_map(|(i, o)| o.as_ref().map(|c| (i as u16, c.clone())))
452                .collect();
453            *self = TargetRep::Sparse(sparse);
454        }
455    }
456
457    /// Number of resident (non-null) children.
458    pub fn resident_count(&self) -> usize {
459        match self {
460            TargetRep::None => 0,
461            TargetRep::Sparse(v) => v.len(),
462            TargetRep::Default(v) => v.iter().filter(|o| o.is_some()).count(),
463        }
464    }
465
466    /// True if no children are cached (`INTargetRep.None` or empty).
467    pub fn is_empty(&self) -> bool {
468        self.resident_count() == 0
469    }
470
471    /// Iterate every resident child (in unspecified order).
472    pub fn iter_children(&self) -> Box<dyn Iterator<Item = ChildArc> + '_> {
473        match self {
474            TargetRep::None => Box::new(std::iter::empty()),
475            TargetRep::Sparse(v) => Box::new(v.iter().map(|(_, c)| c.clone())),
476            TargetRep::Default(v) => {
477                Box::new(v.iter().filter_map(|o| o.clone()))
478            }
479        }
480    }
481
482    /// `INTargetRep.calculateMemorySize()` — heap bytes of the rep itself
483    /// (excluding the children it points at).  `None` is 0 (shared singleton),
484    /// matching `INTargetRep.None.calculateMemorySize() == 0`.
485    pub fn memory_size(&self) -> usize {
486        use std::mem::size_of;
487        match self {
488            TargetRep::None => 0,
489            TargetRep::Sparse(v) => v.capacity() * size_of::<(u16, ChildArc)>(),
490            TargetRep::Default(v) => {
491                v.capacity() * size_of::<Option<ChildArc>>()
492            }
493        }
494    }
495}
496
/// T-3: node-level packed LSN array — `IN.entryLsnByteArray` /
/// `IN.entryLsnLongArray` (IN.java:251-289, getLsn/setLsnInternal
/// IN.java:1752-1935).
///
/// JE stores one LSN per slot.  A naive `Lsn` (u64) costs 8 bytes/slot even
/// though most LSNs in a node share a file number and have a file offset that
/// fits in 3 bytes.  JE's compact rep is a single `byte[]` with
/// `BYTES_PER_LSN_ENTRY == 4` bytes per slot:
///
///   * `base_file_number` is the lowest file number of any non-NULL LSN in the
///     node;
///   * byte 0 of each slot = `file_number - base_file_number` (0..=127,
///     `Byte.MAX_VALUE`);
///   * bytes 1..4 = the 3-byte little-endian file offset (max
///     `MAX_FILE_OFFSET == 0xff_fffe`).
///
/// The NULL_LSN blocker (Noxu `NULL_LSN == u64::MAX`) is solved EXACTLY as JE
/// does it: NULL is NOT stored as the raw u64; the slot's 3 file-offset bytes
/// are set to `0xff_ffff` (`THREE_BYTE_NEGATIVE_ONE`), a value `MAX_FILE_OFFSET`
/// can never reach, and `get_lsn` maps it back to `NULL_LSN`.
///
/// If a file-number difference exceeds 127 or a file offset exceeds
/// `MAX_FILE_OFFSET`, the rep mutates to `Long` (one `u64` per slot), matching
/// JE's `mutateToLongArray` (IN.java:1924).  An all-NULL node uses `Empty`
/// (0 bytes), matching the EMPTY_REP/initial-capacity-free state.
///
/// The rep only ever moves `Empty → Compact → Long`; nothing in `set` moves
/// it back to a smaller form.
#[derive(Debug)]
pub enum LsnRep {
    /// All slots NULL — 0 heap bytes (the `byteArray == null` initial state).
    Empty,
    /// `IN.entryLsnByteArray` — 4 bytes/slot, `base_file_number`-relative.
    /// `bytes.len()` is always a multiple of `BYTES_PER_LSN_ENTRY`; slot `i`
    /// occupies `bytes[4 * i..4 * i + 4]`.
    Compact { base_file_number: u32, bytes: Vec<u8> },
    /// `IN.entryLsnLongArray` — 8 bytes/slot fallback after `mutateToLongArray`.
    Long(Vec<Lsn>),
}
531
532impl LsnRep {
    /// `IN.BYTES_PER_LSN_ENTRY` (IN.java:151) — bytes per slot in the
    /// `Compact` rep: 1 file-number-offset byte followed by 3 file-offset
    /// bytes.
    pub const BYTES_PER_LSN_ENTRY: usize = 4;
    /// `IN.MAX_FILE_OFFSET` (IN.java:152) — max file offset the 3-byte form holds.
    /// One less than the largest 3-byte value, which is reserved for NULL.
    const MAX_FILE_OFFSET: u32 = 0x00ff_fffe;
    /// `IN.THREE_BYTE_NEGATIVE_ONE` (IN.java:153) — the NULL sentinel in the
    /// 3 file-offset bytes.
    const THREE_BYTE_NEGATIVE_ONE: u32 = 0x00ff_ffff;
    /// `Byte.MAX_VALUE` — max file-number difference the 1-byte offset holds.
    const MAX_FILE_NUMBER_OFFSET: u32 = 127;
542
543    /// A rep sized for `n` slots, all NULL.  Returns `Empty` (0 bytes); the
544    /// Compact byte array is lazily allocated by the first non-NULL `set_lsn`
545    /// — `base_file_number` is unknown until then (IN.java:1820, the
546    /// `baseFileNumber == -1` first-entry case).
547    #[inline]
548    pub fn new(_n: usize) -> Self {
549        LsnRep::Empty
550    }
551
552    /// Build a rep from a per-slot `Lsn` slice (used by node construction and
553    /// split, where slots arrive together).  Equivalent to `new(lsns.len())`
554    /// followed by `set(i, lsns[i])` for each slot.
555    pub fn from_lsns(lsns: &[Lsn]) -> Self {
556        let mut rep = LsnRep::Empty;
557        let n = lsns.len();
558        for (i, &lsn) in lsns.iter().enumerate() {
559            rep.set(i, lsn, n);
560        }
561        rep
562    }
563
564    /// `IN.getLsn(idx)` (IN.java:1752).
565    pub fn get(&self, idx: usize) -> Lsn {
566        match self {
567            LsnRep::Empty => NULL_LSN,
568            LsnRep::Long(v) => v.get(idx).copied().unwrap_or(NULL_LSN),
569            LsnRep::Compact { base_file_number, bytes } => {
570                let off = idx * Self::BYTES_PER_LSN_ENTRY;
571                if off + Self::BYTES_PER_LSN_ENTRY > bytes.len() {
572                    return NULL_LSN;
573                }
574                let file_offset = Self::get_3byte(bytes, off + 1);
575                if file_offset == Self::THREE_BYTE_NEGATIVE_ONE {
576                    NULL_LSN
577                } else {
578                    let file_number = base_file_number + bytes[off] as u32;
579                    Lsn::new(file_number, file_offset)
580                }
581            }
582        }
583    }
584
    /// `IN.setLsnInternal(idx, value)` (IN.java:1801) — set the LSN of slot
    /// `idx`, mutating Empty→Compact→Long as necessary.  `n` is the node's
    /// slot count (sizes a freshly-allocated Compact array).
    ///
    /// Behaviour per current form:
    ///   * `Empty`: a NULL `lsn` is a no-op; otherwise a `Compact` array of
    ///     `max(n, idx + 1)` slots is allocated with `lsn`'s file number as
    ///     the base, and the call is repeated on that array.
    ///   * `Long`: the vector is grown with `NULL_LSN` if `idx` is past the
    ///     end, then the slot is overwritten.
    ///   * `Compact`: the array is grown (new slots NULL) if needed; the value
    ///     is packed in place, re-basing downward when `lsn` has a lower file
    ///     number, or the whole rep falls back to `Long` when the value cannot
    ///     be packed.
    pub fn set(&mut self, idx: usize, lsn: Lsn, n: usize) {
        // Empty: first non-NULL value allocates the Compact array; a NULL set
        // on an Empty rep is a no-op (all slots already read NULL).
        if let LsnRep::Empty = self {
            if lsn.is_null() {
                return;
            }
            let cap = n.max(idx + 1);
            *self = LsnRep::Compact {
                base_file_number: lsn.file_number(),
                bytes: vec![0u8; cap * Self::BYTES_PER_LSN_ENTRY],
            };
            // Mark every other slot NULL (3-byte offset = 0xffffff).
            if let LsnRep::Compact { bytes, .. } = self {
                for s in 0..cap {
                    if s != idx {
                        Self::put_3byte(
                            bytes,
                            s * Self::BYTES_PER_LSN_ENTRY + 1,
                            Self::THREE_BYTE_NEGATIVE_ONE,
                        );
                    }
                }
            }
            // Re-enter on the now-Compact rep to store the value itself (and
            // to fall back to Long if its file offset is too large).
            self.set(idx, lsn, n);
            return;
        }

        // Long: plain per-slot storage, no packing constraints.
        if let LsnRep::Long(v) = self {
            if idx >= v.len() {
                v.resize(idx + 1, NULL_LSN);
            }
            v[idx] = lsn;
            return;
        }

        // Compact path.
        let LsnRep::Compact { base_file_number, bytes } = self else {
            unreachable!()
        };
        // Grow the packed array so slot `idx` exists; every slot added here
        // is marked NULL.
        let need = (idx + 1) * Self::BYTES_PER_LSN_ENTRY;
        if need > bytes.len() {
            let old = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
            bytes.resize(need, 0);
            for s in old..(idx + 1) {
                Self::put_3byte(
                    bytes,
                    s * Self::BYTES_PER_LSN_ENTRY + 1,
                    Self::THREE_BYTE_NEGATIVE_ONE,
                );
            }
        }
        let off = idx * Self::BYTES_PER_LSN_ENTRY;

        if lsn.is_null() {
            // IN.java:1812 — file-number offset 0, file offset -1 (0xffffff).
            bytes[off] = 0;
            Self::put_3byte(bytes, off + 1, Self::THREE_BYTE_NEGATIVE_ONE);
            return;
        }

        let this_file_number = lsn.file_number();
        let this_file_offset = lsn.file_offset();

        // Whether to fall back to the Long rep.  The offset check comes first
        // and short-circuits, so `adjust_file_numbers` (which rewrites
        // `bytes` on success) only runs when the offset itself is packable.
        let mutate = this_file_offset > Self::MAX_FILE_OFFSET || {
            if this_file_number < *base_file_number {
                // IN.java:1827 — try to re-base downward; bail if any existing
                // slot would then exceed the 1-byte file-number offset.
                !Self::adjust_file_numbers(
                    bytes,
                    *base_file_number,
                    this_file_number,
                )
            } else {
                this_file_number - *base_file_number
                    > Self::MAX_FILE_NUMBER_OFFSET
            }
        };

        if mutate {
            // IN.java:1924 mutateToLongArray.
            // `bytes` is still relative to the old base here, so every
            // existing slot is decoded against `*base_file_number`.
            // NOTE(review): `self_get_compact` is defined outside this block;
            // presumably it decodes slot `s` the same way `get` does — confirm.
            let nelts = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
            let mut longs = vec![NULL_LSN; nelts.max(idx + 1)];
            for (s, slot) in longs.iter_mut().enumerate().take(nelts) {
                *slot = self_get_compact(*base_file_number, bytes, s);
            }
            longs[idx] = lsn;
            *self = LsnRep::Long(longs);
            return;
        }

        // A successful re-base above already rewrote the slots; record the
        // new base before encoding this slot against it.
        if this_file_number < *base_file_number {
            *base_file_number = this_file_number;
        }
        bytes[off] = (this_file_number - *base_file_number) as u8;
        Self::put_3byte(bytes, off + 1, this_file_offset);
    }
686
687    /// `IN.adjustFileNumbers` (IN.java:1855) — re-base to a lower file number,
688    /// rewriting every existing slot's 1-byte offset.  Returns false (and
689    /// leaves `bytes` unchanged) if any slot would overflow the 1-byte offset.
690    fn adjust_file_numbers(
691        bytes: &mut [u8],
692        old_base: u32,
693        new_base: u32,
694    ) -> bool {
695        let stride = Self::BYTES_PER_LSN_ENTRY;
696        // First pass: verify none overflow.
697        let mut i = 0;
698        while i < bytes.len() {
699            if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
700                let cur_fn = old_base + bytes[i] as u32;
701                if cur_fn - new_base > Self::MAX_FILE_NUMBER_OFFSET {
702                    return false;
703                }
704            }
705            i += stride;
706        }
707        // Second pass: apply.
708        let mut i = 0;
709        while i < bytes.len() {
710            if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
711                let cur_fn = old_base + bytes[i] as u32;
712                bytes[i] = (cur_fn - new_base) as u8;
713            }
714            i += stride;
715        }
716        true
717    }
718
719    /// `INArrayRep.copy` analogue: shift LSNs when an entry is inserted at
720    /// `idx` (slots `>= idx` move up one).  Mirrors `targets.insert_shift`.
721    pub fn insert_shift(&mut self, idx: usize, n: usize) {
722        match self {
723            LsnRep::Empty => {}
724            LsnRep::Long(v) => {
725                if idx <= v.len() {
726                    v.insert(idx, NULL_LSN);
727                }
728            }
729            LsnRep::Compact { bytes, .. } => {
730                let stride = Self::BYTES_PER_LSN_ENTRY;
731                let cap = (n.max((bytes.len() / stride) + 1)) * stride;
732                bytes.resize(cap, 0);
733                let at = idx * stride;
734                // Shift the tail up by one slot.
735                bytes.copy_within(at..cap - stride, at + stride);
736                // The new slot reads NULL.
737                Self::put_3byte(bytes, at + 1, Self::THREE_BYTE_NEGATIVE_ONE);
738            }
739        }
740    }
741
742    /// `INArrayRep.copy` analogue: shift LSNs when entry `idx` is removed
743    /// (slots `> idx` move down one).  Mirrors `targets.remove_shift`.
744    pub fn remove_shift(&mut self, idx: usize) {
745        match self {
746            LsnRep::Empty => {}
747            LsnRep::Long(v) => {
748                if idx < v.len() {
749                    v.remove(idx);
750                }
751            }
752            LsnRep::Compact { bytes, .. } => {
753                let stride = Self::BYTES_PER_LSN_ENTRY;
754                let at = idx * stride;
755                if at + stride <= bytes.len() {
756                    bytes.copy_within(at + stride.., at);
757                    let newlen = bytes.len() - stride;
758                    bytes.truncate(newlen);
759                }
760            }
761        }
762    }
763
764    /// `IN.computeLsnOverhead` analogue: heap bytes of the rep itself.
765    pub fn memory_size(&self) -> usize {
766        use std::mem::size_of;
767        match self {
768            LsnRep::Empty => 0,
769            LsnRep::Compact { bytes, .. } => bytes.capacity(),
770            LsnRep::Long(v) => v.capacity() * size_of::<Lsn>(),
771        }
772    }
773
774    fn put_3byte(bytes: &mut [u8], offset: usize, value: u32) {
775        bytes[offset] = (value & 0xFF) as u8;
776        bytes[offset + 1] = ((value >> 8) & 0xFF) as u8;
777        bytes[offset + 2] = ((value >> 16) & 0xFF) as u8;
778    }
779
780    fn get_3byte(bytes: &[u8], offset: usize) -> u32 {
781        (bytes[offset] as u32)
782            | ((bytes[offset + 1] as u32) << 8)
783            | ((bytes[offset + 2] as u32) << 16)
784    }
785}
786
787/// Helper used by `LsnRep::set` during `mutateToLongArray` to read an existing
788/// Compact slot without borrowing `self` (which is mid-mutation).
789fn self_get_compact(base_file_number: u32, bytes: &[u8], idx: usize) -> Lsn {
790    let off = idx * LsnRep::BYTES_PER_LSN_ENTRY;
791    let file_offset = LsnRep::get_3byte(bytes, off + 1);
792    if file_offset == LsnRep::THREE_BYTE_NEGATIVE_ONE {
793        NULL_LSN
794    } else {
795        Lsn::new(base_file_number + bytes[off] as u32, file_offset)
796    }
797}
798
/// Default compact-key threshold (16): the value of
/// `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH` (INKeyRep.java) and the
/// default of the `TREE_COMPACT_MAX_KEY_LENGTH` config setting.
// NOTE(review): the mixed-case name presumably mirrors the JE identifier,
// which is why the `non_upper_case_globals` lint is silenced here — confirm.
#[allow(non_upper_case_globals)]
pub const INKeyRep_DEFAULT_MAX_KEY_LENGTH: i32 = 16;
803
/// T-2: node-level key array — `INKeyRep.{Default,MaxKeySize}` (INKeyRep.java).
///
/// The per-slot key that used to live in `BinEntry`/`InEntry` as a `Vec<u8>`
/// (24-byte header + a separate heap allocation per key) is hoisted here as a
/// node-level rep.  When every (post-prefix) key in the node is `<=`
/// `TREE_COMPACT_MAX_KEY_LENGTH` (default 16) the keys pack into ONE
/// fixed-width byte buffer (`MaxKeySize`): `slot_width` bytes per slot, with a
/// parallel `lengths` vector tracking the actual length of each key.  A key
/// longer than the slot width inflates the whole node to the `Default` rep
/// (one `Vec<u8>` per slot), matching JE's `Default.compact` /
/// `MaxKeySize.expandToDefaultRep`.
///
/// Because `lengths` stores `u16` values, a key longer than `u16::MAX` bytes
/// is never held in the `Compact` rep, whatever threshold is configured.
///
/// As in JE, this stores the UNPREFIXED suffix (key prefixing strips the
/// common prefix first), so the compact rep is the smaller post-prefix bytes.
#[derive(Debug, Clone)]
pub enum KeyRep {
    /// `INKeyRep.Default` — one owned key per slot (any length).
    Default(Vec<Vec<u8>>),
    /// `INKeyRep.MaxKeySize` — all keys packed into one fixed-width buffer.
    /// `buf.len() == slot_width * lengths.len()`; slot `i` occupies
    /// `buf[i*slot_width .. i*slot_width + lengths[i]]`.
    Compact { buf: Vec<u8>, slot_width: usize, lengths: Vec<u16> },
}

impl KeyRep {
    /// Longest key a `Compact` slot can describe; `lengths` entries are `u16`.
    const MAX_COMPACT_KEY_LEN: usize = u16::MAX as usize;

    /// An empty `Default` rep.
    #[inline]
    pub fn new() -> Self {
        KeyRep::Default(Vec::new())
    }

    /// Build a `Default` rep from owned keys (callers may later `compact`).
    #[inline]
    pub fn from_keys(keys: Vec<Vec<u8>>) -> Self {
        KeyRep::Default(keys)
    }

    /// Number of slots.
    #[inline]
    pub fn len(&self) -> usize {
        match self {
            KeyRep::Default(v) => v.len(),
            KeyRep::Compact { lengths, .. } => lengths.len(),
        }
    }

    /// True when the rep holds no slots.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// `INKeyRep.get(idx)` / `getKey` — borrow the (post-prefix) key at slot
    /// `idx` without allocating.
    ///
    /// # Panics
    /// Panics if `idx` is not a valid slot.
    #[inline]
    pub fn get(&self, idx: usize) -> &[u8] {
        match self {
            KeyRep::Default(v) => v[idx].as_slice(),
            KeyRep::Compact { buf, slot_width, lengths } => {
                let off = idx * slot_width;
                &buf[off..off + lengths[idx] as usize]
            }
        }
    }

    /// Set the key at slot `idx`.  A key that does not fit a Compact rep's
    /// slot (longer than `slot_width`, or longer than a `u16` length can
    /// record) inflates the rep to `Default` first
    /// (`MaxKeySize.expandToDefaultRep`).
    ///
    /// # Panics
    /// Panics if `idx` is not a valid slot.
    pub fn set(&mut self, idx: usize, key: Vec<u8>) {
        match self {
            KeyRep::Default(v) => v[idx] = key,
            KeyRep::Compact { slot_width, .. }
                if key.len() > *slot_width
                    || key.len() > Self::MAX_COMPACT_KEY_LEN =>
            {
                self.inflate_to_default();
                self.set(idx, key);
            }
            KeyRep::Compact { buf, slot_width, lengths } => {
                let off = idx * *slot_width;
                // Bytes past `key.len()` in the slot may be stale; they are
                // never read because `lengths[idx]` bounds every access.
                buf[off..off + key.len()].copy_from_slice(&key);
                // Lossless: the guard above bounds the length by u16::MAX.
                lengths[idx] = key.len() as u16;
            }
        }
    }

    /// Insert a key at slot `idx`, shifting later slots up (mirrors
    /// `Vec::insert` + `INArrayRep.copy`).  A key that does not fit a Compact
    /// slot inflates the rep to `Default` first.
    ///
    /// # Panics
    /// Panics if `idx` is greater than the current slot count.
    pub fn insert(&mut self, idx: usize, key: Vec<u8>) {
        match self {
            KeyRep::Default(v) => v.insert(idx, key),
            KeyRep::Compact { slot_width, .. }
                if key.len() > *slot_width
                    || key.len() > Self::MAX_COMPACT_KEY_LEN =>
            {
                self.inflate_to_default();
                self.insert(idx, key);
            }
            KeyRep::Compact { buf, slot_width, lengths } => {
                let sw = *slot_width;
                let at = idx * sw;
                // Open one zero-filled slot, then copy the key into it.
                buf.splice(at..at, std::iter::repeat_n(0u8, sw));
                buf[at..at + key.len()].copy_from_slice(&key);
                // Lossless: the guard above bounds the length by u16::MAX.
                lengths.insert(idx, key.len() as u16);
            }
        }
    }

    /// Remove the key at slot `idx`, shifting later slots down, and return
    /// it as an owned vector.
    ///
    /// # Panics
    /// Panics if `idx` is not a valid slot.
    pub fn remove(&mut self, idx: usize) -> Vec<u8> {
        match self {
            KeyRep::Default(v) => v.remove(idx),
            KeyRep::Compact { buf, slot_width, lengths } => {
                let sw = *slot_width;
                let len = lengths[idx] as usize;
                let at = idx * sw;
                let out = buf[at..at + len].to_vec();
                buf.drain(at..at + sw);
                lengths.remove(idx);
                out
            }
        }
    }

    /// `INKeyRep.MaxKeySize.expandToDefaultRep` — mutate a Compact rep to a
    /// Default rep (one owned `Vec<u8>` per slot).  No-op when already
    /// Default.
    fn inflate_to_default(&mut self) {
        if let KeyRep::Compact { .. } = self {
            let keys: Vec<Vec<u8>> =
                (0..self.len()).map(|i| self.get(i).to_vec()).collect();
            *self = KeyRep::Default(keys);
        }
    }

    /// `INKeyRep.Default.compact(parent)` (INKeyRep.java) — if every key in a
    /// `Default` rep fits `compact_max_key_length`, pack them into a
    /// `MaxKeySize` (`Compact`) rep.  `compact_max_key_length <= 0` disables
    /// compaction.  No-op when already Compact or when the rep is empty.
    ///
    /// The rep also stays `Default` when the longest key exceeds `u16::MAX`
    /// bytes, even if the configured threshold would allow it: the Compact
    /// rep records lengths as `u16`, so packing such a key would truncate
    /// its recorded length and corrupt every later read of the slot.
    pub fn compact(&mut self, compact_max_key_length: i32) {
        if compact_max_key_length <= 0 {
            return;
        }
        let KeyRep::Default(keys) = self else {
            return; // already Compact
        };
        if keys.is_empty() {
            return;
        }
        let max_len = keys.iter().map(Vec::len).max().unwrap_or(0);
        // The threshold is known positive here, so the cast cannot wrap.
        if max_len > compact_max_key_length as usize
            || max_len > Self::MAX_COMPACT_KEY_LEN
        {
            return; // a key exceeds the threshold — stay Default
        }
        // Keep at least one byte per slot so all-empty keys still get slots.
        let slot_width = max_len.max(1);
        let mut buf = vec![0u8; slot_width * keys.len()];
        let mut lengths = Vec::with_capacity(keys.len());
        for (i, k) in keys.iter().enumerate() {
            let off = i * slot_width;
            buf[off..off + k.len()].copy_from_slice(k);
            // Lossless: every key length is <= max_len <= u16::MAX.
            lengths.push(k.len() as u16);
        }
        *self = KeyRep::Compact { buf, slot_width, lengths };
    }

    /// True when key-byte memory is accounted for inside this rep (Compact),
    /// vs per-slot `Vec` allocations (Default).
    /// `INKeyRep.accountsForKeyByteMemUsage`.
    #[inline]
    pub fn is_compact(&self) -> bool {
        matches!(self, KeyRep::Compact { .. })
    }

    /// Heap bytes of the rep itself (`INKeyRep.calculateMemorySize` +
    /// key-byte accounting), measured by allocated capacity.  For Default
    /// this is the outer vector's slot headers plus each key's heap
    /// allocation; for Compact it is the single buffer plus the lengths
    /// vector.
    pub fn memory_size(&self) -> usize {
        use std::mem::size_of;
        match self {
            KeyRep::Default(v) => {
                v.capacity() * size_of::<Vec<u8>>()
                    + v.iter().map(|k| k.capacity()).sum::<usize>()
            }
            KeyRep::Compact { buf, lengths, .. } => {
                buf.capacity() + lengths.capacity() * size_of::<u16>()
            }
        }
    }
}

impl Default for KeyRep {
    /// Same as [`KeyRep::new`]: an empty `Default` rep.
    fn default() -> Self {
        KeyRep::new()
    }
}
991
/// Lightweight upper-IN representation used by the tree traversal layer.
///
/// Counterpart of JE's `IN`: carries the dirty flag (IN_DIRTY_BIT), the LRU
/// generation counter, and a weak back-pointer to the parent so that
/// dirty state can be propagated upward.  Per-slot LSNs and cached child
/// pointers are not stored in `entries`; they live in the node-level
/// `lsn_rep` and `targets` reps, indexed by the same slot number.
#[derive(Debug)]
pub struct InNodeStub {
    /// Node ID.
    pub node_id: u64,
    /// Level in tree.
    pub level: i32,
    /// Child entries, one per slot.  Each `InEntry` holds only the slot's
    /// key; the slot's LSN is in `lsn_rep` and its cached child in `targets`.
    pub entries: Vec<InEntry>,
    /// T-4: per-node resident-child-pointer representation.
    ///
    /// `IN.entryTargets` (`INTargetRep`).  The cached child pointer is no
    /// longer a per-`InEntry` `Option<Arc>` (which cost a pointer-sized slot
    /// even when no child was resident); it lives here as a compact
    /// node-level rep that starts `None` (0 child-pointer bytes — most upper
    /// INs have no resident children), grows to `Sparse` for a few cached
    /// children, and inflates to `Default` (the full parallel array) once
    /// many children are resident.  See `INTargetRep.{None,Sparse,Default}`.
    pub targets: TargetRep,
    /// Dirty flag — set whenever this node is modified.
    /// `IN.dirty` (IN_DIRTY_BIT).
    pub dirty: bool,
    /// LRU generation counter for the evictor.
    /// `IN.generation`.
    pub generation: u64,
    /// Weak back-pointer to parent IN.
    /// Enables dirty-propagation and latch-coupling validation.
    /// `IN.parent` reference used during splits and logging.
    pub parent: Option<Weak<RwLock<TreeNode>>>,
    /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`).  The per-slot
    /// `lsn` (8 bytes) that used to live in `InEntry` is hoisted here as a
    /// `base_file_number`-relative 4-byte-per-slot rep, falling back to a
    /// `u64`-per-slot `Long` rep only when a node's LSN range exceeds the
    /// compact form.  Access via `get_lsn(slot)` / `set_lsn(slot, lsn)`.
    pub lsn_rep: LsnRep,
}
1032
/// Entry in an IN node: the per-slot data that is still stored inline.
///
/// Only the key remains here; the other per-slot state was hoisted to
/// node-level reps on `InNodeStub`:
///
/// T-4: the resident-child pointer that used to live here (`Option<Arc>`) was
/// hoisted to the node-level `InNodeStub.targets` (`INTargetRep`); access the
/// child for slot `i` via `InNodeStub::get_child(i)` / `set_child` / etc.
///
/// T-3: the per-slot `lsn` (8 bytes) that used to live here was hoisted to the
/// node-level `InNodeStub.lsn_rep` (`IN.entryLsnByteArray`); access the LSN for
/// slot `i` via `InNodeStub::get_lsn(i)` / `set_lsn(i, lsn)`.
#[derive(Debug, Clone)]
pub struct InEntry {
    /// Key for this entry.
    pub key: Vec<u8>,
}
1047
/// Lightweight BIN representation used by the tree traversal layer.
///
/// `BIN` (which extends `IN`): carries the dirty flag, LRU
/// generation counter, and a weak back-pointer to the parent IN.
///
/// Per-slot state is split across three parallel, slot-indexed structures:
/// `entries` (data and flags), `keys` (the stored key suffixes), and
/// `lsn_rep` (the slot LSNs).
///
/// # Key Prefix Compression
///
/// BINs support key prefix compression.  When `key_prefix` is non-empty,
/// every slot in `keys` stores only the *suffix* — the bytes after stripping
/// the common leading bytes.  The full key is reconstructed by prepending
/// `key_prefix` to the stored suffix.
///
/// This is transparent to callers through the `get_full_key` / `find_entry`
/// helpers on `BinStub`.  The prefix is recomputed after every insert and
/// after a split via `recompute_key_prefix`.
#[derive(Debug)]
pub struct BinStub {
    /// Node ID.
    pub node_id: u64,
    /// Level (always BIN_LEVEL).
    pub level: i32,
    /// Per-slot entries (embedded data, known-deleted / dirty flags, and
    /// expiration).  The slot keys are NOT stored here: they live in `keys`,
    /// which holds the post-prefix suffix of each slot's key.
    pub entries: Vec<BinEntry>,
    /// Common prefix shared by every key in this BIN.
    /// Empty slice means no prefix compression is active.
    /// `IN.keyPrefix`.
    pub key_prefix: Vec<u8>,
    /// Dirty flag — set whenever this BIN is modified.
    /// `IN.dirty` (IN_DIRTY_BIT).
    pub dirty: bool,
    /// BIN-delta flag — true when this BIN contains only dirty (delta) slots
    /// rather than a complete set of entries.
    /// `IN.IN_DELTA_BIT` (the IN_DELTA_BIT flag inside `flags`).
    pub is_delta: bool,
    /// LSN at which this BIN was last logged as a full (non-delta) BIN.
    ///
    /// Used by the checkpoint path to construct `BINDeltaLogEntry.prev_full_lsn`
    /// and to compare against `prev_delta_lsn` when deciding whether to write
    /// a delta or a full BIN.
    ///
    /// `BIN.lastFullLsn`.
    pub last_full_lsn: Lsn,
    /// LSN at which this BIN was last logged as a BIN-delta.
    ///
    /// Written as `prev_delta_lsn` into the next `BINDeltaLogEntry` so the
    /// cleaner's utilization tracker can mark the superseded delta obsolete.
    /// Reset to `NULL_LSN` whenever a full BIN is written.
    ///
    /// `BIN.lastDeltaVersion` / `BIN.getLastDeltaLsn()`.
    pub last_delta_lsn: Lsn,
    /// LRU generation counter for the evictor.
    /// `IN.generation`.
    pub generation: u64,
    /// Weak back-pointer to parent IN.
    /// Enables dirty-propagation and latch-coupling validation.
    pub parent: Option<Weak<RwLock<TreeNode>>>,
    /// If true, `BinEntry.expiration_time` values in this BIN are packed hours
    /// since epoch; if false, they are packed seconds since epoch.
    ///
    /// Default: `true` (hours, matching TTL resolution).
    ///
    /// `BIN.expirationInHours`.
    pub expiration_in_hours: bool,
    /// Number of cursors currently positioned on this BIN.
    ///
    /// The evictor skips BINs with a non-zero cursor count to avoid evicting
    /// a node that a cursor is actively traversing.  CursorImpl increments
    /// this when positioning on a BIN and decrements it on reposition/close.
    /// `strip_lns` also refuses to drop slot data while this is positive.
    ///
    /// `IN.cursorSet.size()` used by `Evictor.selectIN()`.
    pub cursor_count: i32,
    /// When true, the NEXT log of this BIN must be a full BIN, not a delta.
    ///
    /// Set after a dirty slot is removed (a delta would silently lose that
    /// removal) and cleared after a full BIN is written.  This is the
    /// delta-chain bound: it forces a periodic full BIN so a delta never
    /// references stale state.
    ///
    /// `IN.prohibitNextDelta` / `IN.setProhibitNextDelta` (IN.java:5013) /
    /// `IN.getProhibitNextDelta`.
    pub prohibit_next_delta: bool,
    /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`).  The per-slot
    /// `lsn` (8 bytes) that used to live in `BinEntry` is hoisted here as a
    /// `base_file_number`-relative 4-byte-per-slot rep.  Access via
    /// `get_lsn(slot)` / `set_lsn(slot, lsn)`.
    pub lsn_rep: LsnRep,
    /// T-2: per-node key array (`INKeyRep.{Default,MaxKeySize}`).  The per-slot
    /// `key` (`Vec<u8>`, 24-byte header + heap alloc) that used to live in
    /// `BinEntry` is hoisted here.  Stores the post-prefix SUFFIX (key
    /// prefixing strips the common prefix first).  Packs into one fixed-width
    /// buffer (`Compact`) when every suffix is `<= compact_max_key_length`,
    /// else one `Vec<u8>` per slot (`Default`).  `keys.len()` is kept in lock
    /// step with `entries.len()`.  Access via `get_key(slot)` /
    /// `get_full_key(slot)`.
    pub keys: KeyRep,
    /// T-5: the node's compact-key threshold (`IN.getCompactMaxKeyLength`),
    /// copied from the owning `Tree` at construction so `apply_new_prefix` can
    /// decide whether the suffixes now fit `MaxKeySize`.  Default 16.
    pub compact_max_key_length: i32,
}
1150
/// Entry in a BIN node: the per-slot data and flags that are still stored
/// inline.  The slot's key and LSN live in node-level reps on `BinStub`.
///
/// T-2: the per-slot key was hoisted to the node-level `BinStub.keys`
/// (`INKeyRep`); access it via `BinStub::get_key(i)` / `get_full_key(i)`.
///
/// T-3: the per-slot `lsn` (8 bytes) that used to live here was hoisted to the
/// node-level `BinStub.lsn_rep` (`IN.entryLsnByteArray`); access the LSN for
/// slot `i` via `BinStub::get_lsn(i)` / `set_lsn(i, lsn)`.
#[derive(Debug, Clone)]
pub struct BinEntry {
    /// Optional embedded data (for small records) or cached LN.
    /// `None` when no value is held in memory for this slot.
    pub data: Option<Vec<u8>>,
    /// True when this slot has been marked known-deleted (analogous to the
    /// KNOWN_DELETED_BIT in `IN.entryStates`).  The slot is eligible for
    /// removal by `compress_bin()`.
    pub known_deleted: bool,
    /// True when this slot has been modified since the last full BIN log write.
    ///
    /// `IN.entryStates[i] & IN_DIRTY_BIT`.  Used by the checkpoint
    /// path to decide whether to write a BIN-delta (few dirty slots) or a
    /// full BIN (many dirty slots).
    pub dirty: bool,
    /// Packed expiration time (0 = no expiration).
    ///
    /// When the owning `BinStub.expiration_in_hours` is true, this value is
    /// hours since Unix epoch; otherwise it is seconds since Unix epoch.
    ///
    /// `IN.entryExpiration`.
    pub expiration_time: u32,
}
1178
1179impl InNodeStub {
1180    /// `IN.getTarget(idx)` — the resident child cached for slot `idx`, cloned
1181    /// (a strong `Arc`), or `None` if the child is not cached.  Routes through
1182    /// the node-level `INTargetRep` (T-4).
1183    #[inline]
1184    pub fn get_child(&self, idx: usize) -> Option<ChildArc> {
1185        self.targets.get(idx).cloned()
1186    }
1187
1188    /// Borrow the resident child for slot `idx` without cloning.
1189    #[inline]
1190    pub fn child_ref(&self, idx: usize) -> Option<&ChildArc> {
1191        self.targets.get(idx)
1192    }
1193
1194    /// True if slot `idx` has no resident (cached) child.
1195    /// `IN.getTarget(idx) == null`.
1196    #[inline]
1197    pub fn child_is_none(&self, idx: usize) -> bool {
1198        self.targets.get(idx).is_none()
1199    }
1200
1201    /// `IN.setTarget(idx, node)` — set (or clear) the cached child for slot
1202    /// `idx`, mutating the `INTargetRep` upward as needed.
1203    #[inline]
1204    pub fn set_child(&mut self, idx: usize, node: Option<ChildArc>) {
1205        self.targets.set(idx, node);
1206    }
1207
1208    /// `IN.detachNode` helper — remove and return the cached child for slot
1209    /// `idx`, leaving the slot's key/LSN intact for re-fetch.
1210    #[inline]
1211    pub fn take_child(&mut self, idx: usize) -> Option<ChildArc> {
1212        self.targets.take(idx)
1213    }
1214
1215    /// `IN.getLsn(idx)` (IN.java:1752) — the LSN of slot `idx` via the
1216    /// node-level packed `LsnRep` (T-3).
1217    #[inline]
1218    pub fn get_lsn(&self, idx: usize) -> Lsn {
1219        self.lsn_rep.get(idx)
1220    }
1221
1222    /// `IN.setLsn(idx, lsn)` (IN.java:1773) — set the LSN of slot `idx` via
1223    /// the node-level packed `LsnRep` (T-3).
1224    #[inline]
1225    pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
1226        let n = self.entries.len();
1227        self.lsn_rep.set(idx, lsn, n);
1228    }
1229
1230    /// Insert an entry at `idx`, shifting the child mapping to stay aligned
1231    /// (`INArrayRep.copy`), then set the new slot's cached child.  Mirrors the
1232    /// old `entries.insert(idx, InEntry{ child: ..})` in one call.
1233    pub fn insert_entry(
1234        &mut self,
1235        idx: usize,
1236        key: Vec<u8>,
1237        lsn: Lsn,
1238        child: Option<ChildArc>,
1239    ) {
1240        self.entries.insert(idx, InEntry { key });
1241        let n = self.entries.len();
1242        self.lsn_rep.insert_shift(idx, n);
1243        self.lsn_rep.set(idx, lsn, n);
1244        self.targets.insert_shift(idx);
1245        if child.is_some() {
1246            self.targets.set(idx, child);
1247        }
1248    }
1249
1250    /// Remove the entry at `idx`, shifting the child mapping to stay aligned
1251    /// (`INArrayRep.copy`).  Returns the removed `InEntry` (key).
1252    pub fn remove_entry(&mut self, idx: usize) -> InEntry {
1253        let e = self.entries.remove(idx);
1254        self.lsn_rep.remove_shift(idx);
1255        self.targets.remove_shift(idx);
1256        e
1257    }
1258
1259    /// All resident children (cloned `Arc`s), in unspecified order.
1260    /// Replaces `entries.iter().filter_map(|e| e.child.clone())`.
1261    pub fn resident_children(&self) -> Vec<ChildArc> {
1262        self.targets.iter_children().collect()
1263    }
1264
1265    /// `(slot_index, child)` of the first resident child, if any.
1266    pub fn first_resident_child(&self) -> Option<(usize, ChildArc)> {
1267        (0..self.entries.len())
1268            .find_map(|i| self.targets.get(i).map(|c| (i, c.clone())))
1269    }
1270}
1271
1272impl BinStub {
1273    /// `IN.getLsn(idx)` (IN.java:1752) — the LSN of slot `idx` via the
1274    /// node-level packed `LsnRep` (T-3).
1275    #[inline]
1276    pub fn get_lsn(&self, idx: usize) -> Lsn {
1277        self.lsn_rep.get(idx)
1278    }
1279
1280    /// `IN.setLsn(idx, lsn)` (IN.java:1773) — set the LSN of slot `idx` via
1281    /// the node-level packed `LsnRep` (T-3).
1282    #[inline]
1283    pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
1284        let n = self.entries.len();
1285        self.lsn_rep.set(idx, lsn, n);
1286    }
1287
1288    /// TREE-F1: the single user-facing liveness predicate for a BIN slot.
1289    ///
1290    /// A slot is LIVE for reads/scans iff it is neither `known_deleted` nor
1291    /// TTL-expired.  This mirrors the two ways JE makes a slot read as ABSENT:
1292    ///   * `IN.findEntry` (IN.java:3197) returns -1 for a `known_deleted`
1293    ///     exact match;
1294    ///   * `CursorImpl.isProbablyExpired` / `lockAndGetCurrent`
1295    ///     (CursorImpl.java:2062-2064) skip `isEntryKnownDeleted` (and
1296    ///     expired) slots while stepping.
1297    ///
1298    /// KD slots legitimately exist in live BINs during BIN-delta
1299    /// reconstitution until the compressor reclaims them; the maintenance
1300    /// paths (compressor / recovery undo) iterate them on purpose and do NOT
1301    /// use this predicate.
1302    #[inline]
1303    pub fn slot_is_live(&self, idx: usize) -> bool {
1304        match self.entries.get(idx) {
1305            Some(e) => {
1306                !(e.known_deleted
1307                    || (e.expiration_time != 0
1308                        && noxu_util::ttl::is_expired(
1309                            e.expiration_time,
1310                            self.expiration_in_hours,
1311                        )))
1312            }
1313            None => false,
1314        }
1315    }
1316
1317    // ========================================================================
1318    // Key prefix compression helpers
1319    // IN.computeKeyPrefix / IN.recalcSuffixes / IN.getKey
1320    // ========================================================================
1321
1322    /// Strips embedded LN data from non-dirty slots, freeing the heap
1323    /// allocations of the per-slot value bytes while keeping the slot keys
1324    /// and LSNs addressable.  Used by the evictor's PartialEvict path: a
1325    /// hot BIN is kept in cache so its descent path stays warm, but the LN
1326    /// data is dropped to make room for hotter content.  Subsequent reads
1327    /// re-fetch the data from the log via the slot LSN.
1328    ///
1329    /// Skips slots that are still dirty (their data has not been written
1330    /// to the log yet, so dropping the in-memory copy would lose the
1331    /// update).  Returns the number of bytes freed (sum of the lengths
1332    /// of the dropped `Vec<u8>` data fields).
1333    ///
1334    /// Returns 0 if the BIN has any open cursors (the cursor may be
1335    /// reading the data right now).
1336    pub fn strip_lns(&mut self) -> usize {
1337        if self.cursor_count > 0 {
1338            return 0;
1339        }
1340        let mut freed = 0usize;
1341        for idx in 0..self.entries.len() {
1342            // JE BIN.evictLNs / LN.isEvictable (LN.java:263 returns true): an
1343            // LN's in-memory value can be stripped whenever it is recoverable
1344            // from the log — i.e. the slot has a valid (logged) LSN — REGARDLESS
1345            // of the dirty bit.  The dirty bit governs whether the BIN's
1346            // *structure* needs re-logging at the next checkpoint (BIN-delta vs
1347            // full BIN), NOT whether the LN *value* is durable: a transactional
1348            // commit logs the LN, so the slot's LSN points at the durable copy
1349            // even while the slot is still dirty.  Gating the strip on `!dirty`
1350            // (the previous behaviour) meant a freshly-written, not-yet-
1351            // checkpointed record — the common case under a write/recently-read
1352            // workload — could never be stripped, so eviction reclaimed almost
1353            // nothing under pressure (EVICTOR-RECLAIM-1).  A slot with a NULL/
1354            // transient LSN (a deferred-write LN never logged) is NOT
1355            // strippable — its only copy is the in-memory value.
1356            if self.get_lsn(idx) == NULL_LSN {
1357                continue;
1358            }
1359            if let Some(data) = self.entries[idx].data.take() {
1360                freed = freed.saturating_add(data.len());
1361            }
1362        }
1363        freed
1364    }
1365
1366    /// Reconstruct the full key for slot `idx` by prepending the BIN's
1367    /// current prefix to the stored suffix.
1368    ///
1369    /// `IN.getKey(int idx)`.
1370    pub fn get_full_key(&self, idx: usize) -> Option<Vec<u8>> {
1371        if idx >= self.keys.len() {
1372            return None;
1373        }
1374        let suffix = self.keys.get(idx); // T-2
1375        if self.key_prefix.is_empty() {
1376            Some(suffix.to_vec())
1377        } else {
1378            let mut full =
1379                Vec::with_capacity(self.key_prefix.len() + suffix.len());
1380            full.extend_from_slice(&self.key_prefix);
1381            full.extend_from_slice(suffix);
1382            Some(full)
1383        }
1384    }
1385
1386    /// Borrow the stored (post-prefix) suffix at slot `idx` (`INKeyRep.get`).
1387    #[inline]
1388    pub fn get_key(&self, idx: usize) -> &[u8] {
1389        self.keys.get(idx)
1390    }
1391
1392    /// T-2: insert a new slot at `idx` keeping the parallel `entries`, `keys`,
1393    /// and `lsn_rep` arrays in lock step.  `suffix` is the post-prefix key.
1394    fn insert_slot(
1395        &mut self,
1396        idx: usize,
1397        suffix: Vec<u8>,
1398        lsn: Lsn,
1399        data: Option<Vec<u8>>,
1400    ) {
1401        self.entries.insert(
1402            idx,
1403            BinEntry {
1404                data,
1405                known_deleted: false,
1406                dirty: true,
1407                expiration_time: 0,
1408            },
1409        );
1410        self.keys.insert(idx, suffix); // T-2
1411        let n = self.entries.len();
1412        self.lsn_rep.insert_shift(idx, n); // T-3
1413        self.lsn_rep.set(idx, lsn, n);
1414    }
1415
1416    /// Decompress a stored suffix back to a full key.
1417    ///
1418    /// `IN.getKey` used from outside: prepend `key_prefix` to
1419    /// `suffix`.  If `key_prefix` is empty the suffix *is* the full key.
1420    pub fn decompress_key(&self, suffix: &[u8]) -> Vec<u8> {
1421        if self.key_prefix.is_empty() {
1422            suffix.to_vec()
1423        } else {
1424            let mut full =
1425                Vec::with_capacity(self.key_prefix.len() + suffix.len());
1426            full.extend_from_slice(&self.key_prefix);
1427            full.extend_from_slice(suffix);
1428            full
1429        }
1430    }
1431
1432    /// Strip the current prefix from a full key to obtain the stored suffix.
1433    ///
1434    /// `IN.computeKeySuffix(byte[] prefix, byte[] key)`.
1435    ///
1436    /// # Panics
1437    /// Panics (debug only) if `full_key` does not start with `key_prefix`.
1438    pub fn compress_key(&self, full_key: &[u8]) -> Vec<u8> {
1439        let plen = self.key_prefix.len();
1440        if plen == 0 {
1441            full_key.to_vec()
1442        } else {
1443            debug_assert!(
1444                full_key.starts_with(&self.key_prefix),
1445                "compress_key: key does not start with current prefix"
1446            );
1447            full_key[plen..].to_vec()
1448        }
1449    }
1450
1451    /// Compute the longest common prefix of all full keys currently in this
1452    /// BIN, optionally excluding the entry at `exclude_idx` (used during
1453    /// insertions to ignore the slot that is about to be replaced).
1454    ///
1455    /// Returns an empty `Vec` if the BIN has fewer than 2 entries or if the
1456    /// keys share no common leading bytes.
1457    ///
1458    /// `IN.computeKeyPrefix(int excludeIdx)`.
1459    pub fn compute_key_prefix(&self, exclude_idx: Option<usize>) -> Vec<u8> {
1460        // Need at least 2 entries to find a common prefix.
1461        let n = self.keys.len();
1462        if n < 2 {
1463            return Vec::new();
1464        }
1465
1466        // Pick the first non-excluded index as the seed.
1467        let first_idx = match exclude_idx {
1468            Some(0) => 1,
1469            _ => 0,
1470        };
1471
1472        // The current prefix_len is taken from the seed full key.
1473        let seed_full = match self.get_full_key(first_idx) {
1474            Some(k) => k,
1475            None => return Vec::new(),
1476        };
1477        let mut prefix_len = seed_full.len();
1478
1479        // Compare every other non-excluded entry against the running prefix.
1480        // Iterate all entries (byteOrdered disabled in too).
1481        for i in (first_idx + 1)..n {
1482            if let Some(ex) = exclude_idx
1483                && i == ex
1484            {
1485                continue;
1486            }
1487            let full_key = match self.get_full_key(i) {
1488                Some(k) => k,
1489                None => continue,
1490            };
1491            let new_len =
1492                get_key_prefix_length(&seed_full[..prefix_len], &full_key);
1493            if new_len < prefix_len {
1494                prefix_len = new_len;
1495            }
1496            if prefix_len == 0 {
1497                return Vec::new();
1498            }
1499        }
1500
1501        seed_full[..prefix_len].to_vec()
1502    }
1503
1504    /// Recompute the key prefix from scratch and re-encode every stored suffix.
1505    ///
1506    /// Call this after bulk inserts, splits, or merges.
1507    ///
1508    /// `IN.recalcKeyPrefix()` → `IN.recalcSuffixes(newPrefix, …)`.
1509    pub fn recompute_key_prefix(&mut self) {
1510        let new_prefix = self.compute_key_prefix(None);
1511        self.apply_new_prefix(new_prefix);
1512    }
1513
1514    /// Apply `new_prefix` as the BIN's key prefix, re-encoding all stored
1515    /// suffixes from the old prefix into the new one.
1516    ///
1517    /// This is the Rust.
1518    fn apply_new_prefix(&mut self, new_prefix: Vec<u8>) {
1519        // Reconstruct all full keys (using old prefix), then re-encode with
1520        // the new prefix.
1521        let full_keys: Vec<Vec<u8>> = (0..self.keys.len())
1522            .map(|i| self.get_full_key(i).unwrap_or_default())
1523            .collect();
1524
1525        self.key_prefix = new_prefix;
1526
1527        // T-2: re-encode every suffix into the key rep, then re-attempt
1528        // compaction (a smaller prefix may make all suffixes fit MaxKeySize).
1529        for (i, full_key) in full_keys.into_iter().enumerate() {
1530            let suffix = self.compress_key(&full_key);
1531            self.keys.set(i, suffix);
1532        }
1533        self.keys.compact(self.compact_max_key_length);
1534    }
1535
1536    /// Binary-search this BIN for `full_key` (a full, uncompressed key).
1537    ///
1538    /// The stored suffixes are compared after stripping the current prefix
1539    /// from `full_key`, so the search is done entirely in suffix-space — no
1540    /// heap allocation needed in the happy path.
1541    ///
1542    /// Returns `(idx, exact)` where:
1543    /// - `idx` is the slot index (or insertion point when `exact == false`).
1544    /// - `exact` is `true` when an exact match was found.
1545    ///
1546    /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
1547    pub fn find_entry_compressed(&self, full_key: &[u8]) -> (usize, bool) {
1548        let plen = self.key_prefix.len();
1549        // Check that the key shares the current prefix; if not it cannot be
1550        // present and we return the appropriate insertion point.
1551        if plen > 0
1552            && (full_key.len() < plen
1553                || &full_key[..plen] != self.key_prefix.as_slice())
1554        {
1555            // The key does not share the current prefix.
1556            // Determine insertion point using full-key comparison.
1557            let pos = self.key_partition_point(|s| {
1558                self.decompress_key(s).as_slice() < full_key
1559            });
1560            return (pos, false);
1561        }
1562        let suffix = &full_key[plen..];
1563        // T-2: binary search over the node-level key rep (suffix space).
1564        match self.key_binary_search(suffix) {
1565            Ok(idx) => (idx, true),
1566            Err(idx) => (idx, false),
1567        }
1568    }
1569
1570    /// Binary search the key rep for `suffix` (suffix space, unsigned bytes).
1571    /// Mirrors `Vec::binary_search_by(|e| e.key.cmp(suffix))` over the
1572    /// node-level `KeyRep` (T-2).
1573    #[inline]
1574    fn key_binary_search(&self, suffix: &[u8]) -> Result<usize, usize> {
1575        let mut lo = 0usize;
1576        let mut hi = self.keys.len();
1577        while lo < hi {
1578            let mid = lo + (hi - lo) / 2;
1579            match self.keys.get(mid).cmp(suffix) {
1580                std::cmp::Ordering::Less => lo = mid + 1,
1581                std::cmp::Ordering::Greater => hi = mid,
1582                std::cmp::Ordering::Equal => return Ok(mid),
1583            }
1584        }
1585        Err(lo)
1586    }
1587
1588    /// `slice::partition_point` over the node-level key rep suffixes (T-2):
1589    /// the index of the first slot for which `pred(suffix)` is false.
1590    #[inline]
1591    fn key_partition_point(
1592        &self,
1593        mut pred: impl FnMut(&[u8]) -> bool,
1594    ) -> usize {
1595        let mut lo = 0usize;
1596        let mut hi = self.keys.len();
1597        while lo < hi {
1598            let mid = lo + (hi - lo) / 2;
1599            if pred(self.keys.get(mid)) {
1600                lo = mid + 1;
1601            } else {
1602                hi = mid;
1603            }
1604        }
1605        lo
1606    }
1607
1608    /// Insert or update a full (uncompressed) key in this BIN.
1609    ///
1610    /// After insertion the key prefix is recomputed; if the prefix changes all
1611    /// stored suffixes are re-encoded.
1612    ///
1613    /// Returns `(slot_index, is_new_insert)`.
1614    ///
1615    /// `IN.setKey` / BIN insert path.
1616    pub fn insert_with_prefix(
1617        &mut self,
1618        full_key: Vec<u8>,
1619        lsn: Lsn,
1620        data: Option<Vec<u8>>,
1621    ) -> (usize, bool) {
1622        // Is the current prefix still compatible with this key?
1623        let plen = self.key_prefix.len();
1624        let new_len = if plen > 0 {
1625            get_key_prefix_length(&self.key_prefix, &full_key)
1626        } else {
1627            0
1628        };
1629
1630        // If the new key shrinks the prefix we must re-encode everything first.
1631        if plen > 0 && new_len < plen {
1632            // Compute new prefix considering the incoming key and
1633            // all existing full keys.  We pass `None` for exclude_idx because
1634            // the slot for this key does not yet exist.
1635            let mut candidate = self.compute_key_prefix(None);
1636            // Also constrain by the new key itself.
1637            if !candidate.is_empty() {
1638                let cl = get_key_prefix_length(&candidate, &full_key);
1639                candidate.truncate(cl);
1640            } else {
1641                // No existing prefix; try to build one from the new key
1642                // against the existing full keys.
1643                if !self.entries.is_empty()
1644                    && let Some(first_full) = self.get_full_key(0)
1645                {
1646                    candidate = create_key_prefix(&first_full, &full_key)
1647                        .unwrap_or_default();
1648                    for i in 1..self.entries.len() {
1649                        if candidate.is_empty() {
1650                            break;
1651                        }
1652                        if let Some(fk) = self.get_full_key(i) {
1653                            let l = get_key_prefix_length(&candidate, &fk);
1654                            candidate.truncate(l);
1655                        }
1656                    }
1657                }
1658            }
1659            self.apply_new_prefix(candidate);
1660        }
1661
1662        // Compress the new key under the (possibly updated) prefix.
1663        let suffix = self.compress_key(&full_key);
1664
1665        match self.key_binary_search(&suffix) {
1666            Ok(idx) => {
1667                // Key exists — update in place.
1668                self.set_lsn(idx, lsn); // T-3
1669                self.entries[idx].data = data;
1670                // Mark slot dirty: this slot changed since the last full BIN log.
1671                // `IN.setDirtyEntry(idx)`.
1672                self.entries[idx].dirty = true;
1673                (idx, false)
1674            }
1675            Err(idx) => {
1676                // New key — insert in sorted position.
1677                // New slots start dirty: they have never been logged in any BIN.
1678                // `IN.setDirtyEntry(idx)` called after `insertEntry`.
1679                self.insert_slot(idx, suffix, lsn, data);
1680                // After insertion, if there is no prefix yet, try to establish one.
1681                if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1682                    self.recompute_key_prefix();
1683                }
1684                (idx, true)
1685            }
1686        }
1687    }
1688
1689    /// Slice-based variant of [`BinStub::insert_with_prefix`] for the recovery redo path.
1690    ///
1691    /// Accepts `key` and `data` as `&[u8]` slices instead of owned `Vec<u8>`,
1692    /// eliminating the intermediate `Vec<u8>` that `redo_ln` would otherwise
1693    /// allocate before crossing the BIN boundary.  The compressed suffix and
1694    /// the data bytes are each copied into the `BinEntry` exactly once.
1695    ///
1696    /// Semantics are identical to `insert_with_prefix`:
1697    /// - Updates the slot in place when the key already exists.
1698    /// - Inserts a new sorted entry when absent, recomputing the key prefix.
1699    ///
1700    /// Wave 11-K optimisation (Fix 1).
1701    pub fn insert_with_prefix_slice(
1702        &mut self,
1703        full_key: &[u8],
1704        lsn: Lsn,
1705        data: Option<&[u8]>,
1706    ) -> (usize, bool) {
1707        let plen = self.key_prefix.len();
1708        let new_len = if plen > 0 {
1709            get_key_prefix_length(&self.key_prefix, full_key)
1710        } else {
1711            0
1712        };
1713
1714        if plen > 0 && new_len < plen {
1715            let mut candidate = self.compute_key_prefix(None);
1716            if !candidate.is_empty() {
1717                let cl = get_key_prefix_length(&candidate, full_key);
1718                candidate.truncate(cl);
1719            } else {
1720                if !self.entries.is_empty()
1721                    && let Some(first_full) = self.get_full_key(0)
1722                {
1723                    candidate = create_key_prefix(&first_full, full_key)
1724                        .unwrap_or_default();
1725                    for i in 1..self.entries.len() {
1726                        if candidate.is_empty() {
1727                            break;
1728                        }
1729                        if let Some(fk) = self.get_full_key(i) {
1730                            let l = get_key_prefix_length(&candidate, &fk);
1731                            candidate.truncate(l);
1732                        }
1733                    }
1734                }
1735            }
1736            self.apply_new_prefix(candidate);
1737        }
1738
1739        let suffix = self.compress_key(full_key);
1740
1741        match self.key_binary_search(&suffix) {
1742            Ok(idx) => {
1743                self.set_lsn(idx, lsn); // T-3
1744                self.entries[idx].data = data.map(|d| d.to_vec());
1745                self.entries[idx].dirty = true;
1746                (idx, false)
1747            }
1748            Err(idx) => {
1749                self.insert_slot(idx, suffix, lsn, data.map(|d| d.to_vec()));
1750                if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1751                    self.recompute_key_prefix();
1752                }
1753                (idx, true)
1754            }
1755        }
1756    }
1757
1758    /// Returns the number of slots that are marked dirty.
1759    ///
1760    /// `BIN.getNumDirtyEntries()`.
1761    pub fn dirty_count(&self) -> usize {
1762        self.entries.iter().filter(|e| e.dirty).count()
1763    }
1764
1765    /// Decide whether to log this BIN as a delta (true) or a full BIN (false).
1766    ///
1767    /// Faithful port of JE `BIN.shouldLogDelta()` (BIN.java:1892).  The
1768    /// decision is COUNT-based (number of would-be delta slots vs a percent of
1769    /// `nEntries`), NOT a dirty-fraction-vs-hardcoded-0.25 heuristic:
1770    ///
1771    /// ```text
1772    /// if (isBINDelta()) { return true; }          // already a delta
1773    /// if (isDeltaProhibited()) return false;       // prohibit / no prior full
1774    /// numDeltas = getNDeltas();
1775    /// if (numDeltas <= 0) return false;            // empty delta is invalid
1776    /// deltaLimit = (getNEntries() * binDeltaPercent) / 100;  // INTEGER math
1777    /// return numDeltas <= deltaLimit;
1778    /// ```
1779    ///
1780    /// `numDeltas` (JE `getNDeltas`) is the count of slots that would appear in
1781    /// the delta — i.e. the dirty slots since the last full BIN — which here is
1782    /// `dirty_count()`.  `binDeltaPercent` is the CONFIGURABLE `TREE_BIN_DELTA`
1783    /// param (JE `DatabaseImpl.getBinDeltaPercent()`, default 25), threaded in
1784    /// by the checkpointer — NOT a hardcoded constant.
1785    ///
1786    /// `isDeltaProhibited()` (BIN.java:1867) is
1787    /// `getProhibitNextDelta() || isDeferredWriteMode() || lastFullLsn == NULL`.
1788    /// Deferred-write mode is not modelled in the runtime stub; the other two
1789    /// terms are.
1790    ///
1791    /// JE ref: `BIN.shouldLogDelta` (BIN.java:1892), `BIN.isDeltaProhibited`
1792    /// (BIN.java:1867).
1793    pub fn should_log_delta(&self, bin_delta_percent: i32) -> bool {
1794        // Already a delta: re-log as a delta.  JE asserts !prohibitNextDelta
1795        // and lastFullLsn != NULL here.
1796        if self.is_delta {
1797            return self.last_full_lsn != NULL_LSN && !self.prohibit_next_delta;
1798        }
1799
1800        // isDeltaProhibited(): cheapest checks first.
1801        if self.prohibit_next_delta || self.last_full_lsn == NULL_LSN {
1802            return false;
1803        }
1804
1805        // numDeltas = getNDeltas(): the dirty slots that would be in the delta.
1806        let num_deltas = self.dirty_count() as i32;
1807
1808        // A delta with zero items is not valid.
1809        if num_deltas <= 0 {
1810            return false;
1811        }
1812
1813        // Configured BinDeltaPercent limit — INTEGER math, exactly as JE.
1814        let delta_limit = (self.entries.len() as i32 * bin_delta_percent) / 100;
1815        num_deltas <= delta_limit
1816    }
1817
1818    /// Comparator-aware binary search: finds `full_key` using `cmp`.
1819    ///
1820    /// Unlike `find_entry_compressed` (which uses suffix-based lexicographic
1821    /// comparison), this decompresses each entry's key to its full form and
1822    /// applies the provided comparator — required for sorted-dup databases
1823    /// where lexicographic suffix comparison would give wrong results when
1824    /// different-length primary keys are in the same BIN.
1825    ///
1826    /// Returns `(idx, exact)`.  Does NOT do prefix compression.
1827    ///
1828    /// `IN.findEntry` with btreeComparator active.
1829    pub fn find_entry_cmp(
1830        &self,
1831        full_key: &[u8],
1832        cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1833    ) -> (usize, bool) {
1834        // Hot path: avoid per-comparison Vec<u8> allocation.
1835        // When key_prefix is empty the stored suffix IS the full key, so we
1836        // pass the suffix slice directly.  When prefix is non-empty we build a
1837        // temporary concatenation only once per comparison using a small
1838        // stack-local Vec that is dropped immediately after the call — this
1839        // still allocates but is limited to O(key_len) bytes per call and
1840        // avoids retaining any heap state between comparisons.
1841        if self.key_prefix.is_empty() {
1842            match self.key_binary_search_by(|s| cmp(s, full_key)) {
1843                Ok(idx) => (idx, true),
1844                Err(idx) => (idx, false),
1845            }
1846        } else {
1847            let prefix = self.key_prefix.as_slice();
1848            match self.key_binary_search_by(|s| {
1849                let mut fk = Vec::with_capacity(prefix.len() + s.len());
1850                fk.extend_from_slice(prefix);
1851                fk.extend_from_slice(s);
1852                cmp(&fk, full_key)
1853            }) {
1854                Ok(idx) => (idx, true),
1855                Err(idx) => (idx, false),
1856            }
1857        }
1858    }
1859
1860    /// Comparator-driven binary search over the node-level key rep (T-2).
1861    /// `cmp(stored_suffix)` returns how the stored slot compares to the
1862    /// search key.
1863    #[inline]
1864    fn key_binary_search_by(
1865        &self,
1866        mut cmp: impl FnMut(&[u8]) -> std::cmp::Ordering,
1867    ) -> Result<usize, usize> {
1868        let mut lo = 0usize;
1869        let mut hi = self.keys.len();
1870        while lo < hi {
1871            let mid = lo + (hi - lo) / 2;
1872            match cmp(self.keys.get(mid)) {
1873                std::cmp::Ordering::Less => lo = mid + 1,
1874                std::cmp::Ordering::Greater => hi = mid,
1875                std::cmp::Ordering::Equal => return Ok(mid),
1876            }
1877        }
1878        Err(lo)
1879    }
1880
1881    /// Returns the LSN of the slot matching `full_key`, if one exists.
1882    ///
1883    /// Used by the recovery LN-redo apply to enforce JE's currency check
1884    /// (`RecoveryManager.redo()` line ~2512): a logged LN is applied only
1885    /// when `logrecLsn > treeLsn`.  Returns `None` when the key is absent
1886    /// (always apply).  Uses the same lookup variant the matching insert
1887    /// path uses so the comparison is over the right slot.
1888    pub fn redo_slot_lsn(
1889        &self,
1890        full_key: &[u8],
1891        cmp: Option<&dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering>,
1892        key_prefixing: bool,
1893    ) -> Option<Lsn> {
1894        let (idx, found) = match cmp {
1895            Some(c) => self.find_entry_cmp(full_key, c),
1896            None if key_prefixing => self.find_entry_compressed(full_key),
1897            None => {
1898                // insert_raw path: full keys stored verbatim.
1899                match self.key_binary_search(full_key) {
1900                    Ok(idx) => (idx, true),
1901                    Err(idx) => (idx, false),
1902                }
1903            }
1904        };
1905        if found { Some(self.get_lsn(idx)) } else { None }
1906    }
1907
1908    /// Raw insert (no prefix compression) for databases with
1909    /// `key_prefixing = false`.
1910    ///
1911    /// JE `IN.computeKeyPrefix` returns `null` when
1912    /// `databaseImpl.getKeyPrefixing()` is `false`, so no prefix is ever
1913    /// set on those BINs.  Noxu was previously ignoring the flag and always
1914    /// calling `insert_with_prefix`; this method provides the faithful path.
1915    ///
1916    /// The key is stored verbatim (no suffix stripping). An existing
1917    /// `key_prefix` on the BIN is left untouched; callers must ensure it is
1918    /// empty (split_child already guarantees this for new BINs when
1919    /// `key_prefixing = false`).
1920    ///
1921    /// Returns `(slot_index, is_new_insert)`.
1922    ///
1923    /// Ref: `IN.java computeKeyPrefix` ~line 2456,
1924    ///      `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
1925    pub fn insert_raw(
1926        &mut self,
1927        full_key: Vec<u8>,
1928        lsn: Lsn,
1929        data: Option<Vec<u8>>,
1930    ) -> (usize, bool) {
1931        // Binary search on the stored (full) keys.
1932        // When key_prefix is empty entries store full keys directly; for
1933        // key_prefixing=false DBs the prefix is always empty.
1934        match self.key_binary_search(full_key.as_slice()) {
1935            Ok(idx) => {
1936                self.set_lsn(idx, lsn); // T-3
1937                self.entries[idx].data = data;
1938                self.entries[idx].dirty = true;
1939                (idx, false)
1940            }
1941            Err(idx) => {
1942                self.insert_slot(idx, full_key, lsn, data);
1943                (idx, true)
1944            }
1945        }
1946    }
1947
1948    /// Comparator-aware insert: inserts `full_key` into the BIN using `cmp`.
1949    ///
1950    /// Prefix compression is DISABLED: the key is stored as-is.  This is
1951    /// intentional for sorted-dup databases where the custom comparator
1952    /// requires full-key access at every comparison.
1953    ///
1954    /// Returns `(slot_index, is_new_insert)`.
1955    ///
1956    pub fn insert_cmp(
1957        &mut self,
1958        full_key: Vec<u8>,
1959        lsn: Lsn,
1960        data: Option<Vec<u8>>,
1961        cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1962    ) -> (usize, bool) {
1963        if self.key_prefix.is_empty() {
1964            match self.key_binary_search_by(|s| cmp(s, &full_key)) {
1965                Ok(idx) => {
1966                    self.set_lsn(idx, lsn); // T-3
1967                    self.entries[idx].data = data;
1968                    self.entries[idx].dirty = true;
1969                    (idx, false)
1970                }
1971                Err(idx) => {
1972                    self.insert_slot(idx, full_key, lsn, data);
1973                    (idx, true)
1974                }
1975            }
1976        } else {
1977            let prefix = self.key_prefix.clone();
1978            match self.key_binary_search_by(|s| {
1979                let mut fk = Vec::with_capacity(prefix.len() + s.len());
1980                fk.extend_from_slice(&prefix);
1981                fk.extend_from_slice(s);
1982                cmp(&fk, &full_key)
1983            }) {
1984                Ok(idx) => {
1985                    // Key exists — update in place.
1986                    self.set_lsn(idx, lsn); // T-3
1987                    self.entries[idx].data = data;
1988                    self.entries[idx].dirty = true;
1989                    (idx, false)
1990                }
1991                Err(idx) => {
1992                    // New key — insert at sorted position (no prefix compression).
1993                    self.insert_slot(idx, full_key, lsn, data);
1994                    (idx, true)
1995                }
1996            }
1997        }
1998    }
1999
2000    /// Comparator-aware delete: removes `full_key` from the BIN using `cmp`.
2001    ///
2002    /// Returns `true` if the entry was found and removed.
2003    pub fn delete_cmp(
2004        &mut self,
2005        full_key: &[u8],
2006        cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
2007    ) -> bool {
2008        let result = if self.key_prefix.is_empty() {
2009            self.key_binary_search_by(|s| cmp(s, full_key))
2010        } else {
2011            let prefix = self.key_prefix.clone();
2012            self.key_binary_search_by(|s| {
2013                let mut fk = Vec::with_capacity(prefix.len() + s.len());
2014                fk.extend_from_slice(&prefix);
2015                fk.extend_from_slice(s);
2016                cmp(&fk, full_key)
2017            })
2018        };
2019        match result {
2020            Ok(idx) => {
2021                self.entries.remove(idx);
2022                self.keys.remove(idx); // T-2
2023                self.lsn_rep.remove_shift(idx); // T-3
2024                self.dirty = true;
2025                true
2026            }
2027            Err(_) => false,
2028        }
2029    }
2030
2031    /// Serialise ALL entries (full BIN write).
2032    ///
2033    /// Format (per slot): key_len(u32BE) | key | lsn(u64BE) |
2034    ///   has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2035    ///
2036    /// Prepended by: node_id(u64BE) | num_entries(u32BE).
2037    ///
2038    /// `BIN.writeToLog()` (non-delta path).
2039    pub fn serialize_full(&self) -> Vec<u8> {
2040        let mut buf = Vec::new();
2041        buf.extend_from_slice(&self.node_id.to_be_bytes());
2042        buf.extend_from_slice(&(self.entries.len() as u32).to_be_bytes());
2043        for i in 0..self.entries.len() {
2044            let full_key = self.get_full_key(i).unwrap_or_default();
2045            buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2046            buf.extend_from_slice(&full_key);
2047            let lsn = self.get_lsn(i); // T-3
2048            let e = &self.entries[i];
2049            buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2050            if let Some(d) = &e.data {
2051                buf.push(1u8);
2052                buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2053                buf.extend_from_slice(d);
2054            } else {
2055                buf.push(0u8);
2056            }
2057            buf.push(e.known_deleted as u8);
2058        }
2059        buf
2060    }
2061
2062    /// Serialise only dirty slots (BIN-delta write).
2063    ///
2064    /// Format (per dirty slot): slot_idx(u32BE) | key_len(u32BE) | key |
2065    ///   lsn(u64BE) | has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2066    ///
2067    /// Prepended by: node_id(u64BE) | num_dirty(u32BE).
2068    ///
2069    /// `BIN.writeToLog()` (delta path).
2070    pub fn serialize_delta(&self) -> Vec<u8> {
2071        let dirty: Vec<usize> = (0..self.entries.len())
2072            .filter(|&i| self.entries[i].dirty)
2073            .collect();
2074        let mut buf = Vec::new();
2075        buf.extend_from_slice(&self.node_id.to_be_bytes());
2076        buf.extend_from_slice(&(dirty.len() as u32).to_be_bytes());
2077        for idx in dirty {
2078            buf.extend_from_slice(&(idx as u32).to_be_bytes());
2079            let full_key = self.get_full_key(idx).unwrap_or_default();
2080            buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2081            buf.extend_from_slice(&full_key);
2082            let lsn = self.get_lsn(idx); // T-3
2083            let e = &self.entries[idx];
2084            buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2085            if let Some(d) = &e.data {
2086                buf.push(1u8);
2087                buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2088                buf.extend_from_slice(d);
2089            } else {
2090                buf.push(0u8);
2091            }
2092            buf.push(e.known_deleted as u8);
2093        }
2094        buf
2095    }
2096
2097    /// Deserialise a full BIN from the bytes produced by `serialize_full()`.
2098    ///
2099    /// Returns a `BinStub` with all entries populated and all slots marked
2100    /// clean (they are already on disk at `last_full_lsn`).  Returns `None`
2101    /// if the byte slice is malformed.
2102    ///
2103    /// `INLogEntry.readEntry()` / `IN.readFromLog()` (non-delta).
2104    pub fn deserialize_full(bytes: &[u8]) -> Option<BinStub> {
2105        if bytes.len() < 12 {
2106            return None;
2107        }
2108        let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
2109        let num_entries =
2110            u32::from_be_bytes(bytes[8..12].try_into().ok()?) as usize;
2111        let mut pos = 12usize;
2112        let mut entries = Vec::with_capacity(num_entries);
2113        let mut lsns: Vec<Lsn> = Vec::with_capacity(num_entries);
2114        let mut keys: Vec<Vec<u8>> = Vec::with_capacity(num_entries); // T-2
2115        for _ in 0..num_entries {
2116            // key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len(u32BE) | data] | known_deleted(u8)
2117            if pos + 4 > bytes.len() {
2118                return None;
2119            }
2120            let key_len =
2121                u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2122                    as usize;
2123            pos += 4;
2124            if pos + key_len > bytes.len() {
2125                return None;
2126            }
2127            let key = bytes[pos..pos + key_len].to_vec();
2128            pos += key_len;
2129            if pos + 8 > bytes.len() {
2130                return None;
2131            }
2132            let lsn = Lsn::from_u64(u64::from_be_bytes(
2133                bytes[pos..pos + 8].try_into().ok()?,
2134            ));
2135            pos += 8;
2136            if pos + 1 > bytes.len() {
2137                return None;
2138            }
2139            let has_data = bytes[pos] != 0;
2140            pos += 1;
2141            let data = if has_data {
2142                if pos + 4 > bytes.len() {
2143                    return None;
2144                }
2145                let data_len =
2146                    u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2147                        as usize;
2148                pos += 4;
2149                if pos + data_len > bytes.len() {
2150                    return None;
2151                }
2152                let d = bytes[pos..pos + data_len].to_vec();
2153                pos += data_len;
2154                Some(d)
2155            } else {
2156                None
2157            };
2158            if pos + 1 > bytes.len() {
2159                return None;
2160            }
2161            let known_deleted = bytes[pos] != 0;
2162            pos += 1;
2163            entries.push(BinEntry {
2164                data,
2165                known_deleted,
2166                dirty: false, // freshly loaded from log — clean
2167                expiration_time: 0,
2168            });
2169            keys.push(key); // T-2 (full keys; recompute_key_prefix compresses)
2170            lsns.push(lsn); // T-3
2171        }
2172        // Keys stored in the serialized format are full (uncompressed) keys.
2173        // Re-establish the key prefix after loading so that memory use and
2174        // search performance match an in-memory BIN.
2175        // `IN.readFromLog()` → key prefix is part of the wire
2176        // format in the; in Noxu we store full keys and recompute on load.
2177        let mut bin = BinStub {
2178            node_id,
2179            level: BIN_LEVEL,
2180            entries,
2181            key_prefix: Vec::new(),
2182            dirty: false,
2183            is_delta: false,
2184            last_full_lsn: NULL_LSN, // caller sets this to the logged LSN
2185            last_delta_lsn: NULL_LSN,
2186            generation: 0,
2187            parent: None,
2188            expiration_in_hours: true,
2189            cursor_count: 0,
2190            prohibit_next_delta: false,
2191            lsn_rep: LsnRep::from_lsns(&lsns), // T-3
2192            keys: KeyRep::from_keys(keys),     // T-2 (full keys, no prefix yet)
2193            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
2194        };
2195        // Recompute key prefix from the full keys just loaded.
2196        // `IN.recalcKeyPrefix()` called after materializing from log.
2197        if bin.entries.len() >= 2 {
2198            bin.recompute_key_prefix();
2199        } else {
2200            // Even a single-slot BIN should attempt compaction.
2201            bin.keys.compact(bin.compact_max_key_length);
2202        }
2203        Some(bin)
2204    }
2205
2206    /// Deserialise a BIN delta from the bytes produced by `serialize_delta()`.
2207    ///
2208    /// **DO NOT USE for BIN reconstruction.** This helper writes full
2209    /// (uncompressed) keys directly into slots without recomputing the BIN
2210    /// key prefix, so on a prefix-compressed BIN it corrupts the slot keys and
2211    /// breaks the sorted-suffix invariant. It is NOT wired into any live path.
2212    /// The correct delta-reconstruction path is
2213    /// `mutate_to_full_bin` → `apply_delta_to_bin` → `insert_with_prefix`,
2214    /// which recomputes the prefix. This function is retained only for the
2215    /// raw byte-format round-trip and must not be used to reconstitute a BIN.
2216    /// Tracked for removal — see the v3.x review synthesis (storage C-2).
2217    ///
2218    /// Returns `None` if `delta_bytes` is malformed.
2219    pub fn apply_delta(base: &mut BinStub, delta_bytes: &[u8]) -> Option<()> {
2220        if delta_bytes.len() < 12 {
2221            return None;
2222        }
2223        // node_id(u64BE) — must match base
2224        let _node_id = u64::from_be_bytes(delta_bytes[0..8].try_into().ok()?);
2225        let num_dirty =
2226            u32::from_be_bytes(delta_bytes[8..12].try_into().ok()?) as usize;
2227        let mut pos = 12usize;
2228        for _ in 0..num_dirty {
2229            // slot_idx(u32BE) | key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len | data] | known_deleted(u8)
2230            if pos + 4 > delta_bytes.len() {
2231                return None;
2232            }
2233            let slot_idx =
2234                u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2235                    as usize;
2236            pos += 4;
2237            if pos + 4 > delta_bytes.len() {
2238                return None;
2239            }
2240            let key_len =
2241                u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2242                    as usize;
2243            pos += 4;
2244            if pos + key_len > delta_bytes.len() {
2245                return None;
2246            }
2247            let key = delta_bytes[pos..pos + key_len].to_vec();
2248            pos += key_len;
2249            if pos + 8 > delta_bytes.len() {
2250                return None;
2251            }
2252            let lsn = Lsn::from_u64(u64::from_be_bytes(
2253                delta_bytes[pos..pos + 8].try_into().ok()?,
2254            ));
2255            pos += 8;
2256            if pos + 1 > delta_bytes.len() {
2257                return None;
2258            }
2259            let has_data = delta_bytes[pos] != 0;
2260            pos += 1;
2261            let data = if has_data {
2262                if pos + 4 > delta_bytes.len() {
2263                    return None;
2264                }
2265                let data_len = u32::from_be_bytes(
2266                    delta_bytes[pos..pos + 4].try_into().ok()?,
2267                ) as usize;
2268                pos += 4;
2269                if pos + data_len > delta_bytes.len() {
2270                    return None;
2271                }
2272                let d = delta_bytes[pos..pos + data_len].to_vec();
2273                pos += data_len;
2274                Some(d)
2275            } else {
2276                None
2277            };
2278            if pos + 1 > delta_bytes.len() {
2279                return None;
2280            }
2281            let known_deleted = delta_bytes[pos] != 0;
2282            pos += 1;
2283
2284            // Apply to base: update existing slot or insert new one.
2285            if slot_idx < base.entries.len() {
2286                base.keys.set(slot_idx, key); // T-2
2287                base.set_lsn(slot_idx, lsn); // T-3
2288                base.entries[slot_idx].data = data;
2289                base.entries[slot_idx].known_deleted = known_deleted;
2290                base.entries[slot_idx].dirty = false;
2291            } else {
2292                // Slot index beyond current length — append.
2293                base.entries.push(BinEntry {
2294                    data,
2295                    known_deleted,
2296                    dirty: false,
2297                    expiration_time: 0,
2298                });
2299                let n = base.entries.len();
2300                base.keys.insert(n - 1, key); // T-2
2301                base.lsn_rep.set(n - 1, lsn, n); // T-3
2302            }
2303        }
2304        Some(())
2305    }
2306
2307    /// Clear per-slot dirty flags and record `logged_at` as the LSN at which
2308    /// this BIN was last fully logged.
2309    ///
2310    /// Called by the checkpoint path after a successful full-BIN log write.
2311    /// `BIN.afterLog()` / `BIN.setLastFullLsn()`.
2312    pub fn clear_dirty_after_full_log(&mut self, logged_at: Lsn) {
2313        for e in &mut self.entries {
2314            e.dirty = false;
2315        }
2316        self.last_full_lsn = logged_at;
2317        self.dirty = false;
2318        // A full BIN captures all current state, so the delta-chain bound is
2319        // cleared: the next log may once again be a delta.
2320        // JE `IN.afterLog` clears the prohibit flag after a full log
2321        // (IN.java:5557 `bin.setProhibitNextDelta(false)`).
2322        self.prohibit_next_delta = false;
2323    }
2324
2325    /// Clear per-slot dirty flags after a successful delta log write.
2326    ///
2327    /// `last_full_lsn` is NOT updated — the full LSN only changes after a
2328    /// full BIN write.
2329    /// `BIN.afterLog()` (delta path).
2330    pub fn clear_dirty_after_delta_log(&mut self) {
2331        for e in &mut self.entries {
2332            e.dirty = false;
2333        }
2334        self.dirty = false;
2335    }
2336}
2337
2338impl TreeNode {
2339    /// Returns true if this is a BIN (bottom internal node).
2340    pub fn is_bin(&self) -> bool {
2341        matches!(self, TreeNode::Bottom(_))
2342    }
2343
2344    /// Returns the level of this node.
2345    pub fn level(&self) -> i32 {
2346        match self {
2347            TreeNode::Internal(n) => n.level,
2348            TreeNode::Bottom(b) => b.level,
2349        }
2350    }
2351
2352    /// Returns the node id of this node.
2353    pub fn node_id(&self) -> u64 {
2354        match self {
2355            TreeNode::Internal(n) => n.node_id,
2356            TreeNode::Bottom(b) => b.node_id,
2357        }
2358    }
2359
2360    /// Faithful in-memory heap footprint of this node, in bytes.
2361    ///
2362    /// JE `IN.getBudgetedMemorySize()` (IN.java) returns the running
2363    /// `inMemorySize` that `MemoryBudget` tracks for the node: the fixed
2364    /// IN/BIN struct overhead plus, per slot, the fixed entry overhead and the
2365    /// variable key (and embedded-LN data for BINs) bytes.  This is the single
2366    /// source of truth for both the live tree accounting and the evictor's
2367    /// detach credit (EV-13) — keeping it on `TreeNode` avoids the formula
2368    /// drifting between `noxu-tree` and `noxu-evictor`.
2369    ///
2370    /// Rust has a fixed struct layout (unlike JE's `Sizeof`-measured JVM
2371    /// constants) so `size_of` is exact for the fixed overheads; the variable
2372    /// part mirrors JE's per-slot `entryKeys`/embedded-data accounting.
2373    pub fn budgeted_memory_size(&self) -> u64 {
2374        use std::mem::size_of;
2375        match self {
2376            TreeNode::Bottom(b) => {
2377                (size_of::<BinStub>()
2378                    + b.entries.len() * size_of::<BinEntry>()
2379                    + b.key_prefix.len()
2380                    + b.keys.memory_size() // T-2: node-level key rep bytes
2381                    + b.lsn_rep.memory_size() // T-3: node-level LSN rep bytes
2382                    + b.entries
2383                        .iter()
2384                        .map(|e| {
2385                            e.data.as_ref().map(|d| d.len()).unwrap_or(0)
2386                        })
2387                        .sum::<usize>()) as u64
2388            }
2389            TreeNode::Internal(n) => {
2390                (size_of::<InNodeStub>()
2391                    + n.entries.len() * size_of::<InEntry>()
2392                    + n.targets.memory_size()
2393                    + n.entries.iter().map(|e| e.key.len()).sum::<usize>())
2394                    as u64
2395            }
2396        }
2397    }
2398
2399    /// Binary search for a key in this node.
2400    ///
2401    /// For BIN nodes the search is prefix-aware: if the BIN has a key prefix,
2402    /// `key` (a full, uncompressed key) is compared against stored suffixes
2403    /// after stripping the prefix.
2404    /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
2405    ///
2406    /// Returns index with EXACT_MATCH flag set if exact match found.
2407    /// If exact is false, returns insertion point.
2408    pub fn find_entry(&self, key: &[u8], _indicator: bool, exact: bool) -> i32 {
2409        match self {
2410            TreeNode::Internal(n) => {
2411                let result = n
2412                    .entries
2413                    .binary_search_by(|entry| entry.key.as_slice().cmp(key));
2414                match result {
2415                    Ok(idx) => (idx as i32) | EXACT_MATCH,
2416                    Err(idx) => {
2417                        if exact {
2418                            -1
2419                        } else {
2420                            // Floor (not insertion point): the child slot to
2421                            // descend into is the largest entry ≤ key. Slot 0
2422                            // is the leftmost child, so a key below every
2423                            // separator floors to 0. (St-H5: previously
2424                            // returned the insertion point `idx`, which routes
2425                            // one child too far right.)
2426                            (idx as i32 - 1).max(0)
2427                        }
2428                    }
2429                }
2430            }
2431            TreeNode::Bottom(b) => {
2432                // Use prefix-aware search: the stored key is a suffix when
2433                // key_prefix is non-empty.
2434                let (idx, found) = b.find_entry_compressed(key);
2435                if found {
2436                    (idx as i32) | EXACT_MATCH
2437                } else if exact {
2438                    -1
2439                } else {
2440                    idx as i32
2441                }
2442            }
2443        }
2444    }
2445
2446    /// Gets the number of entries in this node.
2447    pub fn get_n_entries(&self) -> usize {
2448        match self {
2449            TreeNode::Internal(n) => n.entries.len(),
2450            TreeNode::Bottom(b) => b.entries.len(),
2451        }
2452    }
2453
2454    // ========================================================================
2455    // Dirty flag
2456    // ========================================================================
2457
2458    /// Returns true if this node has been modified since last checkpoint.
2459    ///
2460    /// `IN.getDirty()`.
2461    pub fn is_dirty(&self) -> bool {
2462        match self {
2463            TreeNode::Internal(n) => n.dirty,
2464            TreeNode::Bottom(b) => b.dirty,
2465        }
2466    }
2467
2468    /// Sets or clears the dirty flag on this node.
2469    ///
2470    /// `IN.setDirty(boolean dirty)`.
2471    pub fn set_dirty(&mut self, dirty: bool) {
2472        match self {
2473            TreeNode::Internal(n) => n.dirty = dirty,
2474            TreeNode::Bottom(b) => b.dirty = dirty,
2475        }
2476    }
2477
2478    // ========================================================================
2479    // LRU generation
2480    // ========================================================================
2481
2482    /// Returns the LRU generation counter.
2483    ///
2484    /// `IN.getGeneration()`.
2485    pub fn get_generation(&self) -> u64 {
2486        match self {
2487            TreeNode::Internal(n) => n.generation,
2488            TreeNode::Bottom(b) => b.generation,
2489        }
2490    }
2491
2492    /// Sets the LRU generation counter.
2493    ///
2494    /// `IN.setGeneration(long gen)`.
2495    pub fn set_generation(&mut self, r#gen: u64) {
2496        match self {
2497            TreeNode::Internal(n) => n.generation = r#gen,
2498            TreeNode::Bottom(b) => b.generation = r#gen,
2499        }
2500    }
2501
2502    // ========================================================================
2503    // Parent pointer
2504    // ========================================================================
2505
2506    /// Returns a clone of the weak parent pointer, if any.
2507    pub fn get_parent(&self) -> Option<Weak<RwLock<TreeNode>>> {
2508        match self {
2509            TreeNode::Internal(n) => n.parent.clone(),
2510            TreeNode::Bottom(b) => b.parent.clone(),
2511        }
2512    }
2513
2514    /// Sets the weak parent pointer on this node.
2515    pub fn set_parent(&mut self, parent: Option<Weak<RwLock<TreeNode>>>) {
2516        match self {
2517            TreeNode::Internal(n) => n.parent = parent,
2518            TreeNode::Bottom(b) => b.parent = parent,
2519        }
2520    }
2521
2522    // ========================================================================
2523    // Log serialization
2524    // ========================================================================
2525
2526    /// Estimates the serialized byte size of this node for log/checkpoint use.
2527    ///
2528    /// `IN.getLogSize()` — Noxu-native serialization format.
2529    ///
2530    /// Format (big-endian):
2531    /// - node_id     : 8 bytes
2532    /// - level       : 4 bytes
2533    /// - n_entries   : 4 bytes
2534    /// - dirty       : 1 byte
2535    /// - For each entry:
2536    ///   - key_len   : 2 bytes
2537    ///   - key       : key_len bytes
2538    ///   - lsn       : 8 bytes
2539    pub fn log_size(&self) -> usize {
2540        // Fixed header: node_id(8) + level(4) + n_entries(4) + dirty(1)
2541        let mut size: usize = 8 + 4 + 4 + 1;
2542        match self {
2543            TreeNode::Internal(n) => {
2544                for entry in &n.entries {
2545                    size += 2 + entry.key.len() + 8; // key_len + key + lsn
2546                }
2547            }
2548            TreeNode::Bottom(b) => {
2549                for i in 0..b.entries.len() {
2550                    size += 2 + b.get_key(i).len() + 8; // key_len + key + lsn
2551                }
2552            }
2553        }
2554        size
2555    }
2556
2557    /// Serializes this node to bytes for log writing.
2558    ///
2559    /// `IN.writeToLog(ByteBuffer logBuffer)` — Noxu-native
2560    /// format matching `log_size()`.
2561    pub fn write_to_bytes(&self) -> Vec<u8> {
2562        let mut buf = Vec::with_capacity(self.log_size());
2563        match self {
2564            TreeNode::Internal(n) => {
2565                buf.extend_from_slice(&n.node_id.to_be_bytes());
2566                buf.extend_from_slice(&n.level.to_be_bytes());
2567                buf.extend_from_slice(&(n.entries.len() as u32).to_be_bytes());
2568                buf.push(n.dirty as u8);
2569                for (i, entry) in n.entries.iter().enumerate() {
2570                    buf.extend_from_slice(
2571                        &(entry.key.len() as u16).to_be_bytes(),
2572                    );
2573                    buf.extend_from_slice(&entry.key);
2574                    buf.extend_from_slice(&n.get_lsn(i).as_u64().to_be_bytes());
2575                }
2576            }
2577            TreeNode::Bottom(b) => {
2578                buf.extend_from_slice(&b.node_id.to_be_bytes());
2579                buf.extend_from_slice(&b.level.to_be_bytes());
2580                buf.extend_from_slice(&(b.entries.len() as u32).to_be_bytes());
2581                buf.push(b.dirty as u8);
2582                for i in 0..b.entries.len() {
2583                    let key = b.get_key(i);
2584                    buf.extend_from_slice(&(key.len() as u16).to_be_bytes());
2585                    buf.extend_from_slice(key);
2586                    buf.extend_from_slice(&b.get_lsn(i).as_u64().to_be_bytes());
2587                }
2588            }
2589        }
2590        buf
2591    }
2592}
2593
/// Internal helper used during splits to carry entries of either node kind.
///
/// `BinStub` and `InNodeStub` store different entry types, so we need a
/// common wrapper to pass split slices around without code duplication.
///
/// Within a variant the vectors are parallel: element `i` of each one
/// describes the same slot, so they are expected to have equal length.
enum SplitEntries {
    /// Upper-IN entries plus the parallel resident-child pointers (one per
    /// entry; `None` when the child is not cached) and the parallel per-slot
    /// LSNs (T-3: LSNs travel with their slots on a split, just like JE
    /// `IN.split` copies `entryLsnByteArray`/`entryLsnLongArray`).
    ///
    /// Field order: `(entries, resident children, LSNs)`.
    Internal(Vec<InEntry>, Vec<Option<ChildArc>>, Vec<Lsn>),
    /// BIN entries (metadata only) plus the parallel per-slot LSNs and the
    /// parallel FULL keys (T-2: keys live in the node-level `KeyRep`, not in
    /// `BinEntry`, so they travel as a separate `Vec<Vec<u8>>` of full keys
    /// through the split — the new BINs recompute their prefix from these).
    ///
    /// Field order: `(entries, LSNs, full keys)`.
    Bottom(Vec<BinEntry>, Vec<Lsn>, Vec<Vec<u8>>),
}
2610
2611impl SplitEntries {
2612    /// Returns the number of entries.
2613    fn len(&self) -> usize {
2614        match self {
2615            SplitEntries::Internal(v, _, _) => v.len(),
2616            SplitEntries::Bottom(v, _, _) => v.len(),
2617        }
2618    }
2619
2620    /// Returns the key at `index` as a slice.
2621    fn get_key(&self, index: usize) -> &[u8] {
2622        match self {
2623            SplitEntries::Internal(v, _, _) => v[index].key.as_slice(),
2624            SplitEntries::Bottom(_, _, k) => k[index].as_slice(),
2625        }
2626    }
2627
2628    /// Returns a sub-range `[lo, hi)` as a new `SplitEntries`.
2629    fn slice(&self, lo: usize, hi: usize) -> Self {
2630        match self {
2631            SplitEntries::Internal(v, c, l) => SplitEntries::Internal(
2632                v[lo..hi].to_vec(),
2633                c[lo..hi].to_vec(),
2634                l[lo..hi].to_vec(),
2635            ),
2636            SplitEntries::Bottom(v, l, k) => SplitEntries::Bottom(
2637                v[lo..hi].to_vec(),
2638                l[lo..hi].to_vec(),
2639                k[lo..hi].to_vec(),
2640            ),
2641        }
2642    }
2643}
2644
/// Tri-state outcome from one attempt at
/// `Tree::get_adjacent_bin_attempt`.
///
/// Distinguishes "the tree genuinely has no BIN in the requested
/// direction" (→ propagate as end-of-iteration) from "the path we
/// captured was invalidated by a concurrent split" (→ caller
/// retries from root). This split is necessary because the cursor
/// translates a `None` from `get_adjacent_bin` into
/// `OperationStatus::NotFound`, which is indistinguishable from a
/// real end-of-tree.
#[derive(Debug)]
enum AdjacentBinOutcome {
    /// A BIN was found in the requested direction.  T-3: each slot carries its
    /// `Lsn` alongside the `BinEntry` (the LSN lives in the node's packed
    /// `LsnRep`, not in `BinEntry`, so the scan snapshot pairs them).
    ///
    /// NOTE(review): the third tuple element is presumably the slot's key
    /// (keys also live outside `BinEntry`, in the node-level `KeyRep`) —
    /// confirm against `get_adjacent_bin_attempt` whether it is the full or
    /// the prefix-compressed key.
    Found(Vec<(BinEntry, Lsn, Vec<u8>)>),
    /// The tree genuinely has no BIN in the requested direction.
    NoAdjacent,
    /// A concurrent split invalidated our captured path; the
    /// caller should retry from root.
    SplitRaceRetry,
}
2667
/// Split hint for the `splitSpecial` heuristic.
///
/// JE `Tree.forceSplit` tracks `allLeftSideDescent` / `allRightSideDescent`
/// (true if **every** routing decision during the top-down descent followed
/// the leftmost / rightmost child). At split time, when one of those flags
/// is set, `IN.splitSpecial` forces the split index to 1 (left side) or
/// `nEntries - 1` (right side) instead of `nEntries / 2`.
///
/// Effect: for sequential-append workloads the left BIN stays near-full
/// after every split (only one entry migrates to the new sibling), cutting
/// the split count roughly in half and reducing write amplification.
///
/// The variants map onto the JE flags as follows: `AllLeft` ↔
/// `allLeftSideDescent`, `AllRight` ↔ `allRightSideDescent`, `Normal` ↔
/// neither flag set.
///
/// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SplitHint {
    /// Normal midpoint split (`n_entries / 2`).
    Normal,
    /// Key was at position 0 on every level of descent.
    /// → `split_index = 1` so left node keeps all but the first entry.
    AllLeft,
    /// Key was at the rightmost position on every level of descent.
    /// → `split_index = n_entries - 1` so left node keeps almost everything.
    AllRight,
}
2692
2693impl Tree {
2694    /// Creates a new empty tree.
2695    ///
2696    /// Constructor.
2697    pub fn new(database_id: u64, max_entries_per_node: usize) -> Self {
2698        Tree {
2699            database_id,
2700            max_entries_per_node,
2701            root: RwLock::new(None),
2702            root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2703            root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2704            root_splits: AtomicU64::new(0),
2705            relatches_required: AtomicU64::new(0),
2706            key_comparator: None,
2707            memory_counter: None,
2708            in_list_listener: None,
2709            log_manager: None,
2710            redo_capacity_hint: 0,
2711            key_prefixing: false, // JE default: KEY_PREFIXING_DEFAULT = false
2712            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2713        }
2714    }
2715
2716    /// Installs a shared memory counter for evictor / MemoryBudget feedback.
2717    ///
2718    /// → `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
2719    ///.  The counter is updated on every BIN entry insert/delete.
2720    pub fn set_memory_counter(&mut self, counter: Arc<AtomicI64>) {
2721        self.memory_counter = Some(counter);
2722    }
2723
2724    /// Installs the [`InListListener`] (the evictor) so node add/access/remove
2725    /// feed the LRU lists.  JE: `INList` registration that feeds
2726    /// `Evictor.addBack`/`moveBack`/`remove`.
2727    pub fn set_in_list_listener(&mut self, listener: Arc<dyn InListListener>) {
2728        self.in_list_listener = Some(listener);
2729    }
2730
2731    /// Installs the [`noxu_log::LogManager`] so an evicted root IN can be
2732    /// re-materialized from its persisted LSN on the next access (EV-14).
2733    ///
2734    /// JE: the tree reaches the log through `database.getEnv().getLogManager()`
2735    /// for `ChildReference.fetchTarget`.  Noxu installs it directly.
2736    pub fn set_log_manager(&mut self, lm: Arc<noxu_log::LogManager>) {
2737        self.log_manager = Some(lm);
2738    }
2739
2740    /// Drops this tree's `Arc<LogManager>` reference (EV-14 teardown).
2741    ///
2742    /// The env's `Drop` calls this on every tree it owns so the
2743    /// `Tree -> Arc<LogManager> -> Arc<FileManager>` chain cannot keep the
2744    /// FileManager (and its on-disk exclusive lock) alive past environment
2745    /// close.  After this the tree can no longer re-fetch an evicted root
2746    /// from the log — which is correct, because the environment is shutting
2747    /// down and the tree is about to be dropped.
2748    pub fn clear_log_manager(&mut self) {
2749        self.log_manager = None;
2750    }
2751
2752    /// T-5: set the compact-key threshold (`TREE_COMPACT_MAX_KEY_LENGTH` /
2753    /// `IN.getCompactMaxKeyLength`).  New BINs created by this tree inherit it;
2754    /// `<= 0` disables the compact key rep.  Default 16.
2755    pub fn set_compact_max_key_length(&mut self, len: i32) {
2756        self.compact_max_key_length = len;
2757    }
2758
2759    /// Notify the listener that a node became resident (JE `Evictor.addBack`).
2760    #[inline]
2761    fn note_added(&self, node_id: u64) {
2762        if let Some(l) = &self.in_list_listener {
2763            l.note_ins_added(node_id);
2764        }
2765    }
2766
2767    /// Notify the listener that a resident node was accessed
2768    /// (JE `Evictor.moveBack` — LRU touch).
2769    #[inline]
2770    fn note_accessed(&self, node_id: u64) {
2771        if let Some(l) = &self.in_list_listener {
2772            l.note_ins_accessed(node_id);
2773        }
2774    }
2775
2776    /// Notify the listener that a node was removed (JE `Evictor.remove`).
2777    #[inline]
2778    fn note_removed(&self, node_id: u64) {
2779        if let Some(l) = &self.in_list_listener {
2780            l.note_ins_removed(node_id);
2781        }
2782    }
2783
2784    /// Creates a new empty tree with a custom key comparator.
2785    ///
2786    /// Used for sorted-duplicate databases where keys are two-part
2787    /// composite keys that require a custom ordering function.
2788    ///
2789    /// Constructor with `btreeComparator` parameter.
2790    pub fn new_with_comparator(
2791        database_id: u64,
2792        max_entries_per_node: usize,
2793        comparator: KeyComparatorFn,
2794    ) -> Self {
2795        Tree {
2796            database_id,
2797            max_entries_per_node,
2798            root: RwLock::new(None),
2799            root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2800            root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2801            root_splits: AtomicU64::new(0),
2802            relatches_required: AtomicU64::new(0),
2803            key_comparator: Some(comparator),
2804            memory_counter: None,
2805            in_list_listener: None,
2806            log_manager: None,
2807            redo_capacity_hint: 0,
2808            key_prefixing: false,
2809            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2810        }
2811    }
2812
2813    /// Sets the key-prefixing flag.
2814    ///
2815    /// When `true`, BIN key-prefix compression is enabled: shared leading
2816    /// bytes are factored out of each slot's key.  When `false` (the
2817    /// default), keys are stored verbatim — matching JE
2818    /// `DatabaseConfig.setKeyPrefixing(false)` / `IN.computeKeyPrefix`
2819    /// returning `null`.
2820    ///
2821    /// Ref: `IN.java computeKeyPrefix` ~line 2456.
2822    pub fn set_key_prefixing(&mut self, enabled: bool) {
2823        self.key_prefixing = enabled;
2824    }
2825
2826    /// Sets the key comparator, replacing any existing one.
2827    pub fn set_comparator(&mut self, comparator: KeyComparatorFn) {
2828        self.key_comparator = Some(comparator);
2829    }
2830
2831    /// Store a capacity hint used by `redo_insert` when it creates the first
2832    /// BIN for this tree (the first-key path).
2833    ///
2834    /// The first BIN's `entries` Vec is pre-allocated with
2835    /// `capacity.min(max_entries_per_node)` slots, eliminating the
2836    /// Vec-resize doubling cycle (1 → 2 → 4 → … → cap) that would
2837    /// otherwise occur during the redo loop.
2838    ///
2839    /// Call once before the redo loop.  Has no effect on `insert` (the
2840    /// normal, non-recovery path).
2841    ///
2842    /// Wave 11-K optimisation (Fix 3).
2843    pub fn hint_redo_capacity(&mut self, capacity: usize) {
2844        self.redo_capacity_hint = capacity;
2845    }
2846
2847    /// Returns the current redo capacity hint (0 = no hint set).
2848    pub fn get_redo_capacity_hint(&self) -> usize {
2849        self.redo_capacity_hint
2850    }
2851
2852    /// Takes the key comparator out of this tree (leaving None).
2853    pub fn take_comparator(&mut self) -> Option<KeyComparatorFn> {
2854        self.key_comparator.take()
2855    }
2856
2857    /// Returns a reference to the key comparator, if configured.
2858    ///
2859    /// Used by `CursorImpl::find_bin_for_key` (R4 fix) so the cursor's own
2860    /// IN-level descent uses the same comparator-aware floor slot as the
2861    /// tree's own search paths. Mirrors JE `DatabaseImpl.getKeyComparator()`.
2862    pub fn get_comparator(&self) -> Option<&KeyComparatorFn> {
2863        self.key_comparator.as_ref()
2864    }
2865
2866    /// Returns the key comparator if set, or performs lexicographic comparison.
2867    #[inline]
2868    fn key_cmp(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering {
2869        match &self.key_comparator {
2870            Some(cmp) => cmp(a, b),
2871            None => a.cmp(b),
2872        }
2873    }
2874
2875    /// Floor child slot index for descending an internal node: the largest
2876    /// slot whose key is ≤ `key`. Slot 0 carries a virtual −∞ key (always
2877    /// qualifies); `entries[1..]` are sorted ascending, so this binary-searches
2878    /// the partition point instead of an O(n) linear walk (St-H4). Uses
2879    /// `key_cmp` so a configured custom comparator is honoured on every descent
2880    /// path. Returns 0 for an empty/single-slot node.
2881    fn upper_in_floor_index(&self, entries: &[InEntry], key: &[u8]) -> usize {
2882        if entries.len() <= 1 {
2883            return 0;
2884        }
2885        entries[1..].partition_point(|e| {
2886            self.key_cmp(e.key.as_slice(), key) != std::cmp::Ordering::Greater
2887        })
2888    }
2889
2890    /// Returns true if the tree has no root (is empty).
2891    pub fn is_empty(&self) -> bool {
2892        self.root.read().is_none()
2893    }
2894
2895    /// Sets the root of the tree.
2896    ///
2897    /// Must hold root_latch exclusively before calling.
2898    pub fn set_root(&self, node: TreeNode) {
2899        *self.root.write() = Some(Arc::new(RwLock::new(node)));
2900    }
2901
2902    /// Returns the root Arc, if any.
2903    ///
2904    /// Returns a cloned `Arc` rather than a reference so the caller does not
2905    /// hold the inner `RwLock` guard.
2906    ///
2907    /// EV-14: when the in-memory root has been evicted (`evict_root`) but a
2908    /// persisted version exists (`root_log_lsn` set), this re-materializes it
2909    /// from the log before returning — the faithful equivalent of JE
2910    /// `Tree.getRootIN` always calling `root.fetchTarget(...)`.  Returns
2911    /// `None` only for a genuinely empty tree (no resident root and no
2912    /// persisted root LSN).
2913    pub fn get_root(&self) -> Option<Arc<RwLock<TreeNode>>> {
2914        if let Some(r) = self.root.read().clone() {
2915            return Some(r);
2916        }
2917        // Root not resident: re-fetch it from `root_log_lsn` if one exists
2918        // (a no-op returning None when the tree was never populated).
2919        self.fetch_root_from_log()
2920    }
2921
2922    /// Returns the database ID.
2923    pub fn get_database_id(&self) -> u64 {
2924        self.database_id
2925    }
2926
2927    /// Count the total number of live (non-deleted) entries across all BINs.
2928    ///
2929    /// Used by `DatabaseImpl::set_recovered_tree()` to initialise the
2930    /// per-database `entry_count` AtomicU64 after recovery replays the log.
2931    pub fn count_entries(&self) -> u64 {
2932        let mut total = 0u64;
2933        if let Some(root) = self.get_root() {
2934            Self::count_entries_recursive(&root, &mut total);
2935        }
2936        total
2937    }
2938
2939    /// DBI-14: collect every live `(full_key, data, lsn)` triple in physical
2940    /// (left-to-right) order.  Used by `resort_under_comparator` to rebuild a
2941    /// tree whose slots were laid out in byte order (e.g. by recovery redo,
2942    /// which has no access to the application comparator) under the real
2943    /// configured comparator.
2944    fn collect_all_entries(&self) -> Vec<(Vec<u8>, Vec<u8>, Lsn)> {
2945        let mut out = Vec::new();
2946        if let Some(root) = self.get_root() {
2947            Self::collect_all_entries_recursive(&root, &mut out);
2948        }
2949        out
2950    }
2951
2952    fn collect_all_entries_recursive(
2953        node_arc: &Arc<RwLock<TreeNode>>,
2954        out: &mut Vec<(Vec<u8>, Vec<u8>, Lsn)>,
2955    ) {
2956        let guard = node_arc.read();
2957        match &*guard {
2958            TreeNode::Bottom(b) => {
2959                for i in 0..b.entries.len() {
2960                    if b.entries[i].known_deleted {
2961                        continue;
2962                    }
2963                    if let Some(fk) = b.get_full_key(i) {
2964                        let data =
2965                            b.entries[i].data.clone().unwrap_or_default();
2966                        out.push((fk, data, b.get_lsn(i)));
2967                    }
2968                }
2969            }
2970            TreeNode::Internal(n) => {
2971                let children: Vec<Arc<RwLock<TreeNode>>> =
2972                    n.resident_children();
2973                drop(guard);
2974                for child in &children {
2975                    Self::collect_all_entries_recursive(child, out);
2976                }
2977            }
2978        }
2979    }
2980
2981    /// DBI-14: rebuild this tree so that its on-disk byte-ordered slot layout
2982    /// is re-sorted under the currently-configured key comparator.
2983    ///
2984    /// Recovery redo (`redo_insert`) has no access to the application's
2985    /// comparator function — only the persisted identity — so it lays keys
2986    /// out in unsigned-byte order.  After `set_recovered_tree` attaches the
2987    /// real comparator, the slots must be re-sorted, or comparator-driven
2988    /// searches would binary-search a tree ordered by the wrong relation.
2989    ///
2990    /// No-op when no comparator is configured (byte order already matches the
2991    /// recovered layout) or when the tree is empty.  Mirrors the effect of
2992    /// JE reconstructing the comparator at open and the tree always having
2993    /// been built under it.
2994    pub fn resort_under_comparator(&self) {
2995        if self.key_comparator.is_none() {
2996            return;
2997        }
2998        let entries = self.collect_all_entries();
2999        if entries.is_empty() {
3000            return;
3001        }
3002        // Drop the current root; re-insert every entry through the normal
3003        // comparator-aware insert path so the new layout obeys the comparator.
3004        *self.root.write() = None;
3005        *self.root_log_lsn.write() = noxu_util::NULL_LSN;
3006        for (key, data, lsn) in entries {
3007            // Best-effort: a failed re-insert would be a tree-structure bug;
3008            // surface it loudly in debug builds.
3009            let r = self.insert(key, data, lsn);
3010            debug_assert!(
3011                r.is_ok(),
3012                "resort_under_comparator: re-insert failed: {r:?}"
3013            );
3014        }
3015    }
3016
3017    fn count_entries_recursive(
3018        node_arc: &Arc<RwLock<TreeNode>>,
3019        total: &mut u64,
3020    ) {
3021        let guard = node_arc.read();
3022        match &*guard {
3023            TreeNode::Bottom(b) => {
3024                // Count only live (non-known_deleted) entries.
3025                *total += b.entries.iter().filter(|e| !e.known_deleted).count()
3026                    as u64;
3027            }
3028            TreeNode::Internal(n) => {
3029                let children: Vec<Arc<RwLock<TreeNode>>> =
3030                    n.resident_children();
3031                drop(guard);
3032                for child in children {
3033                    Self::count_entries_recursive(&child, total);
3034                }
3035            }
3036        }
3037    }
3038
3039    /// Sum the real in-memory heap footprint of every resident node in the
3040    /// tree (DBI-23 oracle / reconciliation), in bytes.
3041    ///
3042    /// Walks all resident IN/BIN nodes and adds each node's
3043    /// `budgeted_memory_size` (JE `IN.getBudgetedMemorySize`).  This is the
3044    /// authoritative "real heap" figure the incrementally-maintained
3045    /// `memory_counter` is meant to approximate; an engine can call it to
3046    /// reconcile counter drift, and the DBI-23 test uses it as the oracle the
3047    /// live counter must stay within tolerance of.
3048    pub fn total_budgeted_memory(&self) -> u64 {
3049        let mut total = 0u64;
3050        if let Some(root) = self.get_root() {
3051            Self::total_budgeted_memory_recursive(&root, &mut total);
3052        }
3053        total
3054    }
3055
3056    fn total_budgeted_memory_recursive(
3057        node_arc: &Arc<RwLock<TreeNode>>,
3058        total: &mut u64,
3059    ) {
3060        let guard = node_arc.read();
3061        *total += guard.budgeted_memory_size();
3062        if let TreeNode::Internal(n) = &*guard {
3063            let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
3064            drop(guard);
3065            for child in children {
3066                Self::total_budgeted_memory_recursive(&child, total);
3067            }
3068        }
3069    }
3070
    /// Search for a BIN that should contain the given key.
    ///
    /// This is the core tree traversal operation. It walks from root to BIN
    /// using latch-coupling (acquire child latch, then release parent latch).
    ///
    /// Descends the tree until a BIN is reached, following the child
    /// pointer at the slot whose key is the largest key <= the search key
    /// (the "LTE" rule).  Slot 0 in every upper IN carries a virtual key
    /// (-infinity) so any search key routes through it when all real keys
    /// are larger.
    ///
    /// Returns a `SearchResult` built from `(found, raw_idx, false)`:
    /// `found` is true only for an exact match on a slot that
    /// `slot_is_live` accepts, and `raw_idx` is the slot index reported by
    /// the BIN lookup.
    ///
    /// Returns `None` if the tree is empty, if an upper IN on the path has
    /// no entries, or if a non-resident child cannot be obtained through
    /// `child_at_or_fetch`.
    pub fn search(&self, key: &[u8]) -> Option<SearchResult> {
        let root = self.get_root()?;

        // Hand-over-hand latch coupling for the descent. At each level we
        // hold a `parking_lot::ArcRwLockReadGuard` on the current node;
        // before dropping it, we acquire the child's read guard via
        // `Arc::read_arc`. This keeps a continuous chain of read locks
        // along the descent path so that no concurrent `split_child(parent,
        // …)` can run on a node we are about to enter — `split_child` takes
        // `parent.write()` to install the new sibling, and that write
        // blocks while we hold `parent.read()`. Without this, the prior
        // pattern (capture child Arc, drop parent guard, then take child
        // read lock) left a window in which a split could relocate the
        // child entries: a search for a key that should have ended up in
        // the new sibling would instead reach the (now left-half) child
        // and return a false `NotFound`.
        //
        // `read_arc()` returns `ArcRwLockReadGuard<RawRwLock, TreeNode>`
        // — a guard that owns its own Arc reference, so it has no
        // borrow lifetime and can be held across loop iterations and
        // assignment.
        let mut guard: parking_lot::ArcRwLockReadGuard<
            parking_lot::RawRwLock,
            TreeNode,
        > = root.read_arc();

        loop {
            if guard.is_bin() {
                // JE: IN.fetchTarget / CursorImpl access moves the reached
                // BIN toward the hot end of the evictor's LRU list
                // (Evictor.moveBack).  A freshly split BIN that has not yet
                // been registered is added here (moveBack is add-if-absent).
                if let TreeNode::Bottom(bin) = &*guard {
                    self.note_accessed(bin.node_id);
                }
                // Reached a BIN: final key lookup within the same guard.
                // Use indicate_if_duplicate=true so an exact match sets
                // EXACT_MATCH in the return value.  Guard against -1 (not
                // found): -1i32 has all bits set, so the naive
                // `index & EXACT_MATCH != 0` check would incorrectly report
                // an exact match for a missing key.
                //
                // With a comparator configured the lookup goes through
                // `find_entry_cmp`, which reports the exact-match flag
                // separately, so no bit masking is needed on that path.
                //
                // NOTE(review): on the `find_entry` paths a miss reported
                // as -1 leaves `raw_idx == 0xFFFF` after the `& 0xFFFF`
                // mask — confirm callers expect that value for a miss.
                let (found, raw_idx) = match &*guard {
                    TreeNode::Bottom(bin) => match &self.key_comparator {
                        Some(cmp) => {
                            let (idx, exact) =
                                bin.find_entry_cmp(key, cmp.as_ref());
                            (exact, idx as i32)
                        }
                        None => {
                            let index = guard.find_entry(key, true, true);
                            let exact =
                                index >= 0 && (index & EXACT_MATCH != 0);
                            (exact, index & 0xFFFF)
                        }
                    },
                    // Fallback for a node that reports `is_bin()` without
                    // being the `Bottom` variant: same lookup as the
                    // comparator-less BIN path.
                    _ => {
                        let index = guard.find_entry(key, true, true);
                        let exact = index >= 0 && (index & EXACT_MATCH != 0);
                        (exact, index & 0xFFFF)
                    }
                };
                // CursorImpl.isProbablyExpired(): if an exact match
                // was found, check whether the entry's TTL has already elapsed.
                // If it has, treat the slot as not found so callers skip it.
                //
                // TREE-F1: also treat a known_deleted slot as ABSENT on an
                // exact lookup, mirroring the tail of IN.findEntry
                // (IN.java:3197): `if (ret >= 0 && exact &&
                // isEntryKnownDeleted(ret & 0xffff)) return -1;`.  KD slots
                // legitimately exist in live BINs during BIN-delta
                // reconstitution until the compressor reclaims them.
                //
                // NOTE(review): the slot index is recovered with a 0x7FFF
                // mask here while the lookup above masks with 0xFFFF —
                // confirm the narrower mask is intentional.
                let found = if found {
                    if let TreeNode::Bottom(bin) = &*guard {
                        let idx = (raw_idx & 0x7FFF) as usize;
                        bin.slot_is_live(idx)
                    } else {
                        found
                    }
                } else {
                    found
                };
                return Some(SearchResult::with_values(found, raw_idx, false));
            }

            // Upper IN: find the child slot with the largest key <= search
            // key, and capture the child Arc WHILE HOLDING the guard.
            // Slot 0 has a virtual key that compares as -infinity.
            //
            // `parent_arc` is cloned up front because the guard is dropped
            // before `child_at_or_fetch` needs the parent again.
            let parent_arc =
                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
            // Despite its name, `next_arc` ends up holding the *slot index*
            // of a non-resident child; the resident case never leaves the
            // match (it `continue`s with the child already latched).
            let next_arc = match &*guard {
                TreeNode::Internal(n) => {
                    if n.entries.is_empty() {
                        return None;
                    }
                    // Pick the routing slot for `key`; slot 0 always
                    // qualifies because its key is the virtual -infinity
                    // key.
                    let idx = self.upper_in_floor_index(&n.entries, key);
                    match n.get_child(idx) {
                        // Resident child: keep the hand-over-hand fast path.
                        Some(c) => {
                            let next_guard = c.read_arc();
                            drop(guard);
                            guard = next_guard;
                            continue;
                        }
                        // EV-14/EV-13: child evicted — re-fetch it from its
                        // slot LSN (JE ChildReference.fetchTarget).  Must
                        // drop the parent read guard to upgrade to a write
                        // latch inside child_at_or_fetch.
                        None => idx,
                    }
                }
                TreeNode::Bottom(_) => {
                    unreachable!("is_bin() returned false above")
                }
            };
            // Slow path: release the parent, obtain the child through
            // `child_at_or_fetch`, then latch it.  No latch is held across
            // the fetch.
            drop(guard);
            let child = self.child_at_or_fetch(&parent_arc, next_arc)?;
            guard = child.read_arc();
        }
    }
3206
3207    /// Combined search-and-fetch: descend once to the BIN and return the
3208    /// slot's data together with a reference to the BIN arc.
3209    ///
3210    /// Replaces the previous three-descent sequence on the `Database::get`
3211    /// hot path:
3212    ///   1. `Tree::search` — existence check only.
3213    ///   2. `CursorImpl::get_data_from_tree` — re-descended to fetch data.
3214    ///   3. `CursorImpl::find_bin_for_key` — re-descended for BIN pinning.
3215    ///
3216    /// One descent now does all three jobs.  At the BIN level it uses the
3217    /// existing binary-search helper `find_entry_compressed` instead of the
3218    /// O(n) `iter().find()` used by `get_data_from_tree`.
3219    ///
3220    /// Returns `None` only when the tree is empty.  Otherwise returns
3221    /// `Some(SlotFetch)` — callers must inspect `SlotFetch::found` to
3222    /// determine whether the key was present.  The BIN read-guard is released
3223    /// before this method returns so callers may safely call `lock_ln`
3224    /// (which may block) without holding any tree latch.
3225    ///
3226    /// Wave-11-I — see the 2026 review.
3227    pub fn search_with_data(&self, key: &[u8]) -> Option<SlotFetch> {
3228        let root = self.get_root()?;
3229        let mut guard: parking_lot::ArcRwLockReadGuard<
3230            parking_lot::RawRwLock,
3231            TreeNode,
3232        > = root.read_arc();
3233
3234        loop {
3235            if guard.is_bin() {
3236                // Capture the BIN Arc before inspecting entries.
3237                let bin_arc =
3238                    parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3239
3240                let (found, data, lsn, slot_index) = match &*guard {
3241                    TreeNode::Bottom(bin) => {
3242                        let (idx, exact) = match &self.key_comparator {
3243                            Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3244                            None => bin.find_entry_compressed(key),
3245                        };
3246                        if exact {
3247                            // TREE-F1: a slot is reported as found only when
3248                            // live (not known_deleted, not TTL-expired) — the
3249                            // same predicate used by Tree::search and the
3250                            // cursor scan.  Mirrors IN.findEntry (IN.java:3197)
3251                            // and CursorImpl.isProbablyExpired.
3252                            if bin.slot_is_live(idx) {
3253                                let lsn = bin.get_lsn(idx); // T-3
3254                                let e = &bin.entries[idx];
3255                                (true, e.data.clone(), lsn.as_u64(), idx)
3256                            } else {
3257                                (false, None, 0u64, 0)
3258                            }
3259                        } else {
3260                            (false, None, 0u64, 0)
3261                        }
3262                    }
3263                    _ => (false, None, 0u64, 0),
3264                };
3265                // Release the BIN read guard before returning so the caller
3266                // can call lock_ln (which may block) without holding a latch.
3267                drop(guard);
3268                return Some(SlotFetch {
3269                    found,
3270                    data,
3271                    lsn,
3272                    slot_index,
3273                    bin_arc,
3274                });
3275            }
3276
3277            // Upper IN: same hand-over-hand descent as `Tree::search`.
3278            let parent_arc =
3279                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3280            let next_idx = match &*guard {
3281                TreeNode::Internal(n) => {
3282                    if n.entries.is_empty() {
3283                        return None;
3284                    }
3285                    // Slot 0 = virtual −∞; walk forward while entry.key ≤ key.
3286                    let idx = self.upper_in_floor_index(&n.entries, key);
3287                    match n.get_child(idx) {
3288                        Some(c) => {
3289                            let next_guard = c.read_arc();
3290                            drop(guard);
3291                            guard = next_guard;
3292                            continue;
3293                        }
3294                        // EV-14/EV-13: re-fetch an evicted child from its LSN.
3295                        None => idx,
3296                    }
3297                }
3298                TreeNode::Bottom(_) => {
3299                    unreachable!("is_bin() returned false above")
3300                }
3301            };
3302            drop(guard);
3303            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3304            guard = child.read_arc();
3305        }
3306    }
3307
3308    /// Sets the expiration time (in absolute hours since Unix epoch) for an
3309    /// existing key's BIN slot.
3310    ///
3311    /// Returns `true` if the key was found and updated, `false` otherwise.
3312    ///
3313    /// Used by `Database::put_with_options()` to apply per-record TTL.
3314    /// `IN.entryExpiration` / `BIN.expirationInHours` path.
3315    pub fn update_key_expiration(
3316        &self,
3317        key: &[u8],
3318        expiration_hours: u32,
3319    ) -> bool {
3320        let root = match self.get_root() {
3321            Some(r) => r,
3322            None => return false,
3323        };
3324        // Hand-over-hand latch coupling for the descent. At the BIN we
3325        // need a write lock; we drop our read lock first and take the
3326        // write lock under the protection of the *outer* parent's read
3327        // lock (held by the previous loop iteration's guard). For the
3328        // first iteration there is no outer parent, but no `split_child`
3329        // can run on the root itself in that single-level case because
3330        // root splits go through `split_root_if_needed` which holds
3331        // `self.root.write()`. So the worst case is that the root is
3332        // promoted from a single BIN to a level-2 IN between our read
3333        // detect and our write — handled by the `is_bin` re-check
3334        // inside the write lock.
3335        //
3336        // We retry the descent up to a small bound to absorb the rare
3337        // case where a concurrent split moved this key into the new
3338        // sibling between the read-chain release and the write-lock
3339        // acquisition. Without the retry, the sole caller
3340        // (`Database::put_with_options`) would silently lose the TTL
3341        // for the affected key. Three attempts is generous: each
3342        // retry only races a single split and splits are infrequent.
3343        for _ in 0..3 {
3344            let mut guard: parking_lot::ArcRwLockReadGuard<
3345                parking_lot::RawRwLock,
3346                TreeNode,
3347            > = root.read_arc();
3348            let bin_arc;
3349            loop {
3350                if guard.is_bin() {
3351                    bin_arc =
3352                        parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3353                    drop(guard);
3354                    break;
3355                }
3356                let next_arc = match &*guard {
3357                    TreeNode::Internal(n) => {
3358                        if n.entries.is_empty() {
3359                            return false;
3360                        }
3361                        let idx = self.upper_in_floor_index(&n.entries, key);
3362                        match n.get_child(idx) {
3363                            Some(c) => c,
3364                            None => return false,
3365                        }
3366                    }
3367                    TreeNode::Bottom(_) => unreachable!(),
3368                };
3369                let next_guard = next_arc.read_arc();
3370                drop(guard);
3371                guard = next_guard;
3372            }
3373
3374            // Now take the write lock on the BIN we descended to.
3375            let mut wguard = bin_arc.write();
3376            if let TreeNode::Bottom(bin) = &mut *wguard {
3377                let slot = if let Some(cmp) = &self.key_comparator {
3378                    let (idx, exact) = bin.find_entry_cmp(key, cmp.as_ref());
3379                    if exact { Some(idx) } else { None }
3380                } else {
3381                    let (idx, exact) = bin.find_entry_compressed(key);
3382                    if exact { Some(idx) } else { None }
3383                };
3384                if let Some(slot_idx) = slot
3385                    && let Some(entry) = bin.entries.get_mut(slot_idx)
3386                {
3387                    entry.expiration_time = expiration_hours;
3388                    bin.expiration_in_hours = true;
3389                    bin.dirty = true;
3390                    return true;
3391                }
3392            }
3393            // Key not in this BIN — either it was never present or a
3394            // concurrent split moved it. Retry the descent; at most a
3395            // few iterations are needed to follow the key into its new
3396            // BIN.
3397        }
3398        false
3399    }
3400
3401    /// Returns the key and data of the first BIN entry at or after `key`.
3402    ///
3403    /// Descends with the tree's key comparator (same path as `search()`), then
3404    /// within the BIN finds the first slot whose stored key >= `key` using the
3405    /// comparator.  Returns `None` if every entry in the tree is < `key`.
3406    ///
3407    /// Used by sorted-duplicate cursor `search(Set)` to position at the first
3408    /// (key, data) pair whose two-part key >= `lower_bound(primary_key)`.
3409    ///
3410    /// → BIN scan path.
3411    pub fn first_entry_at_or_after(
3412        &self,
3413        key: &[u8],
3414    ) -> Option<(Vec<u8>, Vec<u8>, u64)> {
3415        // Hand-over-hand latch coupling — see Tree::search for the
3416        // detailed rationale on why this closes a reader-vs-splitter
3417        // race window.
3418        let mut guard: parking_lot::ArcRwLockReadGuard<
3419            parking_lot::RawRwLock,
3420            TreeNode,
3421        > = self.get_root()?.read_arc();
3422
3423        loop {
3424            if guard.is_bin() {
3425                let result = match &*guard {
3426                    TreeNode::Bottom(bin) => {
3427                        let (mut idx, _exact) = match &self.key_comparator {
3428                            Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3429                            None => bin.find_entry_compressed(key),
3430                        };
3431                        // TREE-F1: skip non-live slots (known_deleted /
3432                        // TTL-expired) at/after the floor index, mirroring the
3433                        // cursor getNext skip (CursorImpl.java:2062-2064).
3434                        while idx < bin.entries.len() && !bin.slot_is_live(idx)
3435                        {
3436                            idx += 1;
3437                        }
3438                        if idx < bin.entries.len() {
3439                            let full_key =
3440                                bin.get_full_key(idx).unwrap_or_default();
3441                            let data = bin.entries[idx]
3442                                .data
3443                                .clone()
3444                                .unwrap_or_default();
3445                            let lsn = bin.get_lsn(idx).as_u64(); // T-3
3446                            Some((full_key, data, lsn))
3447                        } else {
3448                            None
3449                        }
3450                    }
3451                    _ => None,
3452                };
3453                return result;
3454            }
3455
3456            // Upper IN: same descent as search().
3457            let parent_arc =
3458                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3459            let next_idx = match &*guard {
3460                TreeNode::Internal(n) => {
3461                    if n.entries.is_empty() {
3462                        return None;
3463                    }
3464                    let idx = self.upper_in_floor_index(&n.entries, key);
3465                    match n.get_child(idx) {
3466                        Some(c) => {
3467                            let next_guard = c.read_arc();
3468                            drop(guard);
3469                            guard = next_guard;
3470                            continue;
3471                        }
3472                        None => idx, // EV-14/EV-13: re-fetch below.
3473                    }
3474                }
3475                TreeNode::Bottom(_) => unreachable!(),
3476            };
3477            drop(guard);
3478            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3479            guard = child.read_arc();
3480        }
3481    }
3482
3483    /// Like [`Tree::first_entry_at_or_after`] but also returns the BIN node
3484    /// (so callers may pin it) and the entry's slot index inside that
3485    /// BIN.
3486    ///
3487    /// Wave 11-N (Bug 2): `CursorImpl::search_dup` previously stored
3488    /// `current_index = 0` after a sorted-dup `Search`, which broke the
3489    /// fast-path of `retrieve_next` (and the slow path's
3490    /// `next_index = current_index + 1` arithmetic) for any primary
3491    /// that was not the first slot of its BIN.  This helper hands back
3492    /// the real index so the cursor can be positioned correctly.
3493    ///
3494    /// CC-2 fix: uses the same `read_arc()` hand-over-hand latch coupling
3495    /// as every other descent method (`search`, `first_entry_at_or_after`,
3496    /// `get_first_node`, `get_adjacent_bin_attempt`).  The original
3497    /// implementation did `arc.read().is_bin()` (lock acquired and released)
3498    /// then a SECOND `arc.read()` on the next line — a gap in which a
3499    /// concurrent split can promote the node (BIN→upper IN) or move the
3500    /// sought key to a new sibling, yielding a false "not found" for an
3501    /// existing key.  Mirrors JE `Tree.searchSubTree` / `Tree.search`
3502    /// which hold the latch across the `is_bin()` test and the subsequent
3503    /// entry lookup.
3504    pub fn first_entry_at_or_after_with_index(
3505        &self,
3506        key: &[u8],
3507    ) -> Option<(
3508        Vec<u8>,
3509        Vec<u8>,
3510        usize,
3511        u64,
3512        std::sync::Arc<crate::NodeRwLock<TreeNode>>,
3513    )> {
3514        // Hand-over-hand latch coupling — identical strategy to
3515        // first_entry_at_or_after; the guard is held continuously across
3516        // is_bin() and the subsequent entry lookup so no split can
3517        // restructure the path between the two observations.
3518        let mut guard: parking_lot::ArcRwLockReadGuard<
3519            parking_lot::RawRwLock,
3520            TreeNode,
3521        > = self.get_root()?.read_arc();
3522        loop {
3523            if guard.is_bin() {
3524                if let TreeNode::Bottom(bin) = &*guard {
3525                    let (idx, _exact) = match &self.key_comparator {
3526                        Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3527                        None => bin.find_entry_compressed(key),
3528                    };
3529                    // TREE-F1: skip non-live slots (known_deleted /
3530                    // TTL-expired) at/after the floor index
3531                    // (CursorImpl.java:2062-2064).
3532                    let mut idx = idx;
3533                    while idx < bin.entries.len() && !bin.slot_is_live(idx) {
3534                        idx += 1;
3535                    }
3536                    if idx < bin.entries.len() {
3537                        let full_key =
3538                            bin.get_full_key(idx).unwrap_or_default();
3539                        let data =
3540                            bin.entries[idx].data.clone().unwrap_or_default();
3541                        let lsn = bin.get_lsn(idx).as_u64(); // T-3
3542                        // Obtain the Arc for the BIN node the guard came from.
3543                        // `ArcRwLockReadGuard::rwlock()` returns the backing Arc.
3544                        let bin_arc =
3545                            parking_lot::ArcRwLockReadGuard::rwlock(&guard)
3546                                .clone();
3547                        return Some((full_key, data, idx, lsn, bin_arc));
3548                    } else {
3549                        return None;
3550                    }
3551                }
3552                return None;
3553            }
3554
3555            // Upper IN: descend as in first_entry_at_or_after / search.
3556            let parent_arc =
3557                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3558            let next_idx = match &*guard {
3559                TreeNode::Internal(n) => {
3560                    if n.entries.is_empty() {
3561                        return None;
3562                    }
3563                    let idx = self.upper_in_floor_index(&n.entries, key);
3564                    match n.get_child(idx) {
3565                        Some(c) => {
3566                            let next_guard = c.read_arc();
3567                            drop(guard);
3568                            guard = next_guard;
3569                            continue;
3570                        }
3571                        None => idx, // EV-14/EV-13: re-fetch below.
3572                    }
3573                }
3574                TreeNode::Bottom(_) => unreachable!(),
3575            };
3576            drop(guard);
3577            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3578            guard = child.read_arc();
3579        }
3580    }
3581
3582    /// Insert a key/data pair into the tree.
3583    ///
    /// Handles the root-is-null case by
3585    /// creating a two-level tree (upper IN + BIN) per initialisation path,
3586    /// then delegates to `insert_recursive` which performs preemptive splitting
3587    /// as it descends.
3588    ///
3589    /// Returns Ok(true) if this was a new insert, Ok(false) if it was an update.
3590    pub fn insert(
3591        &self,
3592        key: Vec<u8>,
3593        data: Vec<u8>,
3594        lsn: Lsn,
3595    ) -> Result<bool, TreeError> {
3596        // Save sizes before potentially moving key/data — needed for memory tracking.
3597        let key_len = key.len();
3598        let data_len = data.len();
3599
3600        // First-key path. We MUST hold the write lock while testing
3601        // root.is_none() and replacing the root, otherwise N threads can all
3602        // observe an empty tree, each build a fresh single-entry root, and
3603        // the last writer's `*self.root.write() = Some(...)` silently
3604        // discards the others' inserts. (Reproducer:
3605        // xa_protocol_test::test_concurrent_independent_xids — 8 threads
3606        // each inserting their own key into an empty tree lost ~30% of
3607        // inserts before this lock change.)
3608        {
3609            let mut root_guard = self.root.write();
3610            if root_guard.is_none() {
3611                let bin_node_id = generate_node_id();
3612                let root_node_id = generate_node_id();
3613                let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3614                    node_id: bin_node_id,
3615                    level: BIN_LEVEL,
3616                    entries: vec![BinEntry {
3617                        data: Some(data),
3618                        known_deleted: false,
3619                        dirty: false,
3620                        expiration_time: 0,
3621                    }],
3622                    key_prefix: Vec::new(), // single entry — no common prefix yet
3623                    dirty: true,
3624                    is_delta: false,
3625                    last_full_lsn: NULL_LSN,
3626                    last_delta_lsn: NULL_LSN,
3627                    generation: 0,
3628                    parent: None, // set below after root_in is created
3629                    // St-H6: use true to match the engine-wide invariant that
3630                    // every BIN which may hold TTL entries uses hours granularity
3631                    // (JE BIN.java default; matches tree.rs:980 and read_from_log).
3632                    expiration_in_hours: true,
3633                    cursor_count: 0,
3634                    prohibit_next_delta: false,
3635                    lsn_rep: LsnRep::from_lsns(&[lsn]),
3636                    keys: KeyRep::from_keys(vec![key]), // T-2
3637                    compact_max_key_length: self.compact_max_key_length,
3638                })));
3639
3640                // Upper IN at level 2; slot 0 uses an empty key (virtual root key).
3641                let root_arc =
3642                    Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3643                        node_id: root_node_id,
3644                        level: MAIN_LEVEL | 2,
3645                        entries: vec![InEntry {
3646                            key: vec![], // virtual key for slot 0 in upper IN
3647                        }],
3648                        // T-4: the single resident child at slot 0.
3649                        targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3650                        dirty: true,
3651                        generation: 0,
3652                        parent: None,
3653                        lsn_rep: LsnRep::from_lsns(&[lsn]),
3654                    })));
3655
3656                // Wire the BIN's parent pointer back to the root IN.
3657                {
3658                    let mut g = bin.write();
3659                    g.set_parent(Some(Arc::downgrade(&root_arc)));
3660                }
3661
3662                *root_guard = Some(root_arc);
3663
3664                // JE: IN.fetchTarget / initial tree build registers the new
3665                // resident nodes with the evictor (Evictor.addBack).
3666                self.note_added(root_node_id);
3667                self.note_added(bin_node_id);
3668
3669                // Count the first entry.
3670                if let Some(counter) = &self.memory_counter {
3671                    let delta =
3672                        (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3673                    counter.fetch_add(delta, Ordering::Relaxed);
3674                }
3675                return Ok(true);
3676            }
3677            // Another thread initialized the root while we were waiting for
3678            // the write lock; fall through and insert into the existing tree.
3679        }
3680
3681        // Check whether the root itself needs to be split before descending.
3682        // Tree.searchSplitsAllowed(): if rootIN.needsSplitting()
3683        // call splitRoot first.
3684        self.split_root_if_needed(lsn)?;
3685
3686        // Recursively insert, splitting children proactively as we descend
3687        // (forceSplit / searchSplitsAllowed pattern).
3688        let root_arc = self.get_root().unwrap();
3689        let result = Self::insert_recursive(
3690            &root_arc,
3691            key,
3692            data,
3693            lsn,
3694            self.max_entries_per_node,
3695            self.key_comparator.as_ref(),
3696            self.key_prefixing,
3697            self.in_list_listener.as_ref(),
3698        )?;
3699
3700        // Update the memory counter for new inserts.
3701        // IN.updateMemorySize(delta) → MemoryBudget.updateTreeMemoryUsage(delta).
3702        // LN_OVERHEAD = 48 bytes (approximate fixed overhead per entry).
3703        if result && let Some(counter) = &self.memory_counter {
3704            let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3705            counter.fetch_add(delta, Ordering::Relaxed);
3706        }
3707
3708        Ok(result)
3709    }
3710
3711    /// Recovery-redo variant of [`Tree::insert`] that accepts `&[u8]` slices.
3712    ///
3713    /// Eliminates the two intermediate `Vec<u8>` allocations that the normal
3714    /// insert path requires at the `redo_ln` call site (one for the key, one
3715    /// for the data).  The compressed key suffix and the data bytes are each
3716    /// materialised into their `BinEntry` slots exactly once.
3717    ///
3718    /// Semantics are identical to `insert`:
3719    /// - Updates the existing slot when the key is already present.
3720    /// - Inserts a new sorted entry when the key is absent.
3721    /// - Triggers the same root-split and proactive-split logic.
3722    ///
3723    /// `data` should be the raw value bytes, or an empty slice for a
3724    /// deletion (which should not normally arrive here during redo, but is
3725    /// handled gracefully).
3726    ///
3727    /// Wave 11-K optimisation (Fix 1).
3728    pub fn redo_insert(
3729        &self,
3730        key: &[u8],
3731        data: &[u8],
3732        lsn: Lsn,
3733    ) -> Result<bool, TreeError> {
3734        let key_len = key.len();
3735        let data_len = data.len();
3736        let data_opt: Option<&[u8]> =
3737            if data.is_empty() { None } else { Some(data) };
3738
3739        // First-key path: initialise a two-level tree from scratch.
3740        {
3741            let mut root_guard = self.root.write();
3742            if root_guard.is_none() {
3743                // Pre-allocate the BIN's entries Vec using the redo capacity
3744                // hint (Fix 3).  Without the hint the first BIN starts at
3745                // capacity 1 and doubles on each insert; with the hint it
3746                // starts at min(hint, max_entries) entries, eliminating
3747                // ~log2(max_entries) Vec-resize doublings.
3748                let initial_cap = if self.redo_capacity_hint > 0 {
3749                    self.redo_capacity_hint.min(self.max_entries_per_node)
3750                } else {
3751                    1
3752                };
3753                let mut initial_entries = Vec::with_capacity(initial_cap);
3754                initial_entries.push(BinEntry {
3755                    data: data_opt.map(|d| d.to_vec()),
3756                    known_deleted: false,
3757                    dirty: false,
3758                    expiration_time: 0,
3759                });
3760                let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3761                    node_id: generate_node_id(),
3762                    level: BIN_LEVEL,
3763                    entries: initial_entries,
3764                    key_prefix: Vec::new(),
3765                    dirty: true,
3766                    is_delta: false,
3767                    last_full_lsn: NULL_LSN,
3768                    last_delta_lsn: NULL_LSN,
3769                    generation: 0,
3770                    parent: None,
3771                    // St-H6: use true to match the engine-wide hours-only
3772                    // invariant (JE BIN.java default; matches tree.rs:980).
3773                    expiration_in_hours: true,
3774                    cursor_count: 0,
3775                    prohibit_next_delta: false,
3776                    lsn_rep: LsnRep::from_lsns(&[lsn]),
3777                    keys: KeyRep::from_keys(vec![key.to_vec()]), // T-2
3778                    compact_max_key_length: self.compact_max_key_length,
3779                })));
3780
3781                let root_arc =
3782                    Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3783                        node_id: generate_node_id(),
3784                        level: MAIN_LEVEL | 2,
3785                        entries: vec![InEntry { key: vec![] }],
3786                        // T-4: the single resident child at slot 0.
3787                        targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3788                        dirty: true,
3789                        generation: 0,
3790                        parent: None,
3791                        lsn_rep: LsnRep::from_lsns(&[lsn]),
3792                    })));
3793
3794                {
3795                    let mut g = bin.write();
3796                    g.set_parent(Some(Arc::downgrade(&root_arc)));
3797                }
3798
3799                *root_guard = Some(root_arc);
3800
3801                if let Some(counter) = &self.memory_counter {
3802                    let delta =
3803                        (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3804                    counter.fetch_add(delta, Ordering::Relaxed);
3805                }
3806                return Ok(true);
3807            }
3808        }
3809
3810        self.split_root_if_needed(lsn)?;
3811
3812        let root_arc = self.get_root().unwrap();
3813        let result = Self::redo_insert_recursive(
3814            &root_arc,
3815            key,
3816            data_opt,
3817            lsn,
3818            self.max_entries_per_node,
3819            self.key_comparator.as_ref(),
3820            self.key_prefixing,
3821        )?;
3822
3823        if result && let Some(counter) = &self.memory_counter {
3824            let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3825            counter.fetch_add(delta, Ordering::Relaxed);
3826        }
3827
3828        Ok(result)
3829    }
3830
3831    /// Splits the root node if it is full (needsSplitting).
3832    ///
3833    ///
3834    /// ```text
3835    /// 1. Save oldRoot (the current root IN or BIN).
3836    /// 2. Create newRoot at oldRoot.level + 1.
3837    /// 3. Insert oldRoot into newRoot at slot 0 with a virtual (empty) key.
3838    /// 4. Call split_node on oldRoot, passing newRoot as parent.
3839    /// 5. Replace tree root with newRoot.
3840    /// ```
3841    fn split_root_if_needed(&self, lsn: Lsn) -> Result<(), TreeError> {
3842        // Hold `self.root.write()` across the needs_split check and the
3843        // root promotion, mirroring the first-key path fix and matching
3844        // the broader insert/split serialisation discipline.
3845        //
3846        // With the previous read-then-write pattern, two concurrent
3847        // splitters could each observe needs_split == true, then take()
3848        // and install in turn, with the second wrapping the first's
3849        // already-promoted root in its own new IN. Each level wraps the
3850        // previous, producing a chain of one-child internal nodes. No
3851        // data is lost (every entry is still reachable) but the tree
3852        // becomes unnecessarily deep, and the imbalance can compound
3853        // under heavy concurrent insertion.
3854        let mut root_guard = self.root.write();
3855        let needs_split = match root_guard.as_ref() {
3856            Some(arc) => {
3857                let g = arc.read();
3858                g.get_n_entries() >= self.max_entries_per_node
3859            }
3860            None => false,
3861        };
3862        if !needs_split {
3863            return Ok(());
3864        }
3865
3866        // Create a fresh new root one level above the current root.
3867        let old_root_arc = root_guard.take().expect("checked Some above");
3868        let old_root_level = {
3869            let g = old_root_arc.read();
3870            g.level()
3871        };
3872
3873        // newRoot = new IN(level = oldRoot.level + 1) with slot 0 = oldRoot.
3874        // The key at slot 0 is the virtual key (empty slice) following the
3875        // convention that entry-zero in an upper IN compares as -infinity.
3876        let new_root_arc =
3877            Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3878                node_id: generate_node_id(),
3879                level: old_root_level + 1,
3880                entries: vec![InEntry { key: vec![] }],
3881                // T-4: slot 0's resident child is the old root.
3882                targets: TargetRep::Sparse(vec![(0, old_root_arc.clone())]),
3883                dirty: true,
3884                generation: 0,
3885                parent: None,
3886                lsn_rep: LsnRep::from_lsns(&[lsn]),
3887            })));
3888
3889        // Update the old root's parent pointer to the new root.
3890        {
3891            let mut g = old_root_arc.write();
3892            g.set_parent(Some(Arc::downgrade(&new_root_arc)));
3893        }
3894
3895        // Install the new root before calling split_child so split_child
3896        // (which itself takes parent.write()) can run unencumbered.
3897        *root_guard = Some(new_root_arc.clone());
3898        drop(root_guard);
3899
3900        // Now split the old root (which is now child at slot 0 in new_root).
3901        Self::split_child(
3902            &new_root_arc,
3903            0, // child is at slot 0
3904            self.max_entries_per_node,
3905            lsn,
3906            SplitHint::Normal,
3907            &[], // no insertion key at root-init time
3908            self.key_comparator.as_ref(),
3909            self.key_prefixing,
3910            self.in_list_listener.as_ref(),
3911        )?;
3912
3913        // EVICTOR-RECLAIM-1: register the freshly-promoted root IN with the
3914        // evictor's LRU (JE Tree.splitRoot adds the new root to the INList).
3915        // split_child above already registers the new sibling.
3916        let new_root_id = match &*new_root_arc.read() {
3917            TreeNode::Internal(n) => n.node_id,
3918            TreeNode::Bottom(b) => b.node_id,
3919        };
3920        self.note_added(new_root_id);
3921
3922        self.root_splits.fetch_add(1, Ordering::Relaxed);
3923        Ok(())
3924    }
3925
3926    /// Splits the child at `child_index` in `parent`.
3927    ///
    /// This implementation always keeps the **left** half in the
3929    /// existing child node (`child_arc`) and puts the right half in the new
3930    /// sibling, regardless of where the `identifierKey` falls.  JE's
3931    /// `IN.splitInternal` (`idKeyIndex` logic ~line 4172) can place either
3932    /// half in the existing node; Noxu's preemptive-split discipline ensures
3933    /// the parent always has a free slot at split time (the split is done on
3934    /// the way *down*, before the parent fills up), so the safe simplification
3935    /// of always using the left half is correct here — no routing information
3936    /// is lost.  This comment replaces the previous incorrect claim that
3937    /// `idKeyIndex` drove the choice.
3938    ///
3939    /// Note: does not emit a split log entry; split nodes are marked dirty
3940    /// and flushed at the next checkpoint (flush_dirty_bins/upper_ins).
3941    ///
3942    /// ```text
3943    /// 1. splitIndex = child.nEntries / 2  (or 1 / n-1 for splitSpecial)
3944    /// 2. Create newSibling at the same level.
3945    /// 3. Move entries [splitIndex..nEntries) to newSibling.
3946    /// 4. Update parent slot childIndex -> child (left half),
3947    ///    insert newSibling with newIdKey after childIndex.
3948    /// ```
3949    fn split_child(
3950        parent: &Arc<RwLock<TreeNode>>,
3951        child_index: usize,
3952        max_entries: usize,
3953        lsn: Lsn,
3954        hint: SplitHint,
3955        insert_key: &[u8],
3956        key_comparator: Option<&KeyComparatorFn>,
3957        key_prefixing: bool,
3958        listener: Option<&Arc<dyn InListListener>>,
3959    ) -> Result<(), TreeError> {
3960        // The split is performed under `parent.write()` for the entire
3961        // duration. This is a deliberate choice for correctness:
3962        //
3963        // - Without it, between dropping `child.write()` (after installing
3964        //   the left half) and acquiring `parent.write()` (to install the
3965        //   sibling), a concurrent descender can pick `child_arc` from the
3966        //   parent (still pointing at it), descend, take `child.write()`
3967        //   and insert a key. Whether the descender's key belongs in the
3968        //   left half (now in `child`) or the right half (which will be
3969        //   in the new sibling) is determined by the parent's split key —
3970        //   but the parent doesn't know about the split key yet, so the
3971        //   descender's routing decision is based on stale data. If the
3972        //   descender's key falls in the right half, it lands in `child`
3973        //   (left half) where a future search will not find it: the
3974        //   future search descends from the root, the parent now has the
3975        //   sibling installed, the search routes the key to the sibling,
3976        //   the sibling does not contain the key — silently lost.
3977        //
3978        // - Holding `parent.write()` throughout serialises split_child
3979        //   against every descender that wants `parent.read()`. A
3980        //   descender already holding `parent.read()` (latch coupling
3981        //   from above) keeps split_child waiting at this lock until it
3982        //   has finished its own work. Combined, the split + sibling
3983        //   install is atomic with respect to descents.
3984        //
3985        // - Splits are infrequent compared to inserts (~ once per
3986        //   max_entries new keys) so the extra serialisation here does
3987        //   not dominate.
3988        //
3989        // Reproducer that exercises this race:
3990        // crates/noxu-db/tests/concurrent_commits_stress.rs.
3991        let mut parent_write_guard = parent.write();
3992
3993        // Extract the child Arc from the parent slot.
3994        let child_arc = match &*parent_write_guard {
3995            TreeNode::Internal(p) => {
3996                p.get_child(child_index).ok_or(TreeError::SplitRequired)?
3997            }
3998            TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
3999        };
4000
4001        // Gather all entries from the child plus split metadata, AND
4002        // perform the in-place left-half install, all under a single
4003        // write lock on the child. See the earlier comment on the race
4004        // this avoids inside split_child.
4005        let mut child_guard = child_arc.write();
4006        let child_level = child_guard.level();
4007        // St-H6: capture the splitting BIN's expiration_in_hours flag BEFORE
4008        // drop(child_guard) so the right-half sibling inherits it.
4009        // JE: BIN.java::setExpiration calls setExpirationInHours(hours) to
4010        // propagate the flag on split/clone; the Rust split was hardcoding
4011        // false instead of inheriting — this caused hours-granularity TTL
4012        // entries in the right sibling to be read with in_hours=false, making
4013        // the hours-since-epoch value compare as seconds-since-epoch (far in
4014        // the past) and every right-sibling TTL record appear expired.
4015        let bin_expiration_in_hours: bool = match &*child_guard {
4016            TreeNode::Bottom(b) => b.expiration_in_hours,
4017            // Internal nodes do not carry per-entry TTL; default to true
4018            // (the engine-wide invariant for any BIN that may hold TTL data).
4019            TreeNode::Internal(_) => true,
4020        };
4021        // T-2/T-5: the compact-key threshold the new sibling BIN inherits.
4022        // (Only consumed when the child is a BIN; an upper-IN split produces
4023        // upper-IN siblings, which have no compact key rep.)
4024        let bin_compact_max_key_length: i32 = match &*child_guard {
4025            TreeNode::Bottom(b) => b.compact_max_key_length,
4026            TreeNode::Internal(_) => INKeyRep_DEFAULT_MAX_KEY_LENGTH,
4027        };
4028        let (all_entries, bin_old_prefix) = match &*child_guard {
4029            TreeNode::Internal(n) => {
4030                // T-4: capture the parallel resident-child array alongside the
4031                // entries so children travel with their slots through the
4032                // split (JE `IN.split` copies `entryTargets`).
4033                let children: Vec<Option<ChildArc>> =
4034                    (0..n.entries.len()).map(|i| n.get_child(i)).collect();
4035                // T-3: capture the parallel per-slot LSNs so they travel with
4036                // their slots (JE `IN.split` copies `entryLsnByteArray`).
4037                let lsns: Vec<Lsn> =
4038                    (0..n.entries.len()).map(|i| n.get_lsn(i)).collect();
4039                (
4040                    SplitEntries::Internal(n.entries.clone(), children, lsns),
4041                    Vec::new(),
4042                )
4043            }
4044            TreeNode::Bottom(b) => {
4045                // Decompress to full keys.
4046                let full: Vec<BinEntry> = (0..b.entries.len())
4047                    .map(|i| BinEntry {
4048                        data: b.entries[i].data.clone(),
4049                        known_deleted: b.entries[i].known_deleted,
4050                        dirty: b.entries[i].dirty,
4051                        expiration_time: b.entries[i].expiration_time,
4052                    })
4053                    .collect();
4054                let lsns: Vec<Lsn> =
4055                    (0..b.entries.len()).map(|i| b.get_lsn(i)).collect();
4056                // T-2: carry FULL keys through the split; the new BINs
4057                // recompute their own prefix from them.
4058                let full_keys: Vec<Vec<u8>> = (0..b.entries.len())
4059                    .map(|i| b.get_full_key(i).unwrap_or_default())
4060                    .collect();
4061                (
4062                    SplitEntries::Bottom(full, lsns, full_keys),
4063                    b.key_prefix.clone(),
4064                )
4065            }
4066        };
4067
4068        // Determine split point — JE `IN.splitSpecial` / `IN.splitInternal`.
4069        //
4070        // Normal midpoint: `n_entries / 2`.
4071        // AllLeft:  insertion key is at position 0 on every descend level.
        //   → split_index = 1 (the existing node keeps only the first entry;
        //     the new right sibling receives the remaining n-1 entries).
4074        //   This matches JE: `if (leftSide && index == 0) splitInternal(…, 1)`.
4075        // AllRight: insertion key is at the last position on every level.
4076        //   → split_index = n_entries - 1 (left half keeps all but one entry).
4077        //   JE: `else if (!leftSide && index == nEntries-1) splitInternal(…, nEntries-1)`.
4078        //
4079        // Ref: `IN.java` splitSpecial ~line 4129, splitInternal ~line 4159.
4080        let n_entries = all_entries.len();
4081        let split_index = if n_entries >= 2 {
4082            // Find where insert_key falls in the child.
4083            let insert_idx = {
4084                let mut idx = 0usize;
4085                for i in 1..n_entries {
4086                    let ord = match key_comparator {
4087                        Some(cmp) => cmp(all_entries.get_key(i), insert_key),
4088                        None => all_entries.get_key(i).cmp(insert_key),
4089                    };
4090                    if ord != std::cmp::Ordering::Greater {
4091                        idx = i;
4092                    } else {
4093                        break;
4094                    }
4095                }
4096                idx
4097            };
4098            match hint {
4099                SplitHint::AllLeft if insert_idx == 0 => 1,
4100                SplitHint::AllRight if insert_idx == n_entries - 1 => {
4101                    n_entries - 1
4102                }
4103                _ => n_entries / 2,
4104            }
4105        } else {
4106            n_entries / 2
4107        };
4108
4109        // newIdKey — the full key of the first entry of the right half.
4110        // For BIN: entries are already full keys after decompression above.
4111        // For IN:  entries carry full keys directly.
4112        let new_id_key = all_entries.get_key(split_index).to_vec();
4113        // Suppress unused-variable warning when no BIN is involved.
4114        let _ = &bin_old_prefix;
4115
4116        // Divide into left and right halves.
4117        let left_entries = all_entries.slice(0, split_index);
4118        let right_entries = all_entries.slice(split_index, n_entries);
4119
4120        // Install the left half into `child_arc` (still under the same
4121        // write lock) and mark the node dirty.
4122        match (&mut *child_guard, &left_entries) {
4123            (TreeNode::Internal(n), SplitEntries::Internal(le, lc, ll)) => {
4124                n.entries = le.clone();
4125                // T-4: reinstall the (now-shorter) left child array.
4126                n.targets = TargetRep::None;
4127                for (i, c) in lc.iter().enumerate() {
4128                    if let Some(child) = c {
4129                        n.set_child(i, Some(child.clone()));
4130                    }
4131                }
4132                // T-3: reinstall the (now-shorter) left LSN array.
4133                n.lsn_rep = LsnRep::from_lsns(ll);
4134            }
4135            (TreeNode::Bottom(b), SplitEntries::Bottom(le, ll, lk)) => {
4136                // Reset prefix; keys arrive as FULL keys (no prefix yet).
4137                b.key_prefix = Vec::new();
4138                // Pre-allocate at max_entries capacity so the left half
4139                // does not need to reallocate on the next insert (Fix 3).
4140                let mut left = Vec::with_capacity(max_entries);
4141                left.extend_from_slice(le);
4142                b.entries = left;
4143                // T-3: reinstall the left LSN array.
4144                b.lsn_rep = LsnRep::from_lsns(ll);
4145                // T-2: reinstall the left key rep from the full keys (Default;
4146                // recompute_key_prefix below compresses + compacts).
4147                b.keys = KeyRep::from_keys(lk.clone());
4148                // Recompute prefix on each half after split (only when
4149                // key_prefixing is enabled for this database).
4150                // JE: IN.computeKeyPrefix returns null when
4151                // databaseImpl.getKeyPrefixing() is false.
4152                // Ref: IN.java computeKeyPrefix ~line 2456.
4153                if key_prefixing && b.entries.len() >= 2 {
4154                    b.recompute_key_prefix();
4155                } else {
4156                    b.keys.compact(b.compact_max_key_length); // T-2
4157                }
4158            }
4159            _ => return Err(TreeError::SplitRequired),
4160        }
4161        child_guard.set_dirty(true);
4162        drop(child_guard);
4163
4164        // Create the new right-half sibling.
4165        // Parent pointer will be wired in when it is inserted into the parent.
4166        let new_sibling = match right_entries {
4167            SplitEntries::Internal(re, rc, rl) => {
4168                let mut rin = InNodeStub {
4169                    node_id: generate_node_id(),
4170                    level: child_level,
4171                    entries: re,
4172                    targets: TargetRep::None,
4173                    dirty: true,
4174                    generation: 0,
4175                    parent: None, // set below
4176                    // T-3: the right half's per-slot LSNs.
4177                    lsn_rep: LsnRep::from_lsns(&rl),
4178                };
4179                // T-4: install the right half's resident children.
4180                for (i, c) in rc.into_iter().enumerate() {
4181                    if c.is_some() {
4182                        rin.set_child(i, c);
4183                    }
4184                }
4185                Arc::new(RwLock::new(TreeNode::Internal(rin)))
4186            }
4187            SplitEntries::Bottom(re, rl, rk) => {
4188                // Entries arrive as FULL keys; build BinStub with no prefix
4189                // then recompute key prefix for the new sibling.
4190                // Pre-allocate at max_entries capacity so the right half
4191                // does not need to reallocate on the next insert (Fix 3).
4192                let mut right = Vec::with_capacity(max_entries);
4193                right.extend(re);
4194                let mut sibling_bin = BinStub {
4195                    node_id: generate_node_id(),
4196                    level: child_level,
4197                    entries: right,
4198                    key_prefix: Vec::new(),
4199                    dirty: true,
4200                    is_delta: false,
4201                    last_full_lsn: NULL_LSN,
4202                    last_delta_lsn: NULL_LSN,
4203                    generation: 0,
4204                    parent: None, // set below
4205                    // St-H6 fix: inherit the splitting BIN's flag so that
4206                    // is_expired() uses the correct granularity for entries
4207                    // that were already in the BIN before the split.
4208                    // JE reference: BIN.java::split() propagates
4209                    // expirationInHours via setExpirationInHours(hours).
4210                    expiration_in_hours: bin_expiration_in_hours,
4211                    cursor_count: 0,
4212                    prohibit_next_delta: false,
4213                    // T-3: the right half's per-slot LSNs.
4214                    lsn_rep: LsnRep::from_lsns(&rl),
4215                    // T-2: full keys (Default); recompute/compact below.
4216                    keys: KeyRep::from_keys(rk),
4217                    compact_max_key_length: bin_compact_max_key_length,
4218                };
4219                // St-H6 debug guard: the sibling must carry the same flag as
4220                // the splitting BIN so that in_hours-resolution entries are
4221                // never silently expired by a mismatched false flag.
4222                debug_assert_eq!(
4223                    sibling_bin.expiration_in_hours, bin_expiration_in_hours,
4224                    "St-H6 invariant: sibling BIN expiration_in_hours must \
4225                     match the splitting BIN (got {}, expected {})",
4226                    sibling_bin.expiration_in_hours, bin_expiration_in_hours
4227                );
4228
4229                if key_prefixing && sibling_bin.entries.len() >= 2 {
4230                    sibling_bin.recompute_key_prefix();
4231                } else {
4232                    sibling_bin.keys.compact(bin_compact_max_key_length); // T-2
4233                }
4234                Arc::new(RwLock::new(TreeNode::Bottom(sibling_bin)))
4235            }
4236        };
4237
4238        // Note: the child (left half) was marked dirty earlier under the
4239        // same write lock that installed left_entries; no need to re-take
4240        // the write lock here.
4241
4242        // Insert the new sibling into the parent after child_index.
4243        // We already hold `parent.write()` (taken at the top of the
4244        // function); operate on it directly rather than re-acquiring.
4245        match &mut *parent_write_guard {
4246            TreeNode::Internal(p) => {
4247                let insert_pos = child_index + 1;
4248                // T-4: insert the parent slot and set its cached child via the
4249                // node-level INTargetRep (shifting existing children).
4250                p.insert_entry(
4251                    insert_pos,
4252                    new_id_key,
4253                    lsn,
4254                    Some(new_sibling.clone()),
4255                );
4256                // Parent is dirty because it gained a new entry.
4257                p.dirty = true;
4258            }
4259            TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
4260        }
4261
4262        // Wire the new sibling's parent pointer to the parent node
4263        // before releasing parent_write_guard, so a future descent that
4264        // takes parent.read() and finds the sibling immediately sees a
4265        // fully-wired parent pointer.
4266        {
4267            let mut g = new_sibling.write();
4268            g.set_parent(Some(Arc::downgrade(parent)));
4269        }
4270        // T-4: when an upper IN split, the children that moved into the new
4271        // sibling must have their parent back-pointers re-wired to the
4272        // sibling (JE re-parents moved targets in IN.split).
4273        {
4274            let sg = new_sibling.read();
4275            if let TreeNode::Internal(sn) = &*sg {
4276                let moved = sn.resident_children();
4277                drop(sg);
4278                for child in moved {
4279                    let mut cg = child.write();
4280                    cg.set_parent(Some(Arc::downgrade(&new_sibling)));
4281                }
4282            }
4283        }
4284        drop(parent_write_guard);
4285
4286        // EVICTOR-RECLAIM-1: register the freshly-split sibling with the
4287        // evictor's LRU (JE IN.splitInternal calls inList.add(newSibling)).
4288        // Without this, split-created BINs/INs are invisible to the evictor:
4289        // the policy lists never receive them, every evict_batch phase quota
4290        // is 0, and eviction reclaims nothing under pressure even though the
4291        // nodes are fully resident.  Only the very first root+BIN (the
4292        // first-key path) and re-fetched nodes were ever registered.
4293        if let Some(l) = listener {
4294            let sibling_id = match &*new_sibling.read() {
4295                TreeNode::Internal(n) => n.node_id,
4296                TreeNode::Bottom(b) => b.node_id,
4297            };
4298            l.note_ins_added(sibling_id);
4299        }
4300
4301        Ok(())
4302    }
4303
4304    /// Recursive insert with preemptive splitting.
4305    ///
4306    /// Top-down traversal in `Tree.forceSplit` +
4307    /// `Tree.searchSplitsAllowed`:
4308    ///
4309    /// 1. At an upper IN: find which child slot covers `key`, split the child
4310    ///    proactively if it is full (so we always have room to insert the split
4311    ///    key into the parent), then recurse into the appropriate child.
4312    /// 2. At a BIN: insert the key/data directly.
4313    ///
4314    /// This implements the "preemptive splitting" strategy from the: we split
4315    /// children on the way down so we never need to walk back up.
4316    fn insert_recursive(
4317        node_arc: &Arc<RwLock<TreeNode>>,
4318        key: Vec<u8>,
4319        data: Vec<u8>,
4320        lsn: Lsn,
4321        max_entries: usize,
4322        key_comparator: Option<&KeyComparatorFn>,
4323        key_prefixing: bool,
4324        listener: Option<&Arc<dyn InListListener>>,
4325    ) -> Result<bool, TreeError> {
4326        Self::insert_recursive_inner(
4327            node_arc,
4328            key,
4329            data,
4330            lsn,
4331            max_entries,
4332            key_comparator,
4333            key_prefixing,
4334            true, // all_left_so_far
4335            true, // all_right_so_far
4336            listener,
4337        )
4338    }
4339
4340    /// Inner recursive helper that threads `allLeftSideDescent` /
4341    /// `allRightSideDescent` from `Tree.forceSplit` (JE ~line 1912).
4342    ///
4343    /// Both flags start `true` at the root and are cleared as soon as the
4344    /// descent takes a non-leftmost / non-rightmost child slot.  At split
4345    /// time they are forwarded to `split_child` which uses them to pick the
4346    /// `splitSpecial` split index (JE `IN.splitSpecial` ~line 4129).
4347    #[allow(clippy::too_many_arguments)]
4348    fn insert_recursive_inner(
4349        node_arc: &Arc<RwLock<TreeNode>>,
4350        key: Vec<u8>,
4351        data: Vec<u8>,
4352        lsn: Lsn,
4353        max_entries: usize,
4354        key_comparator: Option<&KeyComparatorFn>,
4355        key_prefixing: bool,
4356        all_left_so_far: bool,
4357        all_right_so_far: bool,
4358        listener: Option<&Arc<dyn InListListener>>,
4359    ) -> Result<bool, TreeError> {
4360        // Determine if this is a BIN (leaf level).
4361        //
4362        // We hold a read lock on `node_arc` (the parent of any descent we
4363        // do below) for the duration of this call, releasing it just
4364        // before returning. That achieves *latch coupling*: a concurrent
4365        // `split_child(parent, …)` that wants to reorganise our subtree
4366        // ultimately needs `parent.write()` to install the new sibling,
4367        // and that write blocks until our read lock is dropped. Without
4368        // this, the descender-vs-splitter race goes:
4369        //
4370        //   T_X: at root, picks child_arc (BIN), drops root read lock.
4371        //   T_Y: at root, runs split_child(root, …): takes child_arc.write(),
4372        //        installs left half [E1..E5], creates sibling [E6..E10],
4373        //        takes root.write() and inserts the sibling.
4374        //   T_X: now takes child_arc.write() and inserts a key whose
4375        //        sort order falls in the right half. The key lands in
4376        //        child_arc (left half) but a future search descending
4377        //        from the root routes that key to the new sibling and
4378        //        does not find it — silently lost.
4379        //
4380        // Reproducer: noxu-db/tests/concurrent_commits_stress.rs
4381        // (32 threads × 100 keys, ~1–6 lost writes per run before this fix;
4382        // occasionally hundreds when an entire BIN is orphaned).
4383        let parent_guard = node_arc.read();
4384        let is_bin = parent_guard.is_bin();
4385
4386        if is_bin {
4387            // BIN: drop the read lock and take the write lock; this is
4388            // safe because the *outer* call frame still holds a read
4389            // lock on this BIN's parent (or this is the root, in which
4390            // case the first-key path has already initialised it). A
4391            // concurrent split_child(parent, …) cannot run while the
4392            // outer parent.read() is held, so the BIN cannot be
4393            // restructured between dropping our read lock and acquiring
4394            // our write lock.
4395            drop(parent_guard);
4396            let mut guard = node_arc.write();
4397            match &mut *guard {
4398                TreeNode::Bottom(bin) => {
4399                    let is_new = if let Some(cmp) = key_comparator {
4400                        // Comparator-based insert: no prefix compression.
4401                        let (_idx, new) =
4402                            bin.insert_cmp(key, lsn, Some(data), cmp.as_ref());
4403                        new
4404                    } else if key_prefixing {
4405                        // insert_with_prefix handles prefix recomputation when
4406                        // the new key shrinks the existing prefix, and also
4407                        // initialises the prefix when 2 entries are present for
4408                        // the first time.
4409                        let (_idx, new) =
4410                            bin.insert_with_prefix(key, lsn, Some(data));
4411                        new
4412                    } else {
4413                        // key_prefixing disabled: store full key, no prefix.
4414                        // JE: IN.computeKeyPrefix returns null when
4415                        // databaseImpl.getKeyPrefixing() is false.
4416                        // Ref: IN.java computeKeyPrefix ~line 2456.
4417                        let (_idx, new) = bin.insert_raw(key, lsn, Some(data));
4418                        new
4419                    };
4420                    // Mark dirty after any modification.
4421                    bin.dirty = true;
4422                    Ok(is_new)
4423                }
4424                TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4425            }
4426        } else {
4427            // Upper IN: find the child slot that covers key.
4428            // Index = parent.findEntry(key, false, false)
4429            // Entry zero in an upper IN has a virtual key (-infinity), so
4430            // any real key is routed to at least slot 0.
4431            let (child_index, n_entries_at_level, child_arc) =
4432                match &*parent_guard {
4433                    TreeNode::Internal(n) => {
4434                        // Binary search for the largest key <= search key.
4435                        // Slot 0 always matches (virtual key = -infinity).
4436                        let mut idx = 0usize;
4437                        for (i, entry) in n.entries.iter().enumerate() {
4438                            if i == 0 {
4439                                idx = 0;
4440                            } else {
4441                                let ord = match key_comparator {
4442                                    Some(cmp) => cmp(
4443                                        entry.key.as_slice(),
4444                                        key.as_slice(),
4445                                    ),
4446                                    None => {
4447                                        entry.key.as_slice().cmp(key.as_slice())
4448                                    }
4449                                };
4450                                if ord != std::cmp::Ordering::Greater {
4451                                    idx = i;
4452                                } else {
4453                                    break;
4454                                }
4455                            }
4456                        }
4457                        let child =
4458                            n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4459                        (idx, n.entries.len(), child)
4460                    }
4461                    TreeNode::Bottom(_) => {
4462                        return Err(TreeError::SplitRequired);
4463                    }
4464                };
4465
4466            // Update the descent-side flags (JE `Tree.forceSplit` ~1959).
4467            // `allLeftSideDescent`  ← still true only if we chose slot 0.
4468            // `allRightSideDescent` ← still true only if we chose the last slot.
4469            let all_left = all_left_so_far && child_index == 0;
4470            let all_right = all_right_so_far
4471                && child_index == n_entries_at_level.saturating_sub(1);
4472
4473            // Proactively split the child if it is full.
4474            // If (child.needsSplitting()) child.split(parent, ...)
4475            let child_full = {
4476                let g = child_arc.read();
4477                g.get_n_entries() >= max_entries
4478            };
4479
4480            if child_full {
4481                // Build the splitSpecial hint from the accumulated flags.
4482                // JE `Tree.forceSplit` ~line 2010:
4483                //   if (allLeftSideDescent || allRightSideDescent)
4484                //       child.splitSpecial(parent, index, grandParent,
4485                //           maxTreeEntriesPerNode, key, allLeftSideDescent)
4486                let hint = match (all_left, all_right) {
4487                    (true, _) => SplitHint::AllLeft,
4488                    (_, true) => SplitHint::AllRight,
4489                    _ => SplitHint::Normal,
4490                };
4491                // split_child(parent, …) needs parent.write(); we must
4492                // drop our parent read lock before calling it.
4493                drop(parent_guard);
4494                Self::split_child(
4495                    node_arc,
4496                    child_index,
4497                    max_entries,
4498                    lsn,
4499                    hint,
4500                    &key,
4501                    key_comparator,
4502                    key_prefixing,
4503                    listener,
4504                )?;
4505
4506                // After the split, re-find which child now covers key.
4507                // Re-enter at the top of the inner function; carry the
4508                // flags (the new topology doesn't invalidate them — we
4509                // still know the overall descent direction).
4510                return Self::insert_recursive_inner(
4511                    node_arc,
4512                    key,
4513                    data,
4514                    lsn,
4515                    max_entries,
4516                    key_comparator,
4517                    key_prefixing,
4518                    all_left_so_far,
4519                    all_right_so_far,
4520                    listener,
4521                );
4522            }
4523
4524            // Descend into the child while still holding parent_guard.
4525            // The recursive call will hold child.read() before this
4526            // returns, then drop it; combined with our parent_guard,
4527            // the latch coupling chain is preserved on the way down and
4528            // unwound on the way back up.
4529            let r = Self::insert_recursive_inner(
4530                &child_arc,
4531                key,
4532                data,
4533                lsn,
4534                max_entries,
4535                key_comparator,
4536                key_prefixing,
4537                all_left,
4538                all_right,
4539                listener,
4540            );
4541            drop(parent_guard);
4542            r
4543        }
4544    }
4545
4546    /// Slice-based variant of [`Tree::insert_recursive`] for the recovery redo path.
4547    ///
4548    /// Accepts `key: &[u8]` and `data: Option<&[u8]>` instead of owned
4549    /// `Vec<u8>` values.  At the BIN leaf, calls
4550    /// [`BinStub::insert_with_prefix_slice`] which copies bytes into the
4551    /// `BinEntry` exactly once.
4552    ///
4553    /// For the comparator path (custom key comparator), falls back to
4554    /// `insert_cmp` with a one-time `to_vec()` conversion — that path is
4555    /// rare in practice (sorted-dup databases only) and is not on the
4556    /// W11 hot path.
4557    ///
4558    /// Wave 11-K optimisation (Fix 1).
4559    fn redo_insert_recursive(
4560        node_arc: &Arc<RwLock<TreeNode>>,
4561        key: &[u8],
4562        data: Option<&[u8]>,
4563        lsn: Lsn,
4564        max_entries: usize,
4565        key_comparator: Option<&KeyComparatorFn>,
4566        key_prefixing: bool,
4567    ) -> Result<bool, TreeError> {
4568        Self::redo_insert_recursive_inner(
4569            node_arc,
4570            key,
4571            data,
4572            lsn,
4573            max_entries,
4574            key_comparator,
4575            key_prefixing,
4576            true,
4577            true,
4578        )
4579    }
4580
4581    #[allow(clippy::too_many_arguments)]
4582    fn redo_insert_recursive_inner(
4583        node_arc: &Arc<RwLock<TreeNode>>,
4584        key: &[u8],
4585        data: Option<&[u8]>,
4586        lsn: Lsn,
4587        max_entries: usize,
4588        key_comparator: Option<&KeyComparatorFn>,
4589        key_prefixing: bool,
4590        all_left_so_far: bool,
4591        all_right_so_far: bool,
4592    ) -> Result<bool, TreeError> {
4593        let parent_guard = node_arc.read();
4594        let is_bin = parent_guard.is_bin();
4595
4596        if is_bin {
4597            drop(parent_guard);
4598            let mut guard = node_arc.write();
4599            match &mut *guard {
4600                TreeNode::Bottom(bin) => {
4601                    // REC-F2: JE redo currency check
4602                    // (RecoveryManager.redo() line ~2512/2544).  A logged LN
4603                    // is applied only when logrecLsn > treeLsn.  If the slot
4604                    // already holds an equal-or-newer LSN, skip the overwrite
4605                    // so an out-of-order (older-LSN) redo cannot revert
4606                    // committed data or reset the slot LSN backward.  This
4607                    // makes redo genuinely idempotent regardless of
4608                    // redo/undo phase order.  Deletes never reach this path
4609                    // (redo_ln routes Delete through tree.delete), so the JE
4610                    // "lsnCmp == 0 && isDeletion -> set KD" sub-case does not
4611                    // apply here.
4612                    let cmp_ref = key_comparator.map(|c| {
4613                        c.as_ref()
4614                            as &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering
4615                    });
4616                    if let Some(slot_lsn) =
4617                        bin.redo_slot_lsn(key, cmp_ref, key_prefixing)
4618                        && lsn <= slot_lsn
4619                    {
4620                        // Tree already holds an equal-or-newer version.
4621                        return Ok(false);
4622                    }
4623                    let is_new = if let Some(cmp) = key_comparator {
4624                        // Comparator path: fall back to owned-Vec variant.
4625                        let (_idx, new) = bin.insert_cmp(
4626                            key.to_vec(),
4627                            lsn,
4628                            data.map(|d| d.to_vec()),
4629                            cmp.as_ref(),
4630                        );
4631                        new
4632                    } else if key_prefixing {
4633                        let (_idx, new) =
4634                            bin.insert_with_prefix_slice(key, lsn, data);
4635                        new
4636                    } else {
4637                        // key_prefixing disabled: store full key verbatim.
4638                        // Ref: IN.java computeKeyPrefix ~line 2456.
4639                        let (_idx, new) = bin.insert_raw(
4640                            key.to_vec(),
4641                            lsn,
4642                            data.map(|d| d.to_vec()),
4643                        );
4644                        new
4645                    };
4646                    bin.dirty = true;
4647                    Ok(is_new)
4648                }
4649                TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4650            }
4651        } else {
4652            let (child_index, n_entries_at_level, child_arc) =
4653                match &*parent_guard {
4654                    TreeNode::Internal(n) => {
4655                        let mut idx = 0usize;
4656                        for (i, entry) in n.entries.iter().enumerate() {
4657                            if i == 0 {
4658                                idx = 0;
4659                            } else {
4660                                let ord = match key_comparator {
4661                                    Some(cmp) => cmp(entry.key.as_slice(), key),
4662                                    None => entry.key.as_slice().cmp(key),
4663                                };
4664                                if ord != std::cmp::Ordering::Greater {
4665                                    idx = i;
4666                                } else {
4667                                    break;
4668                                }
4669                            }
4670                        }
4671                        let child =
4672                            n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4673                        (idx, n.entries.len(), child)
4674                    }
4675                    TreeNode::Bottom(_) => {
4676                        return Err(TreeError::SplitRequired);
4677                    }
4678                };
4679
4680            let all_left = all_left_so_far && child_index == 0;
4681            let all_right = all_right_so_far
4682                && child_index == n_entries_at_level.saturating_sub(1);
4683
4684            let child_full = {
4685                let g = child_arc.read();
4686                g.get_n_entries() >= max_entries
4687            };
4688
4689            if child_full {
4690                let hint = match (all_left, all_right) {
4691                    (true, _) => SplitHint::AllLeft,
4692                    (_, true) => SplitHint::AllRight,
4693                    _ => SplitHint::Normal,
4694                };
4695                drop(parent_guard);
4696                Self::split_child(
4697                    node_arc,
4698                    child_index,
4699                    max_entries,
4700                    lsn,
4701                    hint,
4702                    key,
4703                    key_comparator,
4704                    key_prefixing,
4705                    // Recovery redo path: the listener is not active during
4706                    // log replay (the evictor is wired AFTER recovery, and
4707                    // the INList is rebuilt separately).  EVICTOR-RECLAIM-1
4708                    // registration happens on the live insert path.
4709                    None,
4710                )?;
4711                return Self::redo_insert_recursive_inner(
4712                    node_arc,
4713                    key,
4714                    data,
4715                    lsn,
4716                    max_entries,
4717                    key_comparator,
4718                    key_prefixing,
4719                    all_left_so_far,
4720                    all_right_so_far,
4721                );
4722            }
4723
4724            let r = Self::redo_insert_recursive_inner(
4725                &child_arc,
4726                key,
4727                data,
4728                lsn,
4729                max_entries,
4730                key_comparator,
4731                key_prefixing,
4732                all_left,
4733                all_right,
4734            );
4735            drop(parent_guard);
4736            r
4737        }
4738    }
4739
4740    /// Pre-warm the tree's internal `Vec<BinEntry>` capacity before a redo
4741    /// pass that will insert approximately `n` records.
4742    ///
4743    /// If the tree is empty, this is a no-op (there is no BIN yet to reserve
4744    /// capacity on).  If the tree already has a root BIN (from a previous
4745    /// checkpoint), reserves `n.min(max_entries_per_node)` additional slots
4746    /// in that BIN's entries vector, eliminating the resize-double cycle
4747    /// during the redo loop.
4748    ///
4749    /// Wave 11-K optimisation (Fix 3).
4750    pub fn reserve_redo_capacity(&self, n: usize) {
4751        if n == 0 {
4752            return;
4753        }
4754        let root = match self.get_root() {
4755            Some(r) => r,
4756            None => return,
4757        };
4758        // Descend to the leftmost BIN and reserve there.
4759        let mut arc = root;
4760        loop {
4761            let guard = arc.read();
4762            match &*guard {
4763                TreeNode::Bottom(bin_guard) => {
4764                    let additional = n
4765                        .min(self.max_entries_per_node)
4766                        .saturating_sub(bin_guard.entries.len());
4767                    drop(guard);
4768                    let mut wguard = arc.write();
4769                    if let TreeNode::Bottom(bin) = &mut *wguard {
4770                        bin.entries.reserve(additional);
4771                    }
4772                    return;
4773                }
4774                TreeNode::Internal(inner) => {
4775                    let child = inner.get_child(0);
4776                    drop(guard);
4777                    match child {
4778                        Some(c) => arc = c,
4779                        None => return,
4780                    }
4781                }
4782            }
4783        }
4784    }
4785
4786    /// Get the first (leftmost) BIN in the tree.
4787    ///
4788    /// Descends to the leftmost BIN by
4789    /// always following the first child slot at each upper IN level.
4790    pub fn get_first_node(&self) -> Option<SearchResult> {
4791        let mut guard: parking_lot::ArcRwLockReadGuard<
4792            parking_lot::RawRwLock,
4793            TreeNode,
4794        > = self.get_root()?.read_arc();
4795
4796        loop {
4797            if guard.is_bin() {
4798                let n = guard.get_n_entries();
4799                if n == 0 {
4800                    return None;
4801                }
4802                // TREE-F1: return the first LIVE slot, skipping known_deleted
4803                // slots (CursorImpl.java:2062-2064).  If the leftmost BIN is
4804                // entirely KD during the reconstitution window the cursor's
4805                // get_first falls through to its cross-BIN advance.
4806                if let TreeNode::Bottom(b) = &*guard {
4807                    match (0..b.entries.len()).find(|&i| b.slot_is_live(i)) {
4808                        Some(i) => {
4809                            return Some(SearchResult::with_values(
4810                                true, i as i32, false,
4811                            ));
4812                        }
4813                        None => return None,
4814                    }
4815                }
4816                return Some(SearchResult::with_values(true, 0, false));
4817            }
4818
4819            // Capture the leftmost child Arc while holding `guard`, then
4820            // hand-over-hand: take the child read lock before releasing
4821            // the parent's. Same race fix as `Tree::search`.
4822            let next_arc = match &*guard {
4823                TreeNode::Internal(n_node) => n_node.get_child(0)?,
4824                _ => return None,
4825            };
4826            let next_guard = next_arc.read_arc();
4827            drop(guard);
4828            guard = next_guard;
4829        }
4830    }
4831
4832    /// Get the last (rightmost) BIN in the tree.
4833    ///
4834    /// Descends to the rightmost BIN by
4835    /// always following the last child slot at each upper IN level.
4836    pub fn get_last_node(&self) -> Option<SearchResult> {
4837        let mut guard: parking_lot::ArcRwLockReadGuard<
4838            parking_lot::RawRwLock,
4839            TreeNode,
4840        > = self.get_root()?.read_arc();
4841
4842        loop {
4843            if guard.is_bin() {
4844                let n = guard.get_n_entries();
4845                if n == 0 {
4846                    return None;
4847                }
4848                // TREE-F1: return the last LIVE slot, skipping known_deleted
4849                // slots (CursorImpl.java:2062-2064).
4850                if let TreeNode::Bottom(b) = &*guard {
4851                    match (0..b.entries.len())
4852                        .rev()
4853                        .find(|&i| b.slot_is_live(i))
4854                    {
4855                        Some(i) => {
4856                            return Some(SearchResult::with_values(
4857                                true, i as i32, false,
4858                            ));
4859                        }
4860                        None => return None,
4861                    }
4862                }
4863                return Some(SearchResult::with_values(
4864                    true,
4865                    (n - 1) as i32,
4866                    false,
4867                ));
4868            }
4869
4870            // Capture the rightmost child Arc while holding `guard`, then
4871            // hand-over-hand: take the child read lock before releasing
4872            // the parent's. Same race fix as `Tree::search`.
4873            let next_arc = match &*guard {
4874                TreeNode::Internal(n_node) => {
4875                    n_node.get_child(n_node.entries.len().saturating_sub(1))?
4876                }
4877                _ => return None,
4878            };
4879            let next_guard = next_arc.read_arc();
4880            drop(guard);
4881            guard = next_guard;
4882        }
4883    }
4884
    /// Returns the number of root splits that have occurred.
    ///
    /// Reads the `root_splits` counter with `Ordering::Relaxed`, so the
    /// value is a point-in-time snapshot that is not ordered against other
    /// memory operations.
    pub fn get_root_splits(&self) -> u64 {
        self.root_splits.load(Ordering::Relaxed)
    }
4889
    /// Returns the number of relatches required.
    ///
    /// Reads the `relatches_required` counter with `Ordering::Relaxed`, so
    /// the value is a point-in-time snapshot that is not ordered against
    /// other memory operations.
    pub fn get_relatches_required(&self) -> u64 {
        self.relatches_required.load(Ordering::Relaxed)
    }
4894
4895    /// Delete a key from the tree.
4896    ///
4897    /// Traverses the tree to find the BIN that should contain the key, then
4898    /// removes the entry. Returns true if the key was found and removed.
4899    ///
4900    /// Delete path in `Tree` from the.
4901    ///
4902    /// In-memory removal only — WAL logging for deletes is handled by the
4903    /// cursor layer (`cursor_impl.rs::log_ln_write`) before this is called,
4904    /// matching separation between LN logging and tree mutation.
4905    pub fn delete(&self, key: &[u8]) -> bool {
4906        let root = match self.get_root() {
4907            Some(r) => r,
4908            None => return false,
4909        };
4910
4911        // F8 consistency: insert accounts key + data + BIN_ENTRY_OVERHEAD; delete must
4912        // subtract the SAME (data_len was previously omitted, leaking
4913        // data_len from the cache counter on every delete and biasing the
4914        // evictor's over-budget view). Peek the data length before deleting.
4915        let data_len = if self.memory_counter.is_some() {
4916            self.search_with_data(key)
4917                .filter(|sf| sf.found)
4918                .and_then(|sf| sf.data.as_ref().map(|d| d.len()))
4919                .unwrap_or(0)
4920        } else {
4921            0
4922        };
4923
4924        let deleted =
4925            Self::delete_recursive(&root, key, self.key_comparator.as_ref());
4926
4927        // Update the memory counter when an entry is removed.
4928        // IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage(-delta).
4929        if deleted && let Some(counter) = &self.memory_counter {
4930            let delta = (key.len() + data_len + BIN_ENTRY_OVERHEAD) as i64;
4931            counter.fetch_sub(delta, Ordering::Relaxed);
4932        }
4933
4934        deleted
4935    }
4936
4937    /// Recursive helper for `delete`: descend to the BIN that holds `key`
4938    /// and remove it.
4939    fn delete_recursive(
4940        node_arc: &Arc<RwLock<TreeNode>>,
4941        key: &[u8],
4942        key_comparator: Option<&KeyComparatorFn>,
4943    ) -> bool {
4944        // Latch coupling, mirroring `insert_recursive`. Without this,
4945        // delete has the same "BIN split out from under us" race: thread
4946        // A finds child_arc as the target BIN under parent.read(), drops
4947        // the lock, and another thread runs split_child(parent, …) that
4948        // moves the target key into the new sibling. A then takes
4949        // child_arc.write(), looks for the key in the (now left-half)
4950        // BIN, doesn't find it, and returns `false`. The caller treats
4951        // the `false` as "key was not present", but the key is actually
4952        // still in the tree (in the sibling). Subsequent operations
4953        // observe a stale record that should have been deleted —
4954        // semantically a lost delete.
4955        let parent_guard = node_arc.read();
4956        let is_bin = parent_guard.is_bin();
4957        let child_arc = if !is_bin {
4958            match &*parent_guard {
4959                TreeNode::Internal(n) => {
4960                    // Find child slot with largest key <= search key
4961                    let mut idx = 0usize;
4962                    for (i, entry) in n.entries.iter().enumerate() {
4963                        if i == 0 {
4964                            idx = 0;
4965                        } else {
4966                            let ord = match key_comparator {
4967                                Some(cmp) => cmp(entry.key.as_slice(), key),
4968                                None => entry.key.as_slice().cmp(key),
4969                            };
4970                            if ord != std::cmp::Ordering::Greater {
4971                                idx = i;
4972                            } else {
4973                                break;
4974                            }
4975                        }
4976                    }
4977                    n.get_child(idx)
4978                }
4979                _ => None,
4980            }
4981        } else {
4982            None
4983        };
4984
4985        if is_bin {
4986            // Drop the read lock before taking the write lock; the outer
4987            // call frame still holds the parent read lock so a concurrent
4988            // split_child cannot run on this BIN's parent until we unwind.
4989            drop(parent_guard);
4990            let mut g = node_arc.write();
4991            match &mut *g {
4992                TreeNode::Bottom(bin) => {
4993                    if let Some(cmp) = key_comparator {
4994                        bin.delete_cmp(key, cmp.as_ref())
4995                    } else {
4996                        // Entries store compressed (suffix) keys when key_prefix
4997                        // is non-empty.  Compress the search key before comparing.
4998                        //
4999                        // The caller is not required to ensure that `key`
5000                        // shares this BIN's learned `key_prefix` — a stray
5001                        // delete of a key that was never present (or that
5002                        // sits under a different prefix) is legal and must
5003                        // simply return `false`.  Calling `compress_key`
5004                        // unconditionally would `debug_assert!`-panic on
5005                        // such inputs, so guard it the same way the cursor
5006                        // path does.
5007                        if !bin.key_prefix.is_empty()
5008                            && !key.starts_with(bin.key_prefix.as_slice())
5009                        {
5010                            return false;
5011                        }
5012                        let suffix = bin.compress_key(key);
5013                        match bin.key_binary_search(suffix.as_slice()) {
5014                            Ok(idx) => {
5015                                bin.entries.remove(idx);
5016                                bin.keys.remove(idx); // T-2
5017                                bin.lsn_rep.remove_shift(idx); // T-3
5018                                // Mark dirty after any modification.
5019                                bin.dirty = true;
5020                                true
5021                            }
5022                            Err(_) => false,
5023                        }
5024                    }
5025                }
5026                _ => false,
5027            }
5028        } else {
5029            // Descend with parent_guard still held; the recursion will
5030            // hold its own read lock and drop ours after it returns.
5031            let r = match child_arc {
5032                Some(child) => {
5033                    Self::delete_recursive(&child, key, key_comparator)
5034                }
5035                None => false,
5036            };
5037            drop(parent_guard);
5038            r
5039        }
5040    }
5041
5042    // ========================================================================
5043    // B-tree Merge / Compress
5044    // ========================================================================
5045
5046    /// Merge under-full sibling BIN pairs and remove empty subtrees.
5047    ///
5048    /// `INCompressor` / `Tree.compressInternal()` logic.
5049    ///
5050    /// merges two adjacent siblings when their combined entry count is
5051    /// ≤ `max_entries_per_node` (the merge threshold equal to the node
5052    /// capacity).  The left sibling's entries are prepended into the right
5053    /// sibling; the parent key slot pointing at the left sibling is then
5054    /// removed from the parent IN with `deleteEntry`.  If the parent IN
5055    /// becomes empty after the removal the process repeats recursively up
5056    /// the tree.
5057    ///
5058    /// This implementation performs a single post-order walk so that each
5059    /// level is compressed after all its children have been compressed.
5060    pub fn compress(&self) {
5061        let root = match self.get_root() {
5062            Some(r) => r,
5063            None => return,
5064        };
5065        Self::compress_node(&root, self.max_entries_per_node);
5066    }
5067
5068    /// Recursive post-order compress helper.
5069    ///
5070    /// Visits children first (post-order), then scans adjacent child
5071    /// pairs in the current IN and merges them when the merge condition
5072    /// holds: `left.n_entries + right.n_entries <= max_entries`.
5073    ///
5074    /// After merging, the parent entry for the left sibling is deleted.
5075    /// The loop restarts after each merge so that newly under-full pairs
5076    /// created by previous merges are also considered.
5077    fn compress_node(node_arc: &Arc<RwLock<TreeNode>>, max_entries: usize) {
5078        // Collect child arcs to recurse without holding the node lock.
5079        let children: Vec<Arc<RwLock<TreeNode>>> = {
5080            let g = node_arc.read();
5081            match &*g {
5082                TreeNode::Internal(n) => n.resident_children(),
5083                // BINs are leaves; nothing to compress at this level.
5084                TreeNode::Bottom(_) => return,
5085            }
5086        };
5087
5088        // Post-order: recurse into every child before working on this level.
5089        for child in &children {
5090            Self::compress_node(child, max_entries);
5091        }
5092
5093        // Compress the current IN level: merge adjacent under-full children.
5094        // Repeat until a full pass produces no merges.
5095        loop {
5096            let n_entries = {
5097                let g = node_arc.read();
5098                g.get_n_entries()
5099            };
5100
5101            let mut merged_any = false;
5102
5103            // `i` is the index of the *left* candidate; right is at `i+1`.
5104            let mut i = 0usize;
5105            while i + 1 < n_entries {
5106                // Fetch left and right child arcs.
5107                let (left_arc, right_arc) = {
5108                    let g = node_arc.read();
5109                    match &*g {
5110                        TreeNode::Internal(p) => {
5111                            let l = p.get_child(i);
5112                            let r = p.get_child(i + 1);
5113                            match (l, r) {
5114                                (Some(l), Some(r)) => (l, r),
5115                                _ => {
5116                                    i += 1;
5117                                    continue;
5118                                }
5119                            }
5120                        }
5121                        TreeNode::Bottom(_) => return,
5122                    }
5123                };
5124
5125                let left_n = { left_arc.read().get_n_entries() };
5126                let right_n = { right_arc.read().get_n_entries() };
5127
5128                // merge condition: combined count fits within one node.
5129                if left_n + right_n > max_entries {
5130                    i += 1;
5131                    continue;
5132                }
5133
5134                // Determine node kind from left child.
5135                let left_is_bin = { left_arc.read().is_bin() };
5136
5137                if left_is_bin {
5138                    // BIN merge: decompress left entries to full keys, then
5139                    // prepend into right BIN (also decompressed), and finally
5140                    // recompute the merged BIN's prefix.
5141                    // merge left into right, then
5142                    // recalcKeyPrefix on the merged node.
5143                    let left_full_entries: Vec<BinEntry> = {
5144                        {
5145                            let g = left_arc.read();
5146                            match &*g {
5147                                TreeNode::Bottom(b) => (0..b.entries.len())
5148                                    .map(|j| BinEntry {
5149                                        data: b.entries[j].data.clone(),
5150                                        known_deleted: b.entries[j]
5151                                            .known_deleted,
5152                                        dirty: b.entries[j].dirty,
5153                                        expiration_time: b.entries[j]
5154                                            .expiration_time,
5155                                    })
5156                                    .collect(),
5157                                _ => {
5158                                    i += 1;
5159                                    continue;
5160                                }
5161                            }
5162                        }
5163                    };
5164                    // T-3 / T-2: capture left's per-slot LSNs and FULL keys.
5165                    let (left_full_lsns, left_full_keys): (
5166                        Vec<Lsn>,
5167                        Vec<Vec<u8>>,
5168                    ) = {
5169                        let g = left_arc.read();
5170                        match &*g {
5171                            TreeNode::Bottom(b) => (
5172                                (0..b.entries.len())
5173                                    .map(|j| b.get_lsn(j))
5174                                    .collect(),
5175                                (0..b.entries.len())
5176                                    .map(|j| {
5177                                        b.get_full_key(j).unwrap_or_default()
5178                                    })
5179                                    .collect(),
5180                            ),
5181                            _ => (Vec::new(), Vec::new()),
5182                        }
5183                    };
5184                    {
5185                        {
5186                            let mut g = right_arc.write();
5187                            match &mut *g {
5188                                TreeNode::Bottom(rb) => {
5189                                    // Decompress right entries to full keys.
5190                                    let right_full: Vec<BinEntry> = (0..rb
5191                                        .entries
5192                                        .len())
5193                                        .map(|j| BinEntry {
5194                                            data: rb.entries[j].data.clone(),
5195                                            known_deleted: rb.entries[j]
5196                                                .known_deleted,
5197                                            dirty: rb.entries[j].dirty,
5198                                            expiration_time: rb.entries[j]
5199                                                .expiration_time,
5200                                        })
5201                                        .collect();
5202                                    // T-3 / T-2: right's per-slot LSNs + keys.
5203                                    let right_full_lsns: Vec<Lsn> =
5204                                        (0..rb.entries.len())
5205                                            .map(|j| rb.get_lsn(j))
5206                                            .collect();
5207                                    let right_full_keys: Vec<Vec<u8>> =
5208                                        (0..rb.entries.len())
5209                                            .map(|j| {
5210                                                rb.get_full_key(j)
5211                                                    .unwrap_or_default()
5212                                            })
5213                                            .collect();
5214                                    // Left entries are all smaller; prepend.
5215                                    let mut combined = left_full_entries;
5216                                    combined.extend(right_full);
5217                                    let mut combined_lsns = left_full_lsns;
5218                                    combined_lsns.extend(right_full_lsns);
5219                                    let mut combined_keys = left_full_keys;
5220                                    combined_keys.extend(right_full_keys);
5221                                    // Reset prefix and assign full keys.
5222                                    rb.key_prefix = Vec::new();
5223                                    rb.entries = combined;
5224                                    // T-3: rebuild the merged LSN array.
5225                                    rb.lsn_rep =
5226                                        LsnRep::from_lsns(&combined_lsns);
5227                                    // T-2: rebuild the merged key rep (Default;
5228                                    // recompute below compresses + compacts).
5229                                    rb.keys = KeyRep::from_keys(combined_keys);
5230                                    // Recompute prefix on merged BIN.
5231                                    if rb.entries.len() >= 2 {
5232                                        rb.recompute_key_prefix();
5233                                    } else {
5234                                        rb.keys
5235                                            .compact(rb.compact_max_key_length);
5236                                    }
5237                                    rb.dirty = true;
5238                                }
5239                                _ => {
5240                                    i += 1;
5241                                    continue;
5242                                }
5243                            }
5244                        }
5245                    }
5246                    // Clear the now-merged left BIN.
5247                    {
5248                        let mut g = left_arc.write();
5249                        if let TreeNode::Bottom(lb) = &mut *g {
5250                            lb.entries.clear();
5251                            lb.lsn_rep = LsnRep::Empty; // T-3
5252                            lb.keys = KeyRep::new(); // T-2
5253                            lb.key_prefix = Vec::new();
5254                            lb.dirty = true;
5255                        }
5256                    }
5257                } else {
5258                    // Upper-IN merge: prepend left's InEntries into right.
5259                    // T-4: capture left's resident children alongside its
5260                    // entries so they travel into the merged right IN.
5261                    let (left_in_entries, left_children): (
5262                        Vec<InEntry>,
5263                        Vec<Option<ChildArc>>,
5264                    ) = {
5265                        let g = left_arc.read();
5266                        match &*g {
5267                            TreeNode::Internal(n) => {
5268                                let children = (0..n.entries.len())
5269                                    .map(|j| n.get_child(j))
5270                                    .collect();
5271                                (n.entries.clone(), children)
5272                            }
5273                            _ => {
5274                                i += 1;
5275                                continue;
5276                            }
5277                        }
5278                    };
5279                    // T-3: capture left's per-slot LSNs.
5280                    let left_in_lsns: Vec<Lsn> = {
5281                        let g = left_arc.read();
5282                        match &*g {
5283                            TreeNode::Internal(n) => (0..n.entries.len())
5284                                .map(|j| n.get_lsn(j))
5285                                .collect(),
5286                            _ => Vec::new(),
5287                        }
5288                    };
5289                    let n_left = left_in_entries.len();
5290                    {
5291                        {
5292                            let mut g = right_arc.write();
5293                            match &mut *g {
5294                                TreeNode::Internal(rn) => {
5295                                    // Snapshot right's existing children, then
5296                                    // rebuild the merged entry + target arrays
5297                                    // (left half first, then right half).
5298                                    let right_children: Vec<Option<ChildArc>> =
5299                                        (0..rn.entries.len())
5300                                            .map(|j| rn.get_child(j))
5301                                            .collect();
5302                                    // T-3: snapshot right's LSNs too.
5303                                    let right_in_lsns: Vec<Lsn> =
5304                                        (0..rn.entries.len())
5305                                            .map(|j| rn.get_lsn(j))
5306                                            .collect();
5307                                    let mut combined = left_in_entries.clone();
5308                                    combined.append(&mut rn.entries);
5309                                    rn.entries = combined;
5310                                    // T-3: rebuild the merged LSN array.
5311                                    let mut combined_lsns =
5312                                        left_in_lsns.clone();
5313                                    combined_lsns.extend(right_in_lsns);
5314                                    rn.lsn_rep =
5315                                        LsnRep::from_lsns(&combined_lsns);
5316                                    rn.targets = TargetRep::None;
5317                                    for (j, c) in
5318                                        left_children.iter().enumerate()
5319                                    {
5320                                        if let Some(child) = c {
5321                                            rn.set_child(
5322                                                j,
5323                                                Some(child.clone()),
5324                                            );
5325                                        }
5326                                    }
5327                                    for (j, c) in
5328                                        right_children.into_iter().enumerate()
5329                                    {
5330                                        if c.is_some() {
5331                                            rn.set_child(n_left + j, c);
5332                                        }
5333                                    }
5334                                    rn.dirty = true;
5335                                }
5336                                _ => {
5337                                    i += 1;
5338                                    continue;
5339                                }
5340                            }
5341                        }
5342                    }
5343                    // Update parent pointers for moved children.
5344                    for child in left_children.into_iter().flatten() {
5345                        let mut cg = child.write();
5346                        cg.set_parent(Some(Arc::downgrade(&right_arc)));
5347                    }
5348                    // Clear the now-merged left IN.
5349                    {
5350                        let mut g = left_arc.write();
5351                        if let TreeNode::Internal(ln) = &mut *g {
5352                            ln.entries.clear();
5353                            ln.lsn_rep = LsnRep::Empty; // T-3
5354                            ln.targets = TargetRep::None;
5355                            ln.dirty = true;
5356                        }
5357                    }
5358                }
5359
5360                // Remove the right sibling's parent slot and update
5361                // the left slot to point at the merged right child.
5362                //
5363                // We keep the LEFT slot's key (which is the correct minimum for
5364                // the merged BIN's range) and remove the RIGHT slot (i+1).
5365                // This avoids having to update the parent key when i == 0.
5366                {
5367                    {
5368                        let mut g = node_arc.write();
5369                        match &mut *g {
5370                            TreeNode::Internal(p) => {
5371                                // Update left slot (i) to point at right_arc
5372                                // (which now contains the merged entries).
5373                                if i < p.entries.len() {
5374                                    p.set_child(i, Some(right_arc.clone()));
5375                                }
5376                                // Remove right slot (i+1) — it is now redundant.
5377                                // T-4: remove_entry shifts the child array too.
5378                                if i + 1 < p.entries.len() {
5379                                    p.remove_entry(i + 1);
5380                                }
5381                                p.dirty = true;
5382                            }
5383                            TreeNode::Bottom(_) => return,
5384                        }
5385                    }
5386                }
5387
5388                merged_any = true;
5389                // Advance i to check the merged BIN against its new right
5390                // sibling (the old slot i+2 is now at i+1).
5391                i += 1;
5392                let updated_n = { node_arc.read().get_n_entries() };
5393                if i + 1 >= updated_n {
5394                    break;
5395                }
5396            }
5397
5398            if !merged_any {
5399                break;
5400            }
5401        }
5402    }
5403
5404    // ========================================================================
5405    // BIN slot compression
5406    // ========================================================================
5407
5408    /// Compress deleted slots from a BIN node, then prune it from its parent
5409    /// IN when it becomes empty.
5410    ///
5411    /// (the in-place slot-removal
5412    /// path, NOT the sibling-merge path handled by `compress()`).
5413    ///
5414    /// # Algorithm
5415    ///
5416    /// 1. If the BIN is a delta, skip — deltas cannot be compressed.
5417    /// 2. Remove all slots where `entry.known_deleted` is true.  This mirrors
5418    ///    `bin.compress(!bin.shouldLogDelta(), localTracker)`.
5419    /// 3. If the BIN is now empty, remove it from its parent IN.  This mirrors
5420    ///    `pruneBIN(db, binRef, idKey)` → `tree.delete(idKey)`.
5421    ///
5422    /// # Arguments
5423    ///
5424    /// * `bin_arc` — the BIN to compress (must be a `TreeNode::Bottom`).
5425    ///
5426    /// # Returns
5427    ///
5428    /// `true` if compression made progress (slots were removed or the BIN was
5429    /// pruned), `false` if the BIN was skipped (delta, no cursors issue, etc.).
5430    pub fn compress_bin(&self, bin_arc: &Arc<RwLock<TreeNode>>) -> bool {
5431        self.compress_bin_with_lock_check(bin_arc, None)
5432    }
5433
5434    /// Like [`compress_bin`](Self::compress_bin), but consults a caller-supplied
5435    /// `is_locked` predicate before physically removing each `known_deleted`
5436    /// slot.  If `is_locked(slot_lsn)` returns `true`, the slot is SKIPPED
5437    /// (left for a later compression pass after the locking txn resolves).
5438    ///
5439    /// This is the faithful port of JE `BIN.compress` (BIN.java:1141-1172):
5440    ///
5441    /// > We have to be able to lock the LN before we can compress the entry.
5442    /// > If we can't, then skip over it. ... it is more efficient to call
5443    /// > `isLockUncontended` than to actually lock the LN, since we would
5444    /// > release the lock immediately.
5445    ///
5446    /// ```text
5447    /// if (lsn != DbLsn.NULL_LSN &&
5448    ///     !lockManager.isLockUncontended(lsn)) {
5449    ///     anyLocked = true;
5450    ///     continue;
5451    /// }
5452    /// ```
5453    ///
5454    /// JE's `isLockUncontended(lsn)` (LockManager.java:692) returns
5455    /// `nWaiters() == 0 && nOwners() == 0`.  Our `is_locked(lsn)` is its
5456    /// inverse: the dbi layer supplies a closure over the `LockManager` that
5457    /// returns `true` iff the slot's LSN has any owner or waiter
5458    /// (`LockManager::get_lock_info(lsn) != (0, 0)`).  A `NULL_LSN` slot is
5459    /// always discardable without locking (JE: "Can discard a NULL_LSN entry
5460    /// without locking"), so we never invoke the predicate for it.
5461    ///
5462    /// # Layering (noxu-tree -/-> noxu-txn)
5463    ///
5464    /// The predicate is a `&dyn Fn(u64) -> bool`, NOT a `LockManager`
5465    /// reference, so noxu-tree never depends on noxu-txn.  The lock knowledge
5466    /// lives entirely in the dbi-supplied closure.
5467    ///
5468    /// # Lock ordering (no deadlock)
5469    ///
5470    /// `is_locked` is invoked while this method holds the **BIN write latch**.
5471    /// The dbi closure calls `LockManager::get_lock_info`, which takes a
5472    /// lock-table *shard* mutex for a single, non-blocking critical section
5473    /// and releases it before returning — it never waits and never re-enters
5474    /// the tree.  The LockManager has no edge back into a BIN latch (lock
5475    /// acquisition descends the tree from the dbi/cursor layer, never the
5476    /// reverse).  The only ordering is therefore BIN-latch -> shard-mutex,
5477    /// which is acyclic; no lock cycle exists, so the predicate cannot
5478    /// deadlock against the latch.
5479    ///
5480    /// When `is_locked` is `None` (recovery, BIN-delta replay, unit tests with
5481    /// no lock manager) behavior is identical to the historical
5482    /// `compress_bin`: every `known_deleted` slot is removed.
5483    pub fn compress_bin_with_lock_check(
5484        &self,
5485        bin_arc: &Arc<RwLock<TreeNode>>,
5486        is_locked: Option<&dyn Fn(u64) -> bool>,
5487    ) -> bool {
5488        // ---- Step 1: collect metadata without holding the write lock ----
5489        let (is_delta, n_entries, id_key) = {
5490            {
5491                let g = bin_arc.read();
5492                match &*g {
5493                    TreeNode::Bottom(b) => {
5494                        // Identifier key = first full key in the BIN
5495                        // (the: bin.getIdentifierKey()).
5496                        let id_key = b.get_full_key(0);
5497                        (b.is_delta, b.entries.len(), id_key)
5498                    }
5499                    _ => return false, // not a BIN
5500                }
5501            }
5502        };
5503
5504        // If (bin.isBINDelta()) return; — deltas cannot be compressed.
5505        if is_delta {
5506            return false;
5507        }
5508
5509        // ---- Step 2: remove known-deleted slots) ----
5510        // We compress dirty slots too (compress_dirty_slots = true) because
5511        // we are not writing a BIN-delta here.
5512        let removed_any = {
5513            {
5514                let mut g = bin_arc.write();
5515                match &mut *g {
5516                    TreeNode::Bottom(b) => {
5517                        let before = b.entries.len();
5518                        // BIN.compress(): walk backwards to remove
5519                        // deleted slots without index confusion.
5520                        //
5521                        // IC-3 — JE `BIN.compress` (BIN.java:1141-1172) does
5522                        // NOT compress a slot it cannot lock: "We have to be
5523                        // able to lock the LN before we can compress the
5524                        // entry.  If we can't, then skip over it."  JE calls
5525                        // `lockManager.isLockUncontended(lsn)` and, on a
5526                        // contended slot, does `anyLocked = true; continue;`.
5527                        // We mirror that here via the optional `is_locked`
5528                        // predicate (supplied by the dbi layer, closing over
5529                        // the LockManager — see
5530                        // `compress_bin_with_lock_check`).  This removes the
5531                        // previously fragile implicit invariant ("no code path
5532                        // ever tombstones a slot before its txn commits"):
5533                        // even if a future write path leaves an uncommitted,
5534                        // write-locked `known_deleted` tombstone in a BinStub,
5535                        // the predicate keeps the compressor from physically
5536                        // removing a slot a live txn still references.
5537                        //
5538                        // When `is_locked` is `None` (recovery / BIN-delta
5539                        // replay / lock-manager-less tests) behavior is
5540                        // unchanged: every `known_deleted` slot is removed,
5541                        // matching the historical safe-by-invariant path.
5542                        let mut j = b.entries.len();
5543                        while j > 0 {
5544                            j -= 1;
5545                            if b.entries[j].known_deleted {
5546                                // IC-3 lock check (JE BIN.compress).  A
5547                                // NULL_LSN slot is always discardable without
5548                                // locking (JE: "Can discard a NULL_LSN entry
5549                                // without locking"), so we only consult the
5550                                // predicate for a non-null LSN.
5551                                if let Some(is_locked) = is_locked {
5552                                    let slot_lsn = b.get_lsn(j);
5553                                    if !slot_lsn.is_null()
5554                                        && is_locked(slot_lsn.as_u64())
5555                                    {
5556                                        // Slot still write-locked by an
5557                                        // in-flight txn — leave it for a later
5558                                        // pass (JE: anyLocked = true; continue).
5559                                        continue;
5560                                    }
5561                                }
5562                                // JE `IN.deleteEntry` (IN.java:3466): removing a
5563                                // DIRTY slot must prohibit the next delta — a
5564                                // delta only carries dirty slots, so the removal
5565                                // would otherwise be silently lost.  Force a
5566                                // full BIN on the next log.
5567                                if b.entries[j].dirty {
5568                                    b.prohibit_next_delta = true;
5569                                }
5570                                b.entries.remove(j);
5571                                b.keys.remove(j); // T-2
5572                                b.lsn_rep.remove_shift(j); // T-3
5573                                b.dirty = true;
5574                            }
5575                        }
5576                        // Recompute prefix after slot removal, since the
5577                        // remaining keys may share a longer common prefix.
5578                        // After compress(), call recalcKeyPrefix().
5579                        if b.entries.len() >= 2 {
5580                            b.recompute_key_prefix();
5581                        } else if b.entries.len() < 2 {
5582                            b.key_prefix = Vec::new();
5583                        }
5584                        b.entries.len() < before
5585                    }
5586                    _ => false,
5587                }
5588            }
5589        };
5590
5591        // ---- Step 3: prune empty BIN from parent ----
5592        // If (empty) pruneBIN(db, binRef, idKey)  → tree.delete(idKey).
5593        // We only prune when the BIN is actually empty after compression.
5594        let now_empty = { bin_arc.read().get_n_entries() == 0 };
5595
5596        if now_empty {
5597            // pruneBIN re-descends to the SPECIFIC empty BIN and removes its
5598            // parent-IN slot ONLY IF the BIN is still empty (and has no
5599            // cursors and is not a delta) UNDER THE PARENT LATCH.
5600            //
5601            // We must NOT use `self.delete(&id_key)` here (IC-1): that
5602            // re-descends by key and removes whatever live entry now matches
5603            // `id_key`.  Between reading `now_empty` (a fresh read lock taken
5604            // after the compression write lock was dropped) and acting on it,
5605            // a concurrent insert can repopulate this BIN; `self.delete` would
5606            // then drop a LIVE entry — tree corruption / lost write.
5607            //
5608            // JE `INCompressor.pruneBIN` (INCompressor.java ~line 502-510)
5609            // calls `tree.delete(idKey)`, and JE `Tree.delete` /
5610            // `searchDeletableSubTree` (Tree.java ~line 755-800) re-validates
5611            // `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort) and
5612            // `bin.nCursors() > 0` → CURSORS_EXIST (abort) while holding the
5613            // parent (branch) latch.  `prune_empty_bin` reproduces exactly
5614            // that re-validation.  See `prune_empty_bin` below.
5615            //
5616            // Note: we only attempt the prune if n_entries was > 0 before
5617            // compression (an already-empty BIN we never populated is left
5618            // alone, matching the pre-existing guard).
5619            if let Some(key) = id_key
5620                && n_entries > 0
5621            {
5622                self.prune_empty_bin(&key);
5623            }
5624            return true;
5625        }
5626
5627        removed_any
5628    }
5629
5630    /// Re-descend to the leaf BIN that should contain `id_key` and remove its
5631    /// parent-IN child slot ONLY IF the BIN is still safe to prune.
5632    ///
5633    /// This is the faithful port of JE `Tree.delete(idKey)` /
5634    /// `Tree.searchDeletableSubTree` (Tree.java ~line 755-800) as invoked by
5635    /// `INCompressor.pruneBIN` (INCompressor.java ~line 502-510).  JE takes the
5636    /// branch-parent latch, re-descends to the specific empty BIN, and aborts
5637    /// the prune (removing NOTHING) if any of the following changed since the
5638    /// compressor observed the BIN as empty:
5639    ///
5640    /// * `bin.getNEntries() != 0`  → `NodeNotEmptyException` (a concurrent
5641    ///   insert repopulated the BIN — IC-1: we must NOT delete a live entry).
5642    /// * `bin.isBINDelta()`        → `unexpectedState` (deltas are never empty).
5643    /// * `bin.nCursors() > 0`      → `CursorsExistException` (a cursor is parked
5644    ///   on the empty BIN; requeue rather than orphan the cursor).
5645    ///
5646    /// The re-check and the slot removal both happen while holding the
5647    /// **parent IN write latch**.  Holding the parent write latch blocks every
5648    /// descender (insert / delete take `parent.read()` hand-over-hand), so a
5649    /// concurrent insert cannot reach the BIN between our re-check and the
5650    /// slot removal — the TOCTOU window IC-1 describes is closed.
5651    ///
5652    /// Returns `true` iff a parent-IN slot was removed, `false` otherwise
5653    /// (BIN repopulated, has a cursor, is a delta, vanished, or is the root —
5654    /// in every `false` case NOTHING is removed).
5655    pub fn prune_empty_bin(&self, id_key: &[u8]) -> bool {
5656        let root = match self.get_root() {
5657            Some(r) => r,
5658            None => return false,
5659        };
5660
5661        // If the root itself is the BIN (single-BIN tree) there is no parent
5662        // IN to remove a slot from.  JE's searchDeletableSubTree returns null
5663        // ("the entire tree is empty") and keeps the root BIN; we do the same.
5664        if root.read().is_bin() {
5665            return false;
5666        }
5667
5668        // Descend by id_key tracking the IN that is the *parent of the leaf
5669        // BIN* and the child index within it.  Hand-over-hand read coupling
5670        // keeps the descent consistent with concurrent splits, exactly like
5671        // `get_parent_bin_for_child_ln`.
5672        let (parent_arc, child_index) = {
5673            let mut parent_arc: Arc<RwLock<TreeNode>> = root.clone();
5674            let mut guard: parking_lot::ArcRwLockReadGuard<
5675                parking_lot::RawRwLock,
5676                TreeNode,
5677            > = root.read_arc();
5678            loop {
5679                let (next_arc, idx) = match &*guard {
5680                    TreeNode::Internal(n) => {
5681                        if n.entries.is_empty() {
5682                            return false;
5683                        }
5684                        let idx = self.upper_in_floor_index(&n.entries, id_key);
5685                        match n.get_child(idx) {
5686                            Some(c) => (c, idx),
5687                            None => return false,
5688                        }
5689                    }
5690                    TreeNode::Bottom(_) => {
5691                        unreachable!("is_bin checked before / below")
5692                    }
5693                };
5694                // Is the next node the leaf BIN?  If so, `guard`'s node is the
5695                // parent IN we want and `idx` is the child slot.
5696                if next_arc.read().is_bin() {
5697                    drop(guard);
5698                    break (parent_arc, idx);
5699                }
5700                let next_guard = next_arc.read_arc();
5701                drop(guard);
5702                parent_arc = next_arc;
5703                guard = next_guard;
5704            }
5705        };
5706
5707        // ---- Re-validate and remove the slot UNDER THE PARENT WRITE LATCH ----
5708        // Holding parent.write() excludes all descenders (they need
5709        // parent.read()), so the BIN cannot be repopulated between the
5710        // re-check and the slot removal.
5711        let mut parent_guard = parent_arc.write();
5712        let pruned_bin_id;
5713        let removed_key_len = match &mut *parent_guard {
5714            TreeNode::Internal(p) => {
5715                let child = match p.get_child(child_index) {
5716                    Some(c) => c,
5717                    None => return false, // slot already vacated / invalid
5718                };
5719                // Re-validate the child BIN under the parent latch.
5720                {
5721                    let cg = child.read();
5722                    match &*cg {
5723                        TreeNode::Bottom(b) => {
5724                            // JE: bin.getNEntries() != 0 → NODE_NOT_EMPTY (abort).
5725                            if !b.entries.is_empty() {
5726                                return false;
5727                            }
5728                            // JE: bin.isBINDelta() → unexpectedState (abort).
5729                            if b.is_delta {
5730                                return false;
5731                            }
5732                            // JE: bin.nCursors() > 0 → CURSORS_EXIST (abort).
5733                            if b.cursor_count > 0 {
5734                                return false;
5735                            }
5736                            pruned_bin_id = b.node_id;
5737                        }
5738                        // A concurrent split could in principle have replaced
5739                        // the child with an IN; never prune in that case.
5740                        TreeNode::Internal(_) => return false,
5741                    }
5742                }
5743                // Safe to prune: remove the BIN's slot from the parent IN.
5744                // Mirrors the parent-slot removal `Tree.delete` performs for
5745                // an empty BIN (Tree.java deleteEntry under the branch latch).
5746                // T-4: remove_entry shifts the node-level child array too.
5747                let removed = p.remove_entry(child_index);
5748                p.dirty = true;
5749                removed.key.len()
5750            }
5751            TreeNode::Bottom(_) => return false,
5752        };
5753        drop(parent_guard);
5754
5755        // JE: removing the BIN slot detaches the BIN from the tree; the
5756        // evictor must drop it from its LRU lists (Evictor.remove).
5757        self.note_removed(pruned_bin_id);
5758
5759        // Preserve the memory-counter bookkeeping that `self.delete` performed
5760        // (IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage).
5761        // The pruned slot's key plus the fixed per-entry overhead matches the
5762        // `delete` accounting (key.len() + BIN_ENTRY_OVERHEAD).
5763        if let Some(counter) = &self.memory_counter {
5764            let delta = (removed_key_len + BIN_ENTRY_OVERHEAD) as i64;
5765            counter.fetch_sub(delta, Ordering::Relaxed);
5766        }
5767
5768        true
5769    }
5770
5771    /// Detach the resident child node `node_id` from its parent IN, dropping
5772    /// the strong `Arc` so the node is actually freed from memory, and return
5773    /// the heap bytes reclaimed (0 if not found / not detachable).
5774    ///
5775    /// This is the faithful port of JE `IN.detachNode(idx, updateLsn, newLsn)`
5776    /// (IN.java ~4019) as called from `Evictor.evict` (Evictor.java ~3035):
5777    /// `evict` measures `target.getBudgetedMemorySize()` and then
5778    /// `parent.detachNode(index, ...)` does `setTarget(idx, null)` to drop the
5779    /// child reference and `getInMemoryINs().remove(child)` to drop it from
5780    /// the INList.
5781    ///
5782    /// EV-13: before this method existed, the evictor credited
5783    /// `node_size_fn(node_id)` bytes back to the budget and removed the node
5784    /// from the LRU lists, but the parent's `InEntry.child` still held a
5785    /// strong `Arc` — so the node was never dropped from the heap.  The budget
5786    /// over-credited (claimed bytes freed that were not), `cache_usage`
5787    /// drifted below reality, and the evictor under-fired.  Detaching here
5788    /// drops the `Arc` for real and credits exactly the measured size.
5789    ///
5790    /// The detach happens **under the parent IN write latch** (JE detaches
5791    /// under the parent's latch), so no concurrent descender can re-cache the
5792    /// child between measurement and detach.  The slot (key + LSN) is kept —
5793    /// only the in-memory `child` target is cleared — matching JE's
5794    /// `setTarget(idx, null)` which leaves the `ChildReference` LSN intact so
5795    /// the node can be re-fetched from the log later.
5796    ///
5797    /// Returns `0` if the node is not a resident child of any IN (e.g. it is
5798    /// the root, already detached, or was pinned and could not be latched).
5799    pub fn detach_node_by_id(&self, node_id: u64) -> u64 {
5800        let root = match self.get_root() {
5801            Some(r) => r,
5802            None => return 0,
5803        };
5804
5805        // The root has no parent IN to detach from (JE evicts the root via a
5806        // separate evictRoot path; we keep the root resident here).
5807        let root_id = {
5808            let g = root.read();
5809            match &*g {
5810                TreeNode::Internal(n) => n.node_id,
5811                TreeNode::Bottom(b) => b.node_id,
5812            }
5813        };
5814        if root_id == node_id {
5815            return 0;
5816        }
5817
5818        // Locate the parent IN and the child slot index.
5819        let (parent_arc, child_index) =
5820            match Self::find_parent_of_node_id(&root, node_id) {
5821                Some(p) => p,
5822                None => return 0,
5823            };
5824
5825        // ---- Measure + detach UNDER THE PARENT WRITE LATCH ----
5826        // Holding parent.write() excludes all descenders (they take
5827        // parent.read() hand-over-hand), so the child cannot be re-cached or
5828        // re-pinned between the measurement and the detach.  Mirrors JE
5829        // detachNode running under the parent latch held by Evictor.evict.
5830        let mut parent_guard = parent_arc.write();
5831        let TreeNode::Internal(p) = &mut *parent_guard else {
5832            return 0; // parent is not an IN (concurrent restructure)
5833        };
5834        if child_index >= p.entries.len() {
5835            return 0;
5836        }
5837        // T-4: detach the cached child via the node-level INTargetRep, leaving
5838        // the slot's key/LSN intact for re-fetch (JE IN.setTarget(idx, null)).
5839        let child = match p.take_child(child_index) {
5840            Some(c) => c,     // child Arc removed from the slot
5841            None => return 0, // already detached
5842        };
5843
5844        // Measure the child's real heap footprint while we still hold it.
5845        // JE: long evictedBytes = target.getBudgetedMemorySize().
5846        let freed = child.read().budgeted_memory_size();
5847
5848        // EV-14 re-fetch correctness: the parent slot LSN must point at the
5849        // child's CURRENT on-disk version so `child_at_or_fetch` re-reads the
5850        // right bytes (JE `IN.updateEntry(idx, newLsn)` is called whenever a
5851        // child is logged; the parent slot LSN tracks the child's LSN).  The
5852        // evictor only fully evicts/detaches a CLEAN BIN (it logs+clears dirty
5853        // BINs via flush_dirty_node_to_log first, which sets `last_full_lsn`),
5854        // so the child's authoritative LSN is its `last_full_lsn`.  Stamp it
5855        // into the parent slot before dropping the child; if it is null (the
5856        // child was never logged) leave the existing slot LSN intact rather
5857        // than writing a null — a never-logged clean child cannot occur on
5858        // the evict path, but be conservative.
5859        let child_full_lsn = match &*child.read() {
5860            TreeNode::Bottom(b) => b.last_full_lsn,
5861            TreeNode::Internal(_) => NULL_LSN,
5862        };
5863        if child_full_lsn != NULL_LSN {
5864            p.set_lsn(child_index, child_full_lsn);
5865        }
5866
5867        // Mark the parent dirty: the slot's in-memory target changed (JE
5868        // detachNode sets dirty when updateLsn; we conservatively mark dirty
5869        // so the parent is re-logged with the now-non-resident slot).
5870        p.dirty = true;
5871
5872        // Drop the strong Arc explicitly so the node is freed now (the slot's
5873        // `child` is already None).  If any other resident path still held a
5874        // strong reference this would not free — but the tree is the sole
5875        // strong owner of a cached child, so this drops the last strong ref.
5876        drop(parent_guard);
5877        drop(child);
5878
5879        // JE: getInMemoryINs().remove(child) — drop it from the evictor LRU.
5880        self.note_removed(node_id);
5881
5882        // NOTE: the live tree-memory counter (`memory_counter`) is the SAME
5883        // `Arc<AtomicI64>` the evictor's Arbiter uses as `cache_usage`.  The
5884        // evictor decrements it once via `Arbiter::release_memory(bytes)` for
5885        // the full eviction batch, so detach must NOT decrement here too —
5886        // that would double-credit and drive `cache_usage` below reality
5887        // (the very drift EV-13 fixes, in the other direction).  We only
5888        // measure-and-free; the caller does the single counter update.
5889        freed
5890    }
5891
5892    /// Evict the root IN of this tree (EV-14).
5893    ///
5894    /// Faithful port of JE `Evictor.evictRoot` (Evictor.java:3050-3110) plus
5895    /// the `RootEvictor.doWork` + `Tree.withRootLatchedExclusive` framing
5896    /// (Evictor.java:2529-2576, Tree.java:508-517).  Unlike a normal IN, the
5897    /// root has no parent slot to detach from; instead the *tree's* root
5898    /// reference is the equivalent of the `RootChildReference`, so eviction:
5899    ///
5900    ///   1. Latches the root reference exclusively (`rootLatch.acquireExclusive`
5901    ///      via `withRootLatchedExclusive`).
5902    ///   2. Re-checks that the root is still resident and still evictable
5903    ///      (no resident children, no pinned BIN — JE `RootEvictor.doWork`
5904    ///      re-latches and re-checks `rootIN == target && rootIN.isRoot()`).
5905    ///   3. If the root is dirty, LOGS it first so the on-disk version is
5906    ///      current and updates `root_log_lsn` to the new LSN (JE
5907    ///      `evictRoot`: `long newLsn = target.log(...); rootRef.setLsn(newLsn)`).
5908    ///   4. Clears the in-memory root (`rootRef.clearTarget()` — JE leaves the
5909    ///      `ChildReference` LSN intact; here `root_log_lsn` is that LSN) and
5910    ///      `note_removed`s it from the evictor LRU (JE `inList.remove(target)`).
5911    ///
5912    /// On the next access `fetch_root_from_log` re-materializes the root from
5913    /// `root_log_lsn` (JE `Tree.getRootINRootAlreadyLatched` →
5914    /// `root.fetchTarget`).
5915    ///
5916    /// # Conditions (eviction is REFUSED, returning `None`, when)
5917    ///
5918    /// * there is no log manager wired (the root could never be re-fetched),
5919    /// * the tree has no resident root (already evicted),
5920    /// * the root has any resident child (JE only evicts a childless root —
5921    ///   the `hasCachedChildren` skip in `processTarget`; a root with cached
5922    ///   children would orphan them, the EV-6 invariant),
5923    /// * the root is a BIN pinned by a cursor (`cursor_count > 0`),
5924    /// * the root is dirty but we have no clean persisted version AND logging
5925    ///   it fails, or
5926    /// * the root is clean but `root_log_lsn` is null (never logged — cannot
5927    ///   be re-fetched; happens only for a brand-new unlogged tree).
5928    ///
5929    /// Returns `Some((freed_bytes, was_dirty))` on success, where `freed_bytes`
5930    /// is the root's measured heap footprint (JE
5931    /// `target.getBudgetedMemorySize()`) and `was_dirty` reports whether the
5932    /// root had to be logged (JE `rootEvictor.flushed`, which drives
5933    /// `nDirtyNodesEvicted` and `modifyDbRoot`).
5934    pub fn evict_root(&self, db_id: u64) -> Option<(u64, bool)> {
5935        // A root with no re-fetch path must never be made non-resident.
5936        self.log_manager.as_ref()?;
5937
5938        // JE `Tree.withRootLatchedExclusive(rootEvictor)`: hold the root latch
5939        // exclusively across the whole evict so no descender or splitter can
5940        // observe/install a half-evicted root.  Acquiring `self.root.write()`
5941        // is the Noxu equivalent (it is the lock guarding the root pointer).
5942        let mut root_slot = self.root.write();
5943        let root_arc = root_slot.as_ref()?.clone();
5944
5945        // JE `RootEvictor.doWork`: re-latch the target and re-check the
5946        // conditions.  We hold the node guard for the duration.
5947        let node_guard = root_arc.write();
5948
5949        // EV-6 / JE `processTarget` hasCachedChildren skip: a root with any
5950        // resident child must NOT be evicted (it would orphan the child).
5951        // EV-14 only evicts an *idle* root whose children are already
5952        // non-resident (or which is itself a leaf BIN).
5953        let (node_id, was_dirty, freed) = match &*node_guard {
5954            TreeNode::Internal(n) => {
5955                if !n.resident_children().is_empty() {
5956                    return None; // has cached children — keep resident
5957                }
5958                (n.node_id, n.dirty, node_guard.budgeted_memory_size())
5959            }
5960            TreeNode::Bottom(b) => {
5961                if b.cursor_count > 0 {
5962                    return None; // pinned by a cursor — keep resident
5963                }
5964                (
5965                    b.node_id,
5966                    b.dirty || b.dirty_count() > 0,
5967                    node_guard.budgeted_memory_size(),
5968                )
5969            }
5970        };
5971
5972        // If dirty, log the root first so the on-disk version is current,
5973        // then record the new LSN as the root's re-fetch point (JE
5974        // `evictRoot`: target.log(...) + rootRef.setLsn(newLsn)).
5975        if was_dirty {
5976            let lm = self.log_manager.as_ref()?; // checked above; re-borrow
5977            let node_bytes = node_guard.write_to_bytes();
5978            let is_bin = node_guard.is_bin();
5979            let entry = noxu_log::entry::in_log_entry::InLogEntry::new(
5980                db_id, NULL_LSN, // prev_full_lsn
5981                NULL_LSN, // prev_delta_lsn
5982                node_bytes,
5983            );
5984            let mut buf = bytes::BytesMut::with_capacity(entry.log_size());
5985            entry.write_to_log(&mut buf);
5986            let entry_type = if is_bin {
5987                noxu_log::LogEntryType::BIN
5988            } else {
5989                noxu_log::LogEntryType::IN
5990            };
5991            // flush_required = true so the root's bytes are durable before we
5992            // drop the in-memory copy (JE logs synchronously in evictRoot).
5993            let new_lsn = match lm.log(
5994                entry_type,
5995                &buf,
5996                noxu_log::Provisional::No,
5997                true,  // flush_required
5998                false, // fsync at next checkpoint
5999            ) {
6000                Ok(l) => l,
6001                Err(_) => return None, // could not log — keep the root resident
6002            };
6003            *self.root_log_lsn.write() = new_lsn;
6004        } else {
6005            // Clean root: it must already be re-fetchable.  If it was never
6006            // logged (root_log_lsn null) we cannot evict it safely.
6007            if *self.root_log_lsn.read() == NULL_LSN {
6008                return None;
6009            }
6010        }
6011
6012        // JE `rootRef.clearTarget()` + `inList.remove(target)`: drop the
6013        // in-memory root and remove it from the evictor LRU.  The root_log_lsn
6014        // is the surviving `ChildReference` LSN used to re-fetch it.
6015        drop(node_guard);
6016        *root_slot = None;
6017        drop(root_slot);
6018        self.note_removed(node_id);
6019
6020        Some((freed, was_dirty))
6021    }
6022
6023    /// Re-materialize an evicted root IN from its persisted `root_log_lsn`
6024    /// (EV-14, piece B).
6025    /// Faithful to JE `Tree.getRootINRootAlreadyLatched` (Tree.java:477-516)
6026    /// which calls `root.fetchTarget(database, null)` when the in-memory
6027    /// target is null.  Idempotent and cheap when the root is already
6028    /// resident: returns the resident root without touching the log.
6029    ///
6030    /// Returns `None` only when the tree is genuinely empty (no resident root
6031    /// AND `root_log_lsn` is null) or when the re-fetch fails (no log manager,
6032    /// log read error, deserialize failure) — callers then see an empty tree,
6033    /// never wrong data.
6034    pub fn fetch_root_from_log(&self) -> Option<Arc<RwLock<TreeNode>>> {
6035        // Fast path: root already resident.
6036        if let Some(r) = self.root.read().clone() {
6037            return Some(r);
6038        }
6039        // Take the write lock and re-check (another thread may have re-fetched
6040        // it while we waited — JE upgrades the root latch the same way).
6041        let mut root_slot = self.root.write();
6042        if let Some(r) = root_slot.as_ref() {
6043            return Some(r.clone());
6044        }
6045        let log_lsn = *self.root_log_lsn.read();
6046        let node = self.fetch_node_from_log(log_lsn)?;
6047        let node_id = node.node_id();
6048        let arc = Arc::new(RwLock::new(node));
6049        *root_slot = Some(arc.clone());
6050        drop(root_slot);
6051        // JE: a fetched IN is added back to the INList (Evictor LRU).
6052        self.note_added(node_id);
6053        Some(arc)
6054    }
6055
6056    /// Return the resident child Arc for slot `idx` of `parent_arc`, fetching
6057    /// it from its slot LSN and installing it if it is not resident (EV-14 /
6058    /// EV-13 re-fetch on descent).
6059    ///
6060    /// Faithful to JE `ChildReference.fetchTarget` (and `IN.fetchTarget`):
6061    /// when a slot's in-memory target is null but its LSN is valid, the node
6062    /// is read back from the log and cached in the slot.  Installing the
6063    /// fetched child requires the parent EX-latch, so this takes the parent
6064    /// write lock; the fast path (child already resident) takes only a read
6065    /// lock.
6066    ///
6067    /// Returns `None` only when the slot index is out of range, the slot has
6068    /// no valid LSN, or the log read/deserialize fails — callers then treat
6069    /// the descent as terminating in an empty subtree, never wrong data.
6070    fn child_at_or_fetch(
6071        &self,
6072        parent_arc: &Arc<RwLock<TreeNode>>,
6073        idx: usize,
6074    ) -> Option<ChildArc> {
6075        // Fast path: child already cached (read lock only).
6076        {
6077            let g = parent_arc.read();
6078            if let TreeNode::Internal(n) = &*g {
6079                if let Some(c) = n.get_child(idx) {
6080                    return Some(c);
6081                }
6082            } else {
6083                return None; // BINs have no IN children
6084            }
6085        }
6086        // Slow path: fetch the child from its slot LSN under the parent
6087        // EX-latch (JE installs the fetched target under the IN latch).
6088        let mut g = parent_arc.write();
6089        let TreeNode::Internal(n) = &mut *g else {
6090            return None;
6091        };
6092        // Re-check: another thread may have fetched it while we upgraded.
6093        if let Some(c) = n.get_child(idx) {
6094            return Some(c);
6095        }
6096        if idx >= n.entries.len() {
6097            return None;
6098        }
6099        let child_lsn = n.get_lsn(idx);
6100        let node = self.fetch_node_from_log(child_lsn)?;
6101        let node_id = node.node_id();
6102        let arc: ChildArc = Arc::new(RwLock::new(node));
6103        n.set_child(idx, Some(arc.clone()));
6104        drop(g);
6105        // JE: a fetched IN is added back to the INList (Evictor LRU).
6106        self.note_added(node_id);
6107        Some(arc)
6108    }
6109
6110    /// Check whether a BIN node is a candidate for slot compression and,
6111    /// if so, trigger `compress_bin`.
6112    ///
6113    /// from (the opportunistic / lazy compression path).
6114    ///
6115    /// # Algorithm
6116    ///
6117    /// 1. Skip the BIN if it is a delta or has no defunct (known-deleted) slots.
6118    /// 2. If compression succeeds and the BIN becomes empty, it is pruned.
6119    ///
6120    /// # Returns
6121    ///
6122    /// `true` if compression was triggered (regardless of whether any slots
6123    /// were actually removed), `false` if the BIN does not need compression.
6124    pub fn maybe_compress_bin_and_parent(
6125        &self,
6126        bin_arc: &Arc<RwLock<TreeNode>>,
6127    ) -> bool {
6128        // Check whether the BIN has any deleted slots worth compressing.
6129        // lazyCompress: skip deltas and BINs with no defunct slots.
6130        let should_compress = {
6131            {
6132                let g = bin_arc.read();
6133                match &*g {
6134                    TreeNode::Bottom(b) => {
6135                        // Skip deltas (the: !in.isBIN() || in.isBINDelta()).
6136                        if b.is_delta {
6137                            false
6138                        } else {
6139                            // Check for any known-deleted slot
6140                            // (the: for (int i=0; i < bin.getNEntries(); i++) {
6141                            //        if (bin.isDefunct(i)) { ... break; }
6142                            //      }).
6143                            b.entries.iter().any(|e| e.known_deleted)
6144                        }
6145                    }
6146                    _ => false,
6147                }
6148            }
6149        };
6150
6151        if !should_compress {
6152            return false;
6153        }
6154
6155        self.compress_bin(bin_arc)
6156    }
6157
6158    // ========================================================================
6159    // Latch-coupling validation
6160    // ========================================================================
6161
6162    /// Validate that `parent.entries[child_index].child` still points at
6163    /// `child_arc` after acquiring the child's latch.
6164    ///
6165    /// Re-latch validation step inside the
6166    /// `Tree.searchSplitsAllowed`: after a concurrent split the parent
6167    /// slot that previously held the child may have changed.  Callers that
6168    /// plan to mutate the child must verify the parent-child link is still
6169    /// intact before proceeding.
6170    ///
6171    /// Returns `true` if the parent-child link is intact.
6172    pub fn validate_parent_child(
6173        parent: &Arc<RwLock<TreeNode>>,
6174        child_index: usize,
6175        child_arc: &Arc<RwLock<TreeNode>>,
6176    ) -> bool {
6177        let g = parent.read();
6178        match &*g {
6179            TreeNode::Internal(p) => match p.child_ref(child_index) {
6180                Some(stored) => Arc::ptr_eq(stored, child_arc),
6181                None => false,
6182            },
6183            TreeNode::Bottom(_) => false,
6184        }
6185    }
6186
6187    /// Search for the BIN that should contain `key`, with latch-coupling
6188    /// validation at every level of descent.
6189    ///
6190    /// .
6191    ///
6192    /// The difference from `search()` is that after obtaining the child
6193    /// arc we call `validate_parent_child` to confirm the parent still
6194    /// holds the expected Arc.  If the link has been broken (e.g. by a
6195    /// concurrent split that relocated the child) the traversal restarts
6196    /// from the root.
6197    ///
6198    /// Returns a `SearchResult` if the key is (or should be) in the tree,
6199    /// `None` if the tree is empty.
6200    ///
6201    /// Same as [`Tree::search`] but exposes the hand-over-hand latch
6202    /// coupling explicitly. Kept as a public, equivalent API for
6203    /// callers (today only tests) that want to verify the
6204    /// latch-coupling behaviour against `search()` itself.
6205    ///
6206    /// Both `search()` and this method use the same `read_arc()`
6207    /// hand-over-hand: take the child read guard *before* dropping
6208    /// the parent guard, so a concurrent `split_child(parent, ..)`
6209    /// (which takes `parent.write()`) cannot run between when we
6210    /// captured the child Arc and when we entered the child. There
6211    /// is no validate-and-restart loop because the coupling makes
6212    /// the race unreachable.
6213    pub fn search_with_coupling(&self, key: &[u8]) -> Option<SearchResult> {
6214        let root = self.get_root()?;
6215        let mut guard: parking_lot::ArcRwLockReadGuard<
6216            parking_lot::RawRwLock,
6217            TreeNode,
6218        > = root.read_arc();
6219
6220        loop {
6221            if guard.is_bin() {
6222                let index = guard.find_entry(key, true, true);
6223                let found = index >= 0 && (index & EXACT_MATCH != 0);
6224                return Some(SearchResult::with_values(
6225                    found,
6226                    index & 0xFFFF,
6227                    false,
6228                ));
6229            }
6230
6231            let parent_arc =
6232                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
6233            let next_idx = match &*guard {
6234                TreeNode::Internal(n) => {
6235                    if n.entries.is_empty() {
6236                        return None;
6237                    }
6238                    let idx = self.upper_in_floor_index(&n.entries, key);
6239                    match n.get_child(idx) {
6240                        Some(c) => {
6241                            let next_guard = c.read_arc();
6242                            drop(guard);
6243                            guard = next_guard;
6244                            continue;
6245                        }
6246                        None => idx, // EV-14/EV-13: re-fetch below.
6247                    }
6248                }
6249                TreeNode::Bottom(_) => {
6250                    unreachable!("is_bin() returned false above")
6251                }
6252            };
6253            // Hand-over-hand: take the child read guard before
6254            // releasing the parent guard. Closes the
6255            // descender-vs-splitter window: a concurrent
6256            // split_child(parent, ..) takes parent.write(), which
6257            // blocks while we still hold parent.read().
6258            drop(guard);
6259            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
6260            guard = child.read_arc();
6261        }
6262    }
6263
6264    // ========================================================================
6265    // BIN-Delta reconstitution
6266    // ========================================================================
6267
6268    /// Increments the cursor-pin count on a BIN node.
6269    ///
6270    /// Called by `CursorImpl` when it positions on (or enters) a BIN.
6271    /// The evictor will not select a BIN with `cursor_count > 0` for eviction
6272    /// (`RealNodeInfo.pin_count`), matching `BIN.incrementCursorCount()`.
6273    pub fn pin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6274        let mut guard = bin_arc.write();
6275        if let TreeNode::Bottom(ref mut stub) = *guard {
6276            stub.cursor_count += 1;
6277        }
6278    }
6279
6280    /// Decrements the cursor-pin count on a BIN node.
6281    ///
6282    /// Called by `CursorImpl` when it moves away from or closes on a BIN.
6283    /// Uses `saturating_sub` to guard against an accidental double-unpin.
6284    /// Matching `BIN.decrementCursorCount()`.
6285    pub fn unpin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6286        let mut guard = bin_arc.write();
6287        if let TreeNode::Bottom(ref mut stub) = *guard {
6288            stub.cursor_count = stub.cursor_count.saturating_sub(1);
6289        }
6290    }
6291
6292    /// Returns `true` if the given `BinStub` is a BIN-delta (not a full BIN).
6293    ///
6294    /// `IN.isBINDelta()`.
6295    pub fn bin_is_delta(bin: &BinStub) -> bool {
6296        bin.is_delta
6297    }
6298
6299    /// Merge delta entries into a full BIN's entry list.
6300    ///
6301    /// - For each delta entry: if a matching key already exists in `bin`,
6302    ///   replace it (delta is authoritative).
6303    /// - Otherwise insert the delta entry in sorted position.
6304    ///
6305    /// Delta entries carry **full** keys (prefix already prepended by the
6306    /// caller).  After applying all delta entries the BIN's prefix is
6307    /// recomputed so the final state is consistent.
6308    ///
6309    /// All delta entries are considered to be the most-recently-dirtied
6310    /// state, exactly as in where delta slots supersede full-BIN slots.
6311    pub fn apply_delta_to_bin(
6312        bin: &mut BinStub,
6313        delta_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
6314    ) {
6315        for (full_key, lsn, data) in delta_entries {
6316            // `full_key` is a full (uncompressed) key here.
6317            bin.insert_with_prefix(full_key, lsn, data);
6318        }
6319        bin.dirty = true;
6320    }
6321
6322    /// Reconstitute a BIN-delta into a full BIN.
6323    ///
6324    /// from the:
6325    ///
6326    /// 1. Extract the delta entries from `self` (this BIN-delta), decompressing
6327    ///    them to full keys.
6328    /// 2. Apply them onto `base` (the previously logged full BIN) via
6329    ///    `apply_delta_to_bin`.
6330    /// 3. Copy `base`'s merged entries and prefix back into `self`.
6331    /// 4. Clear the `is_delta` flag so subsequent code treats `self` as
6332    ///    a full BIN.
6333    ///
6334    /// After this call `self` is a full BIN; `base` should be discarded.
6335    pub fn mutate_to_full_bin(delta: &mut BinStub, mut base: BinStub) {
6336        // Decompress delta entries to full keys before applying.
6337        let delta_full_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)> = (0
6338            ..delta.entries.len())
6339            .map(|i| {
6340                (
6341                    delta.get_full_key(i).unwrap_or_default(),
6342                    delta.get_lsn(i),
6343                    delta.entries[i].data.clone(),
6344                )
6345            })
6346            .collect();
6347        // reconstituteBIN + resetContent + setBINDelta(false).
6348        Self::apply_delta_to_bin(&mut base, delta_full_entries);
6349        delta.entries = base.entries;
6350        delta.lsn_rep = base.lsn_rep; // T-3
6351        delta.keys = base.keys; // T-2
6352        delta.key_prefix = base.key_prefix;
6353        delta.is_delta = false;
6354        delta.dirty = true;
6355    }
6356
6357    /// Read an IN/BIN log entry at `log_lsn` and deserialise it into a
6358    /// `TreeNode`, ready to be installed as a (re-fetched) resident node.
6359    ///
6360    /// JE `LogManager.getLogEntry(lsn)` + `IN.readFromLog` as used by
6361    /// `ChildReference.fetchTarget` (the path that re-materializes a
6362    /// non-resident node from its persisted LSN on descent) and by
6363    /// `Tree.getRootINRootAlreadyLatched` for the root.  The freshly-fetched
6364    /// node has no resident children (`TargetRep::None`); its own children, if
6365    /// any, are re-fetched on demand the same way when the descent reaches
6366    /// them.
6367    ///
6368    /// Returns `None` if the LSN is null, the log read fails, the entry is not
6369    /// an IN/BIN, or deserialisation fails (the caller treats this as "node
6370    /// unavailable" rather than panicking, matching the graceful-degradation
6371    /// policy of `mutate_to_full_bin_from_log`).
6372    fn fetch_node_from_log(&self, log_lsn: Lsn) -> Option<TreeNode> {
6373        if log_lsn == NULL_LSN {
6374            return None;
6375        }
6376        let lm = self.log_manager.as_ref()?;
6377        let (entry_type, payload) = lm.read_entry(log_lsn).ok()?;
6378        // The on-disk payload is an `InLogEntry` body (db_id | prev_full_lsn
6379        // | prev_delta_lsn | len | node_data).  The recovery scanner strips
6380        // this header before calling `recover_in_redo`; re-fetch must do the
6381        // same so `deserialize_*` sees the bare node bytes.  JE
6382        // `INLogEntry.readEntry` parses the same wrapper.
6383        let in_entry =
6384            noxu_log::entry::in_log_entry::InLogEntry::read_from_log(&payload)
6385                .ok()?;
6386        let node_data = &in_entry.node_data;
6387        use noxu_log::LogEntryType;
6388        match entry_type {
6389            LogEntryType::BIN => {
6390                Self::deserialize_bin(node_data).map(TreeNode::Bottom)
6391            }
6392            LogEntryType::IN => {
6393                Self::deserialize_upper_in(node_data).map(TreeNode::Internal)
6394            }
6395            // BIN-deltas are never logged as the *root* version and are
6396            // reconstituted by the BIN-delta path, not here.
6397            _ => {
6398                log::warn!(
6399                    "fetch_node_from_log: expected IN/BIN entry at LSN {:?}, \
6400                     got {:?}",
6401                    log_lsn,
6402                    entry_type
6403                );
6404                None
6405            }
6406        }
6407    }
6408
6409    /// Reconstitute a BIN-delta into a full BIN by reading the base from log.
6410    ///
6411    /// — the
6412    /// single-argument overload that calls `fetchFullBIN(databaseImpl)` to
6413    /// read the last full BIN from the log manager automatically.
6414    ///
6415    /// Algorithm:
6416    /// 1. If `delta.last_full_lsn == NULL_LSN`, the BIN was never written as a
6417    ///    full entry; there is no base to merge so the delta IS the full BIN.
6418    ///    Clear `is_delta` and return.
6419    /// 2. Read the full-BIN log entry at `delta.last_full_lsn` using
6420    ///    `log_manager.read_entry(lsn)`.
6421    /// 3. Deserialize the payload with `BinStub::deserialize_full()`.
6422    /// 4. Delegate to `Self::mutate_to_full_bin(delta, base)` to merge and
6423    ///    replace `delta`'s contents.
6424    ///
6425    /// On any read / parse failure the function falls back to clearing the
6426    /// `is_delta` flag without merging, so the caller always gets a non-delta
6427    /// BIN (possibly missing some old slots).  This mirrors the
6428    /// `EnvironmentFailureException` path but gracefully degrades instead of
6429    /// panicking.
6430    ///
6431    /// `BIN.fetchFullBIN(dbImpl)` + `BIN.mutateToFullBIN(boolean)`.
6432    pub fn mutate_to_full_bin_from_log(
6433        delta: &mut BinStub,
6434        log_manager: &noxu_log::LogManager,
6435    ) {
6436        if !delta.is_delta {
6437            // Already a full BIN; nothing to do.
6438            return;
6439        }
6440
6441        if delta.last_full_lsn == NULL_LSN {
6442            // BIN has never been logged as a full entry — the in-memory delta
6443            // is effectively the full state. During recovery this path is
6444            // harmless.
6445            delta.is_delta = false;
6446            return;
6447        }
6448
6449        // Read the full-BIN log entry at last_full_lsn.
6450        // `envImpl.getLogManager().getEntryHandleFileNotFound(lsn)`.
6451        match log_manager.read_entry(delta.last_full_lsn) {
6452            Ok((entry_type, payload)) => {
6453                use noxu_log::LogEntryType;
6454                if entry_type == LogEntryType::BIN {
6455                    if let Some(mut base) = BinStub::deserialize_full(&payload)
6456                    {
6457                        // Set the base's last_full_lsn so it is preserved
6458                        // into the merged result.
6459                        base.last_full_lsn = delta.last_full_lsn;
6460                        Self::mutate_to_full_bin(delta, base);
6461                        return;
6462                    }
6463                    // Deserialization failed — fall through to graceful degradation.
6464                    log::warn!(
6465                        "mutate_to_full_bin_from_log: failed to deserialize \
6466                         full BIN at LSN {:?}; keeping delta as-is",
6467                        delta.last_full_lsn
6468                    );
6469                } else {
6470                    log::warn!(
6471                        "mutate_to_full_bin_from_log: expected BIN entry at \
6472                         LSN {:?}, got {:?}",
6473                        delta.last_full_lsn,
6474                        entry_type
6475                    );
6476                }
6477            }
6478            Err(e) => {
6479                log::warn!(
6480                    "mutate_to_full_bin_from_log: failed to read log at \
6481                     LSN {:?}: {}",
6482                    delta.last_full_lsn,
6483                    e
6484                );
6485            }
6486        }
6487
6488        // Graceful degradation: promote the delta to a "full" BIN without
6489        // the base slots.  The BIN will be re-logged as a full BIN at the
6490        // next checkpoint.
6491        delta.is_delta = false;
6492        delta.dirty = true;
6493    }
6494
6495    // ========================================================================
6496    // getNextBin / getPrevBin
6497    // ========================================================================
6498
6499    /// Return the entries of the BIN immediately to the right of the BIN
6500    /// that contains (or would contain) `current_key`.
6501    ///
6502    /// → `Tree.getNextIN(forward=true)`.
6503    ///
6504    /// # Algorithm
6505    /// 1. Build a root-to-BIN path for `current_key`.
6506    /// 2. Walk the path back up looking for a parent that has a slot to the
6507    ///    right of the slot we descended through.
6508    /// 3. When found, descend to the leftmost BIN of that sibling subtree.
6509    /// 4. If no such parent exists, return `None` (no next BIN).
6510    pub fn get_next_bin(
6511        &self,
6512        current_key: &[u8],
6513    ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6514        let root = self.get_root()?;
6515        self.get_adjacent_bin(&root, current_key, true)
6516    }
6517
6518    /// Return the entries of the BIN immediately to the left of the BIN
6519    /// that contains (or would contain) `current_key`.
6520    ///
6521    /// → `Tree.getNextIN(forward=false)`.
6522    pub fn get_prev_bin(
6523        &self,
6524        current_key: &[u8],
6525    ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6526        let root = self.get_root()?;
6527        self.get_adjacent_bin(&root, current_key, false)
6528    }
6529
6530    /// Core implementation shared by `get_next_bin` and `get_prev_bin`.
6531    ///
6532    /// Builds the path from `root` down to the BIN for `current_key`
6533    /// (each element records the parent arc, the slot index taken,
6534    /// and the child Arc reached) using `read_arc()` hand-over-hand
6535    /// latch coupling.
6536    ///
6537    /// The ascent re-acquires the parent's read lock one level at a
6538    /// time. To handle a concurrent split that completes between
6539    /// path capture and ascent, we validate that the slot still
6540    /// holds the child Arc we descended through. If the slot
6541    /// mismatches we retry the whole operation from root with a
6542    /// short pause between attempts. The retry budget is generous
6543    /// (`MAX_ASCENT_ATTEMPTS`) so that the typical case of a few
6544    /// cascading splits between two BIN-level cursor steps is
6545    /// absorbed without surfacing as a false end-of-iteration.
6546    /// After exhausting the budget we conservatively return `None`,
6547    /// signalling "no adjacent BIN found"; the cursor will then
6548    /// either restart its scan or report end-of-iteration. The
6549    /// budget is finite so a pathological workload (a thread
6550    /// permanently splitting under us) cannot livelock the lookup.
6551    /// JE `Tree.getNextIN` / `Tree.getPrevIN`.
6552    ///
6553    /// R3 fix (2026-06-16): converted from `static fn` to `&self` so that the
6554    /// IN-level descent uses `self.upper_in_floor_index` (comparator-aware)
6555    /// instead of a raw byte `<=`. Without this, databases with a custom
6556    /// comparator (secondary indexes, sorted-dup) could descend to the wrong
6557    /// child → wrong adjacent BIN → incorrect cursor iteration across BIN
6558    /// boundaries. Mirrors `Tree.getNextIN`/`Tree.getPrevIN` using the
6559    /// comparator-aware `IN.findEntry`.
6560    fn get_adjacent_bin(
6561        &self,
6562        root: &Arc<RwLock<TreeNode>>,
6563        current_key: &[u8],
6564        forward: bool,
6565    ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6566        const MAX_ASCENT_ATTEMPTS: u32 = 8;
6567        for attempt in 0..MAX_ASCENT_ATTEMPTS {
6568            match self.get_adjacent_bin_attempt(root, current_key, forward) {
6569                AdjacentBinOutcome::Found(v) => return Some(v),
6570                AdjacentBinOutcome::NoAdjacent => return None,
6571                AdjacentBinOutcome::SplitRaceRetry => {
6572                    // Brief pause to let the splitter finish.
6573                    if attempt + 1 < MAX_ASCENT_ATTEMPTS {
6574                        std::thread::yield_now();
6575                    }
6576                }
6577            }
6578        }
6579        // Exhausted retry budget. Signal "no adjacent" so the
6580        // cursor can fall back to its end-of-iteration path.
6581        None
6582    }
6583
6584    /// One attempt at `get_adjacent_bin`. The tri-state return
6585    /// value distinguishes "no adjacent BIN exists" (which the
6586    /// caller should propagate as end-of-iteration) from "a
6587    /// concurrent split invalidated our path" (which the caller
6588    /// should retry from root).
6589    fn get_adjacent_bin_attempt(
6590        &self,
6591        root: &Arc<RwLock<TreeNode>>,
6592        current_key: &[u8],
6593        forward: bool,
6594    ) -> AdjacentBinOutcome {
6595        // Path entry: (parent_arc, slot_idx_taken, child_arc_reached).
6596        // The child Arc lets the ascent validate that the slot still
6597        // points to the same node we descended through.
6598        let mut path: Vec<(
6599            Arc<RwLock<TreeNode>>,
6600            usize,
6601            Arc<RwLock<TreeNode>>,
6602        )> = Vec::new();
6603
6604        let mut guard: parking_lot::ArcRwLockReadGuard<
6605            parking_lot::RawRwLock,
6606            TreeNode,
6607        > = root.read_arc();
6608        loop {
6609            if guard.is_bin() {
6610                break;
6611            }
6612
6613            let (next_arc, slot_idx) = match &*guard {
6614                TreeNode::Internal(n) => {
6615                    if n.entries.is_empty() {
6616                        return AdjacentBinOutcome::NoAdjacent;
6617                    }
6618                    // R3 fix: use comparator-aware upper_in_floor_index so
6619                    // that custom-comparator / sorted-dup databases descend
6620                    // to the correct child. Mirrors JE Tree.getNextIN which
6621                    // uses IN.findEntry (comparator-aware) not raw byte order.
6622                    let idx =
6623                        self.upper_in_floor_index(&n.entries, current_key);
6624                    let child = match n.get_child(idx) {
6625                        Some(c) => c,
6626                        None => return AdjacentBinOutcome::NoAdjacent,
6627                    };
6628                    (child, idx)
6629                }
6630                TreeNode::Bottom(_) => unreachable!(),
6631            };
6632
6633            // Record the parent and the child we are about to enter
6634            // — the child Arc lets the ascent validate the slot.
6635            let parent_arc =
6636                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
6637            path.push((parent_arc, slot_idx, Arc::clone(&next_arc)));
6638
6639            // Hand-over-hand: take child read lock BEFORE releasing parent.
6640            let next_guard = next_arc.read_arc();
6641            drop(guard);
6642            guard = next_guard;
6643        }
6644        drop(guard);
6645
6646        // Ascend the path. At each level, validate that
6647        // `parent.entries[taken_idx].child == descended_child` before
6648        // trusting `taken_idx` as a coordinate. If not, return
6649        // `SplitRaceRetry` so the caller restarts from root.
6650        while let Some((parent_arc, taken_idx, descended_child)) = path.pop() {
6651            let parent_guard = parent_arc.read();
6652            let (n_entries, slot_still_valid) = match &*parent_guard {
6653                TreeNode::Internal(p) => {
6654                    let n = p.entries.len();
6655                    let valid = p
6656                        .child_ref(taken_idx)
6657                        .is_some_and(|c| Arc::ptr_eq(c, &descended_child));
6658                    (n, valid)
6659                }
6660                _ => return AdjacentBinOutcome::NoAdjacent,
6661            };
6662            drop(parent_guard);
6663
6664            if !slot_still_valid {
6665                return AdjacentBinOutcome::SplitRaceRetry;
6666            }
6667
6668            let sibling_idx = if forward {
6669                taken_idx + 1
6670            } else if taken_idx == 0 {
6671                // No left sibling at this level — ascend further.
6672                continue;
6673            } else {
6674                taken_idx - 1
6675            };
6676
6677            if forward && sibling_idx >= n_entries {
6678                // No right sibling at this level — ascend further.
6679                continue;
6680            }
6681
6682            // Found a sibling slot — fetch the sibling child arc.
6683            let sibling_arc = {
6684                let g = parent_arc.read();
6685                match &*g {
6686                    TreeNode::Internal(p) => match p.get_child(sibling_idx) {
6687                        Some(c) => c,
6688                        None => return AdjacentBinOutcome::NoAdjacent,
6689                    },
6690                    _ => return AdjacentBinOutcome::NoAdjacent,
6691                }
6692            };
6693
6694            // Descend to the leftmost (forward) or rightmost (!forward) BIN.
6695            return match Self::descend_to_edge_bin(&sibling_arc, forward) {
6696                Some(v) => AdjacentBinOutcome::Found(v),
6697                None => AdjacentBinOutcome::NoAdjacent,
6698            };
6699        }
6700
6701        // Exhausted path without finding a sibling → no adjacent BIN.
6702        AdjacentBinOutcome::NoAdjacent
6703    }
6704
6705    /// Descend to the leftmost BIN (`forward = true`) or rightmost BIN
6706    /// (`forward = false`) in the sub-tree rooted at `node_arc`.
6707    ///
6708    /// `Tree.searchSubTree(SearchType.LEFT / RIGHT, targetLevel)`.
6709    fn descend_to_edge_bin(
6710        node_arc: &Arc<RwLock<TreeNode>>,
6711        forward: bool,
6712    ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6713        // Hand-over-hand latch coupling — see Tree::search.
6714        let mut guard: parking_lot::ArcRwLockReadGuard<
6715            parking_lot::RawRwLock,
6716            TreeNode,
6717        > = node_arc.read_arc();
6718
6719        loop {
6720            if guard.is_bin() {
6721                return match &*guard {
6722                    TreeNode::Bottom(b) => {
6723                        // Return entries with full (decompressed) keys so that
6724                        // callers always work with complete keys.
6725                        //
6726                        // TREE-F1: KD slots are NOT filtered here — the BIN's
6727                        // slot indices are returned verbatim so the cursor can
6728                        // skip KD slots itself (CursorImpl getNext loop;
6729                        // CursorImpl.java:2062-2064) and continue to the next
6730                        // BIN when an edge BIN is entirely KD during the
6731                        // BIN-delta reconstitution window.
6732                        let full_entries: Vec<(BinEntry, Lsn, Vec<u8>)> = (0
6733                            ..b.entries.len())
6734                            .map(|i| {
6735                                (
6736                                    BinEntry {
6737                                        data: b.entries[i].data.clone(),
6738                                        known_deleted: b.entries[i]
6739                                            .known_deleted,
6740                                        dirty: b.entries[i].dirty,
6741                                        expiration_time: b.entries[i]
6742                                            .expiration_time,
6743                                    },
6744                                    b.get_lsn(i),
6745                                    b.get_full_key(i).unwrap_or_default(),
6746                                )
6747                            })
6748                            .collect();
6749                        Some(full_entries)
6750                    }
6751                    _ => None,
6752                };
6753            }
6754
6755            let next = match &*guard {
6756                TreeNode::Internal(n) => {
6757                    if forward {
6758                        n.get_child(0)?
6759                    } else {
6760                        n.get_child(n.entries.len().saturating_sub(1))?
6761                    }
6762                }
6763                _ => return None,
6764            };
6765            // Take child read lock BEFORE releasing parent's.
6766            let next_guard = next.read_arc();
6767            drop(guard);
6768            guard = next_guard;
6769        }
6770    }
6771}
6772
6773// ============================================================================
6774// Tree statistics
6775// ============================================================================
6776
/// Structural statistics gathered by a full walk of the resident tree.
///
/// Filled in by `Tree::collect_stats`; plays the role of JE's
/// `TreeWalkerStatsAccumulator`.  `Default` yields all-zero counters, which
/// is also what an empty tree reports.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct TreeStats {
    /// Number of BINs (bottom internal nodes) visited.
    pub n_bins: u64,
    /// Number of upper INs visited.
    pub n_ins: u64,
    /// Total number of entries (slots) across all visited nodes, upper INs
    /// and BINs alike.
    pub n_entries: u64,
    /// Height of the tree (1 = root is a BIN, 2 = one level above BINs, …).
    pub height: u32,
}
6791
6792impl Tree {
6793    /// Walks the entire tree and collects structural statistics.
6794    ///
6795    /// `TreeWalkerStatsAccumulator` pattern — performs a simple
6796    /// recursive DFS and counts INs, BINs, entries, and tree height.
6797    pub fn collect_stats(&self) -> TreeStats {
6798        let mut stats = TreeStats::default();
6799        if let Some(root) = self.get_root() {
6800            Self::collect_stats_recursive(&root, &mut stats, 0);
6801        }
6802        stats
6803    }
6804
6805    fn collect_stats_recursive(
6806        node_arc: &Arc<RwLock<TreeNode>>,
6807        stats: &mut TreeStats,
6808        depth: u32,
6809    ) {
6810        let guard = node_arc.read();
6811
6812        let current_height = depth + 1;
6813        if current_height > stats.height {
6814            stats.height = current_height;
6815        }
6816
6817        match &*guard {
6818            TreeNode::Bottom(b) => {
6819                stats.n_bins += 1;
6820                stats.n_entries += b.entries.len() as u64;
6821            }
6822            TreeNode::Internal(n) => {
6823                stats.n_ins += 1;
6824                stats.n_entries += n.entries.len() as u64;
6825                // Collect child arcs before releasing the guard.
6826                let children: Vec<Arc<RwLock<TreeNode>>> =
6827                    n.resident_children();
6828                // Release guard before recursing to avoid lock ordering issues.
6829                drop(guard);
6830                for child in children {
6831                    Self::collect_stats_recursive(&child, stats, depth + 1);
6832                }
6833            }
6834        }
6835    }
6836
6837    /// Collects all dirty BINs as (Arc to node, db_id) pairs.
6838    ///
6839    /// The checkpoint path calls this to enumerate BINs that need to be
6840    /// logged.  For each dirty BIN the checkpoint decides — based on the
6841    /// BIN-delta threshold — whether to write a full `BIN` entry or a
6842    /// `BINDelta` entry.
6843    ///
6844    /// `Checkpointer.processINList()` which iterates the dirty
6845    /// IN list accumulated during normal operation.
6846    pub fn collect_dirty_bins(
6847        &self,
6848        db_id: u64,
6849    ) -> Vec<(u64, Arc<RwLock<TreeNode>>)> {
6850        let mut result = Vec::new();
6851        if let Some(root) = self.get_root() {
6852            Self::collect_dirty_bins_recursive(&root, db_id, &mut result);
6853        }
6854        result
6855    }
6856
6857    fn collect_dirty_bins_recursive(
6858        node_arc: &Arc<RwLock<TreeNode>>,
6859        db_id: u64,
6860        out: &mut Vec<(u64, Arc<RwLock<TreeNode>>)>,
6861    ) {
6862        let guard = node_arc.read();
6863        match &*guard {
6864            TreeNode::Bottom(b) => {
6865                // Include this BIN if it is dirty or has any dirty slots.
6866                if b.dirty || b.dirty_count() > 0 {
6867                    out.push((db_id, Arc::clone(node_arc)));
6868                }
6869            }
6870            TreeNode::Internal(n) => {
6871                let children: Vec<Arc<RwLock<TreeNode>>> =
6872                    n.resident_children();
6873                drop(guard);
6874                for child in children {
6875                    Self::collect_dirty_bins_recursive(&child, db_id, out);
6876                } // guard already dropped
6877            }
6878        }
6879    }
6880
6881    /// Collect all BINs that have at least one `known_deleted` slot.
6882    ///
6883    /// INCompressor queue-drain scan in the: the daemon iterates
6884    /// the in-memory IN list and identifies BINs that still hold zombie deleted
6885    /// slots.  Each returned `Arc` can be passed directly to `compress_bin()`.
6886    pub fn collect_bins_with_known_deleted(
6887        &self,
6888    ) -> Vec<Arc<RwLock<TreeNode>>> {
6889        let mut result = Vec::new();
6890        if let Some(root) = self.get_root() {
6891            Self::collect_bins_with_known_deleted_recursive(&root, &mut result);
6892        }
6893        result
6894    }
6895
6896    fn collect_bins_with_known_deleted_recursive(
6897        node_arc: &Arc<RwLock<TreeNode>>,
6898        out: &mut Vec<Arc<RwLock<TreeNode>>>,
6899    ) {
6900        let guard = node_arc.read();
6901        match &*guard {
6902            TreeNode::Bottom(b) => {
6903                if b.entries.iter().any(|e| e.known_deleted) {
6904                    out.push(Arc::clone(node_arc));
6905                }
6906            }
6907            TreeNode::Internal(n) => {
6908                let children: Vec<Arc<RwLock<TreeNode>>> =
6909                    n.resident_children();
6910                drop(guard);
6911                for child in children {
6912                    Self::collect_bins_with_known_deleted_recursive(
6913                        &child, out,
6914                    );
6915                }
6916            }
6917        }
6918    }
6919
6920    /// Collect all dirty upper (non-BIN) internal nodes, sorted ascending by
6921    /// level (bottom-up order, BIN level excluded).
6922    ///
6923    /// Serialise an upper-IN node (level > 1) by node_id for off-heap storage.
6924    ///
6925    /// Traverses the tree to find the internal node whose  matches,
6926    /// then calls  to produce a compact byte
6927    /// representation.  Returns  if the node is not found or is a BIN
6928    /// (BINs are not upper INs).
6929    ///
6930    /// Mirrors `OffHeapAllocator` serialises the same bytes that would be written
6931    /// to the log, allowing the evictor to store upper-INs off-heap and avoid
6932    /// log-file reads on the next traversal.
6933    pub fn serialize_upper_in(&self, node_id: u64) -> Option<Vec<u8>> {
6934        let root = self.get_root()?;
6935        Self::find_and_serialize_upper_in(&root, node_id)
6936    }
6937
6938    fn find_and_serialize_upper_in(
6939        node_arc: &Arc<RwLock<TreeNode>>,
6940        target_id: u64,
6941    ) -> Option<Vec<u8>> {
6942        let guard = node_arc.read();
6943        match &*guard {
6944            TreeNode::Bottom(_) => None, // BINs are not upper INs
6945            TreeNode::Internal(n) => {
6946                if n.node_id == target_id {
6947                    // Serialise InNodeStub for off-heap storage.
6948                    // Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE)
6949                    //   then per-entry: key_len(u32BE) | key | lsn(u64BE)
6950                    let mut buf = Vec::new();
6951                    buf.extend_from_slice(&n.node_id.to_be_bytes());
6952                    buf.extend_from_slice(&n.level.to_be_bytes());
6953                    buf.extend_from_slice(
6954                        &(n.entries.len() as u32).to_be_bytes(),
6955                    );
6956                    for (i, e) in n.entries.iter().enumerate() {
6957                        buf.extend_from_slice(
6958                            &(e.key.len() as u32).to_be_bytes(),
6959                        );
6960                        buf.extend_from_slice(&e.key);
6961                        buf.extend_from_slice(
6962                            &n.get_lsn(i).as_u64().to_be_bytes(),
6963                        );
6964                    }
6965                    return Some(buf);
6966                }
6967                // Recurse into children before releasing the guard so we
6968                // hold the minimum read-lock duration.
6969                let children: Vec<Arc<RwLock<TreeNode>>> =
6970                    n.resident_children();
6971                drop(guard);
6972                for child in &children {
6973                    if let Some(bytes) =
6974                        Self::find_and_serialize_upper_in(child, target_id)
6975                    {
6976                        return Some(bytes);
6977                    }
6978                }
6979                None
6980            }
6981        }
6982    }
6983
6984    /// Upper-IN traversal in `Checkpointer.processINList()` from
6985    /// — visits all `TreeNode::Internal` nodes whose `dirty` flag is set
6986    /// and returns them together with their level, sorted lowest-level-first
6987    /// so the checkpointer can log them bottom-up.  The root is always the
6988    /// last entry (highest level), which must be logged `Provisional::No`.
6989    pub fn collect_dirty_upper_ins(
6990        &self,
6991        _db_id: u64,
6992    ) -> Vec<(i32, Arc<RwLock<TreeNode>>)> {
6993        let mut result: Vec<(i32, Arc<RwLock<TreeNode>>)> = Vec::new();
6994        if let Some(root) = self.get_root() {
6995            Self::collect_dirty_upper_ins_recursive(&root, &mut result);
6996        }
6997        result.sort_by_key(|(level, _)| *level);
6998        result
6999    }
7000
7001    fn collect_dirty_upper_ins_recursive(
7002        node_arc: &Arc<RwLock<TreeNode>>,
7003        out: &mut Vec<(i32, Arc<RwLock<TreeNode>>)>,
7004    ) {
7005        let guard = node_arc.read();
7006        match &*guard {
7007            TreeNode::Bottom(_) => {
7008                // BINs are handled by flush_dirty_bins_internal; skip here.
7009            }
7010            TreeNode::Internal(n) => {
7011                let is_dirty = n.dirty;
7012                // REC-AA: return the node's ACTUAL tree level (n.level, in
7013                // MAIN_LEVEL|n units), not a root-relative depth.  The level
7014                // must be on the same scale as a BIN's `level` (BIN_LEVEL =
7015                // MAIN_LEVEL|1) so that the checkpointer's flush-level
7016                // computation and the evictor's `node_level < flush_level`
7017                // comparison are meaningful.  With a root-relative depth the
7018                // root had the SMALLEST value (0) and the IN above the BINs
7019                // the LARGEST, inverting the provisional/non-provisional
7020                // boundary; with n.level the root has the largest level, as JE
7021                // expects.
7022                let level = n.level;
7023                let children: Vec<Arc<RwLock<TreeNode>>> =
7024                    n.resident_children();
7025                drop(guard);
7026                // Recurse into children first (bottom-up ordering).
7027                for child in &children {
7028                    Self::collect_dirty_upper_ins_recursive(child, out);
7029                }
7030                // Add this node after children (so parent comes after all descendants).
7031                if is_dirty {
7032                    out.push((level, Arc::clone(node_arc)));
7033                }
7034            }
7035        }
7036    }
7037
7038    // ========================================================================
7039    // Tree.java ports: 8 additional tree methods (Task #82)
7040    // ========================================================================
7041
7042    /// Returns `true` if the root node is currently loaded in memory.
7043    ///
7044    /// .
7045    pub fn is_root_resident(&self) -> bool {
7046        self.root.read().is_some()
7047    }
7048
7049    /// Returns the root node `Arc` if present, or `None`.
7050    ///
7051    /// .
7052    pub fn get_resident_root_in(&self) -> Option<Arc<RwLock<TreeNode>>> {
7053        self.root.read().clone()
7054    }
7055
7056    /// Returns the BIN that should contain a slot for `key` (the "parent" of
7057    /// LN slots).
7058    ///
7059    /// .  Descends the tree
7060    /// exactly like `search()` and returns the leaf-level BIN arc, or `None`
7061    /// if the tree is empty.
7062    ///
7063    /// Uses `read_arc()` hand-over-hand on the descent — the child
7064    /// guard is taken before the parent guard is dropped, matching
7065    /// `search()`. Returns the BIN Arc with no read lock held; the
7066    /// caller must take whatever lock it needs to operate on the
7067    /// returned BIN.
7068    pub fn get_parent_bin_for_child_ln(
7069        &self,
7070        key: &[u8],
7071    ) -> Option<Arc<RwLock<TreeNode>>> {
7072        let root = self.get_root()?;
7073        let mut current_arc: Arc<RwLock<TreeNode>> = root.clone();
7074        let mut guard: parking_lot::ArcRwLockReadGuard<
7075            parking_lot::RawRwLock,
7076            TreeNode,
7077        > = root.read_arc();
7078
7079        loop {
7080            if guard.is_bin() {
7081                drop(guard);
7082                return Some(current_arc);
7083            }
7084
7085            let parent_arc = current_arc.clone();
7086            let next_idx = match &*guard {
7087                TreeNode::Internal(n) => {
7088                    if n.entries.is_empty() {
7089                        return None;
7090                    }
7091                    let idx = self.upper_in_floor_index(&n.entries, key);
7092                    match n.get_child(idx) {
7093                        Some(c) => {
7094                            let next_guard = c.read_arc();
7095                            drop(guard);
7096                            current_arc = c;
7097                            guard = next_guard;
7098                            continue;
7099                        }
7100                        None => idx, // EV-14/EV-13: re-fetch below.
7101                    }
7102                }
7103                TreeNode::Bottom(_) => {
7104                    unreachable!("is_bin() returned false above")
7105                }
7106            };
7107            // Hand-over-hand: take child guard before dropping parent.
7108            drop(guard);
7109            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
7110            let next_guard = child.read_arc();
7111            current_arc = child;
7112            guard = next_guard;
7113        }
7114    }
7115
7116    /// Returns the BIN where `key` should be inserted.
7117    ///
7118    /// .  Semantically identical to
7119    /// `get_parent_bin_for_child_ln` — expressed as a separate method to match
7120    /// API surface.
7121    ///
7122    /// Implemented as a delegation to `get_parent_bin_for_child_ln`,
7123    /// which uses `read_arc()` hand-over-hand on the descent.
7124    pub fn find_bin_for_insert(
7125        &self,
7126        key: &[u8],
7127    ) -> Option<Arc<RwLock<TreeNode>>> {
7128        self.get_parent_bin_for_child_ln(key)
7129    }
7130
7131    /// Search for a BIN, allowing splits during descent (preemptive splitting).
7132    ///
7133    /// .  This thin wrapper
7134    /// delegates to `search()` and returns the result wrapped in `Some`.
7135    /// The full split-allowed descent is performed by `insert()` internally;
7136    /// this method exposes the same result type for callers that only need to
7137    /// locate the BIN.
7138    ///
7139    /// Returns `None` if the tree is empty.
7140    pub fn search_splits_allowed(&self, key: &[u8]) -> Option<SearchResult> {
7141        self.search(key)
7142    }
7143
7144    /// Traverses the entire tree and returns every IN and BIN node as a flat
7145    /// list.
7146    ///
7147    /// .  Used by recovery to rebuild
7148    /// the in-memory IN list after log replay.  The walk is a BFS from the
7149    /// root; every `Arc<RwLock<TreeNode>>` encountered (both Internal and
7150    /// Bottom variants) is included in the result.
7151    pub fn rebuild_in_list(&self) -> Vec<Arc<RwLock<TreeNode>>> {
7152        let mut result = Vec::new();
7153        if let Some(root) = self.get_root() {
7154            Self::rebuild_in_list_recursive(&root, &mut result);
7155        }
7156        result
7157    }
7158
7159    fn rebuild_in_list_recursive(
7160        node_arc: &Arc<RwLock<TreeNode>>,
7161        out: &mut Vec<Arc<RwLock<TreeNode>>>,
7162    ) {
7163        // Push this node unconditionally — both INs and BINs belong in the list.
7164        out.push(Arc::clone(node_arc));
7165
7166        let guard = node_arc.read();
7167
7168        if let TreeNode::Internal(n) = &*guard {
7169            // Collect child arcs while holding the guard, then drop it before
7170            // recursing to avoid holding multiple locks simultaneously.
7171            let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
7172            drop(guard);
7173            for child in children {
7174                Self::rebuild_in_list_recursive(&child, out);
7175            }
7176        }
7177        // BIN nodes are leaves — no children to recurse into.
7178    }
7179
7180    /// Validates internal tree consistency.
7181    ///
7182    /// .  Primarily a debug/test tool.
7183    ///
7184    /// Rules checked:
7185    /// - An empty tree (no root) is trivially valid → returns `true`.
7186    /// - A non-empty tree must have a non-null root.
7187    /// - Every Internal node must have at least one entry.
7188    /// - Every child pointer that is `Some` must be readable (lock must be
7189    ///   acquirable — i.e., no poisoned locks).
7190    ///
7191    /// Returns `true` if no inconsistencies are detected, `false` otherwise.
7192    pub fn validate_in_list(&self) -> bool {
7193        match self.get_root() {
7194            None => true, // empty tree is always valid
7195            Some(root) => Self::validate_node(&root),
7196        }
7197    }
7198
7199    fn validate_node(node_arc: &Arc<RwLock<TreeNode>>) -> bool {
7200        let guard = node_arc.read();
7201
7202        match &*guard {
7203            TreeNode::Bottom(_bin) => {
7204                // BIN nodes are always structurally valid at this level.
7205                true
7206            }
7207            TreeNode::Internal(n) => {
7208                // An Internal node must have at least one entry.
7209                if n.entries.is_empty() {
7210                    return false;
7211                }
7212                // Collect child arcs before dropping the guard.
7213                let children: Vec<Arc<RwLock<TreeNode>>> =
7214                    n.resident_children();
7215                drop(guard);
7216                // Recursively validate every resident child.
7217                for child in children {
7218                    if !Self::validate_node(&child) {
7219                        return false;
7220                    }
7221                }
7222                true
7223            }
7224        }
7225    }
7226
7227    /// Traverses the tree to find the parent IN that contains `child_node_id`
7228    /// as one of its child slots.
7229    ///
7230    /// .  Used by the cleaner
7231    /// migration path to re-insert migrated INs after eviction/fetch.
7232    ///
7233    /// Returns `(parent_arc, slot_index)` where `slot_index` is the position
7234    /// in the parent's `entries` vector whose child matches `child_node_id`,
7235    /// or `None` if no such parent is found.
7236    pub fn get_parent_in_for_child_in(
7237        &self,
7238        child_node_id: u64,
7239    ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7240        let root = self.get_root()?;
7241        Self::find_parent_of_node_id(&root, child_node_id)
7242    }
7243
7244    /// Recursive DFS helper for `get_parent_in_for_child_in`.
7245    ///
7246    /// Scans every entry in each Internal node.  When a child's node_id
7247    /// matches `target_id` the parent arc and slot index are returned.
7248    fn find_parent_of_node_id(
7249        node_arc: &Arc<RwLock<TreeNode>>,
7250        target_id: u64,
7251    ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7252        let guard = node_arc.read();
7253
7254        let TreeNode::Internal(n) = &*guard else {
7255            // BIN nodes have no IN children — cannot be a parent of another IN.
7256            return None;
7257        };
7258
7259        // Check whether any child of this IN has the target node_id.
7260        let mut children: Vec<(usize, Arc<RwLock<TreeNode>>)> = Vec::new();
7261        for slot in 0..n.entries.len() {
7262            if let Some(child_arc) = n.child_ref(slot) {
7263                // Read the child's node_id under a separate lock (acquire child
7264                // while parent guard is still held — this is intentional for
7265                // the ID comparison only; we release both immediately after).
7266                let child_id = {
7267                    let cg = child_arc.read();
7268                    match &*cg {
7269                        TreeNode::Internal(cn) => cn.node_id,
7270                        TreeNode::Bottom(cb) => cb.node_id,
7271                    }
7272                };
7273
7274                if child_id == target_id {
7275                    // Found — return a clone of this node as parent.
7276                    let parent_clone = Arc::clone(node_arc);
7277                    return Some((parent_clone, slot));
7278                }
7279
7280                // Not found at this slot; schedule this child for recursion.
7281                children.push((slot, Arc::clone(child_arc)));
7282            }
7283        }
7284        // Release parent guard before recursing.
7285        drop(guard);
7286
7287        // Recurse into each Internal child.
7288        for (_slot, child_arc) in children {
7289            if let Some(result) =
7290                Self::find_parent_of_node_id(&child_arc, target_id)
7291            {
7292                return Some(result);
7293            }
7294        }
7295
7296        None
7297    }
7298
7299    /// Propagates the dirty flag upward from `node_arc` to the root.
7300    ///
7301    /// Implicit dirty propagation: after modifying any node,
7302    /// all ancestors on the path to the root must also be marked dirty so
7303    /// the checkpointer logs them.
7304    ///
7305    /// In this happens through `IN.setDirty(true)` calls at each level
7306    /// during split/insert callbacks.  Here we walk the weak parent chain.
7307    /// Reconstitute a BIN-delta by merging it onto a base full BIN.
7308    ///
7309    /// Implements JE `BINDelta.reconstituteBIN(databaseImpl)` for the recovery
7310    /// path where the log manager is not available as a `LogManager` but as
7311    /// raw serialized bytes.
7312    ///
7313    /// Algorithm:
7314    /// 1. Deserialise `base_bytes` as a full `BinStub`.
7315    /// 2. Apply `delta_bytes` slots onto the base using `BinStub::apply_delta`
7316    ///    (raw slot overlay).
7317    /// 3. Recompute key prefix so prefix-compressed entries are consistent.
7318    ///
7319    /// Returns `None` if either byte slice is malformed.
7320    ///
7321    /// JE `BINDelta.reconstituteBIN` / `BINDelta.applyDelta`
7322    /// (DRIFT-10 / Stage 3).
7323    pub fn reconstitute_bin_delta(
7324        base_bytes: &[u8],
7325        delta_bytes: &[u8],
7326    ) -> Option<BinStub> {
7327        let mut base = BinStub::deserialize_full(base_bytes)?;
7328        // Apply the delta slots onto the base.
7329        // Note: BinStub::apply_delta uses slot-index addressing into base.entries,
7330        // extending with new entries when the slot_idx >= base.entries.len().
7331        // After apply_delta we recompute the key prefix to fix prefix compression.
7332        BinStub::apply_delta(&mut base, delta_bytes)?;
7333        // Recompute prefix so prefix-compressed BINs are consistent after merge.
7334        base.recompute_key_prefix();
7335        base.is_delta = false;
7336        base.dirty = false;
7337        Some(base)
7338    }
7339
7340    pub fn propagate_dirty_to_root(node_arc: &Arc<RwLock<TreeNode>>) {
7341        let parent_weak = { node_arc.read().get_parent() };
7342
7343        if let Some(parent_arc) = parent_weak.and_then(|w| w.upgrade()) {
7344            {
7345                let mut g = parent_arc.write();
7346                g.set_dirty(true);
7347            }
7348            // Recurse further up.
7349            Self::propagate_dirty_to_root(&parent_arc);
7350        }
7351    }
7352
7353    // ========================================================================
7354    // IN-redo: JE RecoveryManager.recoverIN / recoverRootIN / recoverChildIN
7355    // ========================================================================
7356
7357    /// Deserialise an upper-IN node from bytes produced by
7358    /// `TreeNode::write_to_bytes()` / `flush_one_tree_upper_ins`.
7359    ///
7360    /// Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE) | dirty(u8)
7361    ///   | per-entry: key_len(u16BE) | key | lsn(u64BE)
7362    ///
7363    /// JE `INFileReader.getIN(db)` / `IN.readFromLog`.
7364    pub fn deserialize_upper_in(bytes: &[u8]) -> Option<InNodeStub> {
7365        if bytes.len() < 13 {
7366            return None;
7367        }
7368        let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
7369        let level = i32::from_be_bytes(bytes[8..12].try_into().ok()?);
7370        let n_entries =
7371            u32::from_be_bytes(bytes[12..16].try_into().ok()?) as usize;
7372        // dirty byte (1 byte after n_entries)
7373        if bytes.len() < 17 {
7374            return None;
7375        }
7376        let mut pos = 17usize; // skip node_id(8) + level(4) + n_entries(4) + dirty(1)
7377        let mut entries = Vec::with_capacity(n_entries);
7378        let mut lsns: Vec<Lsn> = Vec::with_capacity(n_entries);
7379        for _ in 0..n_entries {
7380            if pos + 2 > bytes.len() {
7381                return None;
7382            }
7383            let key_len =
7384                u16::from_be_bytes(bytes[pos..pos + 2].try_into().ok()?)
7385                    as usize;
7386            pos += 2;
7387            if pos + key_len > bytes.len() {
7388                return None;
7389            }
7390            let key = bytes[pos..pos + key_len].to_vec();
7391            pos += key_len;
7392            if pos + 8 > bytes.len() {
7393                return None;
7394            }
7395            let lsn = noxu_util::Lsn::from_u64(u64::from_be_bytes(
7396                bytes[pos..pos + 8].try_into().ok()?,
7397            ));
7398            pos += 8;
7399            entries.push(InEntry { key });
7400            lsns.push(lsn); // T-3
7401        }
7402        Some(InNodeStub {
7403            node_id,
7404            level,
7405            entries,
7406            // T-4: a freshly deserialized IN has no resident children.
7407            targets: TargetRep::None,
7408            dirty: false,
7409            generation: 0,
7410            parent: None,
7411            lsn_rep: LsnRep::from_lsns(&lsns), // T-3
7412        })
7413    }
7414
7415    /// Deserialise a BIN from bytes produced by `BinStub::serialize_full()`.
7416    ///
7417    /// Thin wrapper so the recovery path does not need to import `BinStub`
7418    /// directly from callers that only have the raw bytes.
7419    ///
7420    /// JE `INFileReader.getIN(db)` for a BIN entry.
7421    pub fn deserialize_bin(bytes: &[u8]) -> Option<BinStub> {
7422        let mut bin = BinStub::deserialize_full(bytes)?;
7423        bin.dirty = false; // freshly loaded from log — clean for now
7424        Some(bin)
7425    }
7426
7427    /// Apply a logged IN/BIN to the in-memory tree during the recovery redo pass.
7428    ///
7429    /// Implements JE `RecoveryManager.recoverIN`:
7430    /// - `is_root` nodes are handled by `recover_root_in`.
7431    /// - non-root nodes are handled by `recover_child_in`.
7432    ///
7433    /// `log_lsn` is the LSN at which this IN/BIN was logged.  The currency
7434    /// check in `recover_child_in` uses this to decide whether to replace the
7435    /// in-memory slot (tree slot LSN < log_lsn → replace; equal → noop;
7436    /// greater → skip).
7437    ///
7438    /// JE `RecoveryManager.recoverIN` / `replayOneIN`
7439    /// (RecoveryManager.java ~lines 1200–1280).
7440    pub fn recover_in_redo(
7441        &self,
7442        log_lsn: noxu_util::Lsn,
7443        is_root: bool,
7444        is_bin: bool,
7445        node_data: &[u8],
7446    ) -> InRedoResult {
7447        if is_bin {
7448            let Some(bin) = Self::deserialize_bin(node_data) else {
7449                return InRedoResult::DeserializeFailed;
7450            };
7451            if is_root {
7452                self.recover_root_bin(log_lsn, bin)
7453            } else {
7454                self.recover_child_bin(log_lsn, bin)
7455            }
7456        } else {
7457            let Some(upper) = Self::deserialize_upper_in(node_data) else {
7458                return InRedoResult::DeserializeFailed;
7459            };
7460            if is_root {
7461                self.recover_root_upper_in(log_lsn, upper)
7462            } else {
7463                self.recover_child_upper_in(log_lsn, upper)
7464            }
7465        }
7466    }
7467
7468    /// Recover a root BIN.
7469    ///
7470    /// If no root exists or the existing root is older (lower LSN), install
7471    /// this BIN as the new root.
7472    ///
7473    /// JE `RecoveryManager.recoverRootIN` / `RootUpdater.doWork`
7474    /// (RecoveryManager.java ~lines 1293–1410).
7475    fn recover_root_bin(
7476        &self,
7477        log_lsn: noxu_util::Lsn,
7478        bin: BinStub,
7479    ) -> InRedoResult {
7480        let mut root_guard = self.root.write();
7481        let existing_lsn = *self.root_log_lsn.read();
7482        match &*root_guard {
7483            None => {
7484                // No root — install this BIN as the root.
7485                // JE: `root == null` case in `RootUpdater.doWork`.
7486                let node = TreeNode::Bottom(bin);
7487                *root_guard = Some(Arc::new(RwLock::new(node)));
7488                *self.root_log_lsn.write() = log_lsn;
7489                InRedoResult::Inserted
7490            }
7491            Some(_) => {
7492                // JE: `originalLsn = root.getLsn()`; replace if logLsn > originalLsn.
7493                if log_lsn > existing_lsn {
7494                    let node = TreeNode::Bottom(bin);
7495                    *root_guard = Some(Arc::new(RwLock::new(node)));
7496                    *self.root_log_lsn.write() = log_lsn;
7497                    InRedoResult::Replaced
7498                } else {
7499                    InRedoResult::Skipped
7500                }
7501            }
7502        }
7503    }
7504
7505    /// Recover a root upper IN.
7506    ///
7507    /// JE `RecoveryManager.recoverRootIN` for a non-BIN root.
7508    fn recover_root_upper_in(
7509        &self,
7510        log_lsn: noxu_util::Lsn,
7511        upper: InNodeStub,
7512    ) -> InRedoResult {
7513        let mut root_guard = self.root.write();
7514        let existing_lsn = *self.root_log_lsn.read();
7515        match &*root_guard {
7516            None => {
7517                let node = TreeNode::Internal(upper);
7518                *root_guard = Some(Arc::new(RwLock::new(node)));
7519                *self.root_log_lsn.write() = log_lsn;
7520                InRedoResult::Inserted
7521            }
7522            Some(_) => {
7523                if log_lsn > existing_lsn {
7524                    let node = TreeNode::Internal(upper);
7525                    *root_guard = Some(Arc::new(RwLock::new(node)));
7526                    *self.root_log_lsn.write() = log_lsn;
7527                    InRedoResult::Replaced
7528                } else {
7529                    InRedoResult::Skipped
7530                }
7531            }
7532        }
7533    }
7534
7535    /// Recover a non-root BIN.
7536    ///
7537    /// Implements the three-case currency check from JE
7538    /// `RecoveryManager.recoverChildIN`
7539    /// (RecoveryManager.java lines 1412–1500):
7540    ///
7541    /// 1. Node not in tree: skip (parent logged a later structure that already
7542    ///    omits this node, or node was deleted).
7543    /// 2. Physical match (slot LSN == log_lsn): noop — already current.
7544    /// 3. Logical match: another version of the node is in the slot.
7545    ///    Replace if tree slot LSN < log_lsn (tree is older), skip otherwise.
7546    fn recover_child_bin(
7547        &self,
7548        log_lsn: noxu_util::Lsn,
7549        bin: BinStub,
7550    ) -> InRedoResult {
7551        let node_id = bin.node_id;
7552        let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7553        else {
7554            // Case 1: not in tree.
7555            return InRedoResult::NotInTree;
7556        };
7557        let mut parent = parent_arc.write();
7558        let TreeNode::Internal(ref mut p) = *parent else {
7559            return InRedoResult::NotInTree;
7560        };
7561        let tree_lsn = p.get_lsn(slot); // T-3
7562        if tree_lsn == log_lsn {
7563            // Case 2: physical match — noop.
7564            InRedoResult::Skipped
7565        } else if tree_lsn < log_lsn {
7566            // Case 3: logical match, tree is older — replace.
7567            // JE `parent.recoverIN(idx, inFromLog, logLsn, lastLoggedSize)`.
7568            let new_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin)));
7569            // Set parent back-pointer on the new node.
7570            {
7571                let mut ng = new_arc.write();
7572                if let TreeNode::Bottom(ref mut b) = *ng {
7573                    b.parent = Some(Arc::downgrade(&parent_arc));
7574                }
7575            }
7576            p.set_child(slot, Some(new_arc));
7577            p.set_lsn(slot, log_lsn); // T-3
7578            InRedoResult::Replaced
7579        } else {
7580            // tree_lsn > log_lsn: tree already holds a newer version.
7581            InRedoResult::Skipped
7582        }
7583    }
7584
7585    /// Recover a non-root upper IN.
7586    ///
7587    /// JE `RecoveryManager.recoverChildIN` for a non-BIN node.
7588    fn recover_child_upper_in(
7589        &self,
7590        log_lsn: noxu_util::Lsn,
7591        upper: InNodeStub,
7592    ) -> InRedoResult {
7593        let node_id = upper.node_id;
7594        let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7595        else {
7596            return InRedoResult::NotInTree;
7597        };
7598        let mut parent = parent_arc.write();
7599        let TreeNode::Internal(ref mut p) = *parent else {
7600            return InRedoResult::NotInTree;
7601        };
7602        let tree_lsn = p.get_lsn(slot); // T-3
7603        if tree_lsn == log_lsn {
7604            InRedoResult::Skipped
7605        } else if tree_lsn < log_lsn {
7606            let new_arc = Arc::new(RwLock::new(TreeNode::Internal(upper)));
7607            {
7608                let mut ng = new_arc.write();
7609                if let TreeNode::Internal(ref mut n) = *ng {
7610                    n.parent = Some(Arc::downgrade(&parent_arc));
7611                }
7612            }
7613            p.set_child(slot, Some(new_arc));
7614            p.set_lsn(slot, log_lsn); // T-3
7615            InRedoResult::Replaced
7616        } else {
7617            InRedoResult::Skipped
7618        }
7619    }
7620}
7621
/// Outcome reported by one `recover_in_redo` call.
///
/// JE traces the same outcomes in `RecoveryManager` debug logging.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum InRedoResult {
    /// The node became the root of a tree that had none.
    Inserted,
    /// The node took the place of an older version already in the tree.
    Replaced,
    /// Nothing was applied: the tree's version is equal or newer.
    Skipped,
    /// No slot in the tree refers to the node (the parent logged a later
    /// structure that excludes it).
    NotInTree,
    /// The `node_data` bytes could not be deserialised.
    DeserializeFailed,
}
7638
/// Process-wide counter from which all node ids are drawn; starts at 1.
///
/// This is the SINGLE source of node ids for the whole tree subsystem.  The
/// BIN constructor (`bin.rs`) and `node.rs` route through
/// `generate_node_id`, so that after crash recovery a newly allocated id is
/// strictly greater than every node id present in the recovered log.
///
/// JE ref: `NodeSequence.getNextLocalNodeId` (a single per-env counter) and
/// `IN.nodeId` allocation; `NodeSequence.initRealNodeId` seeds the counter
/// from the recovered `CheckpointEnd.lastLocalNodeId`.  The env seeds this
/// counter post-recovery via `seed_node_id_counter`.
static NODE_ID_COUNTER: AtomicU64 = AtomicU64::new(1);
7652
7653/// Generates a unique node ID.
7654pub fn generate_node_id() -> u64 {
7655    NODE_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
7656}
7657
7658/// Returns the node-id that would be generated next (without allocating it).
7659///
7660/// Used by recovery seeding and by tests to assert no node-id reuse after a
7661/// restart.
7662pub fn peek_next_node_id_counter() -> u64 {
7663    NODE_ID_COUNTER.load(std::sync::atomic::Ordering::SeqCst)
7664}
7665
7666/// Seeds the node-id counter so the next generated id is `> last_node_id`.
7667///
7668/// Called by `EnvironmentImpl` after recovery with the recovered
7669/// `use_max_node_id`, mirroring `NodeSequence.initRealNodeId` /
7670/// `setLastNodeId`: post-restart allocation must never reuse a node-id that
7671/// is already in the log.  Monotonic: never lowers the counter.
7672pub fn seed_node_id_counter(last_node_id: u64) {
7673    let want_next = last_node_id.saturating_add(1);
7674    // Bump only if our current next is below the recovered floor.
7675    let mut cur = NODE_ID_COUNTER.load(std::sync::atomic::Ordering::SeqCst);
7676    while cur < want_next {
7677        match NODE_ID_COUNTER.compare_exchange_weak(
7678            cur,
7679            want_next,
7680            std::sync::atomic::Ordering::SeqCst,
7681            std::sync::atomic::Ordering::SeqCst,
7682        ) {
7683            Ok(_) => break,
7684            Err(observed) => cur = observed,
7685        }
7686    }
7687}
7688
7689#[cfg(test)]
7690mod tests {
7691    use super::*;
7692
7693    // ====================================================================
7694    // T-3: LsnRep packed-LSN encoding (IN.entryLsnByteArray / getLsn /
7695    // setLsnInternal, IN.java:1752-1935).
7696    // ====================================================================
7697
7698    /// All-NULL node uses the 0-byte Empty rep; reads return NULL_LSN.
7699    #[test]
7700    fn lsnrep_empty_is_zero_bytes() {
7701        let rep = LsnRep::new(64);
7702        assert!(matches!(rep, LsnRep::Empty));
7703        assert_eq!(rep.memory_size(), 0);
7704        assert_eq!(rep.get(0), NULL_LSN);
7705        assert_eq!(rep.get(63), NULL_LSN);
7706    }
7707
7708    /// LSNs sharing a file number pack to the Compact rep (4 bytes/slot,
7709    /// base_file_number-relative) and round-trip exactly.
7710    #[test]
7711    fn lsnrep_compact_roundtrip_same_file() {
7712        let mut rep = LsnRep::new(8);
7713        for i in 0..8u32 {
7714            rep.set(i as usize, Lsn::new(7, 1000 + i), 8);
7715        }
7716        assert!(matches!(rep, LsnRep::Compact { .. }));
7717        for i in 0..8u32 {
7718            assert_eq!(rep.get(i as usize), Lsn::new(7, 1000 + i));
7719        }
7720        // 8 slots * 4 bytes = 32 bytes, far below 8 * 8 = 64 for raw u64.
7721        assert_eq!(rep.memory_size(), 8 * 4);
7722    }
7723
7724    /// NULL_LSN is stored via the 0xffffff file-offset sentinel, NOT u64::MAX,
7725    /// so a node with NULL slots still packs Compact (the blocker JE solves).
7726    #[test]
7727    fn lsnrep_null_does_not_force_long() {
7728        let mut rep = LsnRep::new(4);
7729        rep.set(0, Lsn::new(3, 50), 4);
7730        rep.set(1, NULL_LSN, 4);
7731        rep.set(2, Lsn::new(3, 60), 4);
7732        rep.set(3, NULL_LSN, 4);
7733        assert!(
7734            matches!(rep, LsnRep::Compact { .. }),
7735            "NULL slots must NOT force the Long rep"
7736        );
7737        assert_eq!(rep.get(0), Lsn::new(3, 50));
7738        assert_eq!(rep.get(1), NULL_LSN);
7739        assert_eq!(rep.get(2), Lsn::new(3, 60));
7740        assert_eq!(rep.get(3), NULL_LSN);
7741    }
7742
7743    /// base_file_number tracks the minimum; setting a lower file number
7744    /// re-bases the whole array (adjustFileNumbers) while staying Compact.
7745    #[test]
7746    fn lsnrep_rebase_on_lower_file_number() {
7747        let mut rep = LsnRep::new(3);
7748        rep.set(0, Lsn::new(10, 5), 3);
7749        rep.set(1, Lsn::new(12, 6), 3);
7750        // A lower file number re-bases base_file_number to 8.
7751        rep.set(2, Lsn::new(8, 7), 3);
7752        assert!(matches!(rep, LsnRep::Compact { .. }));
7753        assert_eq!(rep.get(0), Lsn::new(10, 5));
7754        assert_eq!(rep.get(1), Lsn::new(12, 6));
7755        assert_eq!(rep.get(2), Lsn::new(8, 7));
7756    }
7757
7758    /// A file-number spread > 127 forces the Long fallback (mutateToLongArray),
7759    /// still round-tripping every slot.
7760    #[test]
7761    fn lsnrep_mutates_to_long_on_wide_file_range() {
7762        let mut rep = LsnRep::new(2);
7763        rep.set(0, Lsn::new(1, 5), 2);
7764        rep.set(1, Lsn::new(1000, 6), 2); // diff 999 > 127 -> Long
7765        assert!(matches!(rep, LsnRep::Long(_)));
7766        assert_eq!(rep.get(0), Lsn::new(1, 5));
7767        assert_eq!(rep.get(1), Lsn::new(1000, 6));
7768    }
7769
7770    /// A file offset > MAX_FILE_OFFSET (0xfffffe) forces the Long fallback.
7771    #[test]
7772    fn lsnrep_mutates_to_long_on_large_offset() {
7773        let mut rep = LsnRep::new(2);
7774        rep.set(0, Lsn::new(1, 10), 2);
7775        rep.set(1, Lsn::new(1, 0x00ff_ffff), 2); // > MAX_FILE_OFFSET -> Long
7776        assert!(matches!(rep, LsnRep::Long(_)));
7777        assert_eq!(rep.get(1), Lsn::new(1, 0x00ff_ffff));
7778    }
7779
7780    /// insert_shift / remove_shift keep slots aligned (INArrayRep.copy).
7781    #[test]
7782    fn lsnrep_insert_and_remove_shift() {
7783        let mut rep = LsnRep::from_lsns(&[
7784            Lsn::new(2, 1),
7785            Lsn::new(2, 2),
7786            Lsn::new(2, 3),
7787        ]);
7788        // Insert a new slot at index 1.
7789        rep.insert_shift(1, 4);
7790        rep.set(1, Lsn::new(2, 99), 4);
7791        assert_eq!(rep.get(0), Lsn::new(2, 1));
7792        assert_eq!(rep.get(1), Lsn::new(2, 99));
7793        assert_eq!(rep.get(2), Lsn::new(2, 2));
7794        assert_eq!(rep.get(3), Lsn::new(2, 3));
7795        // Remove slot 1.
7796        rep.remove_shift(1);
7797        assert_eq!(rep.get(0), Lsn::new(2, 1));
7798        assert_eq!(rep.get(1), Lsn::new(2, 2));
7799        assert_eq!(rep.get(2), Lsn::new(2, 3));
7800    }
7801
7802    #[test]
7803    fn test_empty_tree() {
7804        let tree = Tree::new(1, 128);
7805        assert!(tree.is_empty());
7806        assert_eq!(tree.get_database_id(), 1);
7807        assert_eq!(tree.get_root_splits(), 0);
7808    }
7809
7810    #[test]
7811    fn test_redo_insert_older_lsn_does_not_overwrite_newer_slot() {
7812        // REC-F2 reproduce-first: redo() must be idempotent w.r.t. slot
7813        // currency.  JE RecoveryManager.redo() (line ~2512/2544) only
7814        // replaces a slot when logrecLsn > treeLsn.  A later redo of an
7815        // OLDER committed LN for the same key must NOT revert the slot to
7816        // the older value or reset the slot LSN backward.
7817        let tree = Tree::new(1, 128);
7818        let key = b"k".to_vec();
7819
7820        // Install the newer version at LSN X (e.g. the BIN-logged value).
7821        let newer = Lsn::new(5, 500);
7822        tree.redo_insert(&key, b"new", newer).unwrap();
7823
7824        // Replay an OLDER committed LN at Y < X for the same key.
7825        let older = Lsn::new(2, 200);
7826        tree.redo_insert(&key, b"old", older).unwrap();
7827
7828        // The newer value and LSN must survive.
7829        let got = tree.search_with_data(&key).expect("key present");
7830        assert!(got.found);
7831        assert_eq!(
7832            got.data.as_deref(),
7833            Some(&b"new"[..]),
7834            "older-LSN redo reverted committed data"
7835        );
7836        assert_eq!(
7837            got.lsn,
7838            newer.as_u64(),
7839            "older-LSN redo reset slot LSN backward"
7840        );
7841
7842        // A redo at a strictly NEWER LSN must still replace (replace-only
7843        // when log_lsn > slot_lsn, matching JE lsnCmp > 0).
7844        let newest = Lsn::new(9, 900);
7845        tree.redo_insert(&key, b"newest", newest).unwrap();
7846        let got = tree.search_with_data(&key).expect("key present");
7847        assert_eq!(got.data.as_deref(), Some(&b"newest"[..]));
7848        assert_eq!(got.lsn, newest.as_u64());
7849    }
7850
7851    #[test]
7852    fn test_insert_single() {
7853        let tree = Tree::new(1, 128);
7854        let key = b"testkey".to_vec();
7855        let data = b"testdata".to_vec();
7856        let lsn = Lsn::new(1, 100);
7857
7858        let result = tree.insert(key.clone(), data, lsn);
7859        assert!(result.is_ok());
7860        assert!(result.unwrap()); // Should be a new insert
7861
7862        assert!(!tree.is_empty());
7863
7864        // Verify we can search for it
7865        let search_result = tree.search(&key);
7866        assert!(search_result.is_some());
7867        let sr = search_result.unwrap();
7868        assert!(sr.exact_parent_found || !sr.child_not_resident);
7869    }
7870
7871    #[test]
7872    fn test_insert_multiple() {
7873        let tree = Tree::new(1, 128);
7874
7875        let keys = vec![
7876            b"apple".to_vec(),
7877            b"banana".to_vec(),
7878            b"cherry".to_vec(),
7879            b"date".to_vec(),
7880        ];
7881
7882        for (i, key) in keys.iter().enumerate() {
7883            let data = format!("data{}", i).into_bytes();
7884            let lsn = Lsn::new(1, 100 + (i as u32) * 10);
7885            let result = tree.insert(key.clone(), data, lsn);
7886            assert!(result.is_ok());
7887            assert!(result.unwrap()); // All should be new inserts
7888        }
7889
7890        // Verify we can search for each
7891        for key in &keys {
7892            let search_result = tree.search(key);
7893            assert!(search_result.is_some());
7894        }
7895    }
7896
7897    #[test]
7898    fn test_insert_duplicate_key() {
7899        let tree = Tree::new(1, 128);
7900        let key = b"duplicate".to_vec();
7901        let data1 = b"first".to_vec();
7902        let data2 = b"second".to_vec();
7903        let lsn1 = Lsn::new(1, 100);
7904        let lsn2 = Lsn::new(1, 200);
7905
7906        // First insert
7907        let result1 = tree.insert(key.clone(), data1, lsn1);
7908        assert!(result1.is_ok());
7909        assert!(result1.unwrap()); // New insert
7910
7911        // Second insert with same key - should be update
7912        let result2 = tree.insert(key, data2, lsn2);
7913        assert!(result2.is_ok());
7914        assert!(!result2.unwrap()); // Update, not new insert
7915    }
7916
7917    #[test]
7918    fn test_search_empty_tree() {
7919        let tree = Tree::new(1, 128);
7920        let key = b"noexist".to_vec();
7921
7922        let result = tree.search(&key);
7923        assert!(result.is_none());
7924    }
7925
7926    #[test]
7927    fn test_first_and_last_node() {
7928        let tree = Tree::new(1, 128);
7929
7930        // Empty tree
7931        assert!(tree.get_first_node().is_none());
7932        assert!(tree.get_last_node().is_none());
7933
7934        // Insert some keys
7935        let keys = [b"a".to_vec(), b"b".to_vec(), b"c".to_vec()];
7936        for (i, key) in keys.iter().enumerate() {
7937            let data = format!("data{}", i).into_bytes();
7938            let lsn = Lsn::new(1, 100 + (i as u32) * 10);
7939            tree.insert(key.clone(), data, lsn).unwrap();
7940        }
7941
7942        // Now should have first and last
7943        let first = tree.get_first_node();
7944        assert!(first.is_some());
7945        assert_eq!(first.unwrap().index, 0);
7946
7947        let last = tree.get_last_node();
7948        assert!(last.is_some());
7949        assert_eq!(last.unwrap().index, 2);
7950    }
7951
7952    #[test]
7953    fn test_node_id_generation() {
7954        let id1 = generate_node_id();
7955        let id2 = generate_node_id();
7956        let id3 = generate_node_id();
7957
7958        assert!(id2 > id1);
7959        assert!(id3 > id2);
7960    }
7961
7962    #[test]
7963    fn test_tree_node_is_bin() {
7964        let bin = TreeNode::Bottom(BinStub {
7965            node_id: 1,
7966            level: BIN_LEVEL,
7967            entries: vec![],
7968            key_prefix: Vec::new(),
7969            dirty: false,
7970            is_delta: false,
7971            last_full_lsn: NULL_LSN,
7972            last_delta_lsn: NULL_LSN,
7973            generation: 0,
7974            parent: None,
7975            expiration_in_hours: true,
7976            cursor_count: 0,
7977            prohibit_next_delta: false,
7978            lsn_rep: LsnRep::Empty,
7979            keys: KeyRep::new(),
7980            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
7981        });
7982        assert!(bin.is_bin());
7983        assert_eq!(bin.level(), BIN_LEVEL);
7984
7985        let internal = TreeNode::Internal(InNodeStub {
7986            node_id: 2,
7987            level: MAIN_LEVEL + 2,
7988            entries: vec![],
7989            targets: TargetRep::None,
7990            dirty: false,
7991            generation: 0,
7992            parent: None,
7993            lsn_rep: LsnRep::Empty,
7994        });
7995        assert!(!internal.is_bin());
7996        assert_eq!(internal.level(), MAIN_LEVEL + 2);
7997    }
7998
7999    #[test]
8000    fn test_find_entry() {
8001        let mut entries = vec![];
8002        let mut keys = vec![];
8003        for i in 0..5 {
8004            entries.push(BinEntry {
8005                data: Some(vec![]),
8006                known_deleted: false,
8007                dirty: false,
8008                expiration_time: 0,
8009            });
8010            keys.push(format!("key{}", i).into_bytes());
8011        }
8012
8013        let bin = TreeNode::Bottom(BinStub {
8014            node_id: 1,
8015            level: BIN_LEVEL,
8016            entries,
8017            key_prefix: Vec::new(),
8018            dirty: false,
8019            is_delta: false,
8020            last_full_lsn: NULL_LSN,
8021            last_delta_lsn: NULL_LSN,
8022            generation: 0,
8023            parent: None,
8024            expiration_in_hours: true,
8025            cursor_count: 0,
8026            prohibit_next_delta: false,
8027            lsn_rep: LsnRep::Empty,
8028            keys: KeyRep::from_keys(keys),
8029            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8030        });
8031
8032        // Search for existing key
8033        let result = bin.find_entry(b"key2", false, true);
8034        assert_eq!(result & 0xFFFF, 2);
8035        assert_ne!(result & EXACT_MATCH, 0);
8036
8037        // Search for non-existing key with exact=false
8038        let result = bin.find_entry(b"key15", false, false);
8039        assert_eq!(result & 0xFFFF, 2); // Would go between key1 and key2
8040        assert_eq!(result & EXACT_MATCH, 0);
8041    }
8042
8043    #[test]
8044    fn test_insert_until_full() {
8045        // With splits implemented, inserting beyond max_entries_per_node must
8046        // succeed (the tree splits proactively rather than returning an error).
8047        let tree = Tree::new(1, 3); // Small max to exercise splits
8048
8049        // Insert up to max
8050        for i in 0..3 {
8051            let key = format!("key{}", i).into_bytes();
8052            let data = format!("data{}", i).into_bytes();
8053            let lsn = Lsn::new(1, 100 + i);
8054            let result = tree.insert(key, data, lsn);
8055            assert!(result.is_ok(), "insert {} should succeed", i);
8056        }
8057
8058        // The 4th insert triggers a split and must also succeed.
8059        let key = b"key3".to_vec();
8060        let data = b"data3".to_vec();
8061        let lsn = Lsn::new(1, 103);
8062        let result = tree.insert(key.clone(), data, lsn);
8063        assert!(
8064            result.is_ok(),
8065            "insert after full should trigger split and succeed"
8066        );
8067        assert!(result.unwrap(), "should be a new insert");
8068
8069        // The inserted key must be findable after the split.
8070        let sr = tree.search(&key);
8071        assert!(sr.is_some(), "key3 must be searchable after split");
8072        assert!(sr.unwrap().exact_parent_found, "key3 must be found exactly");
8073    }
8074
8075    #[test]
8076    fn test_memory_counter_balanced_on_insert_delete_f8() {
8077        use std::sync::Arc;
8078        use std::sync::atomic::{AtomicI64, Ordering};
8079        // F8 regression: insert accounts key+data+48; delete must subtract the
8080        // SAME, so an insert+delete of the same record returns the counter to
8081        // its starting value (previously delete omitted data_len -> the counter
8082        // leaked data_len per delete, biasing the evictor over-budget view).
8083        let mut tree = Tree::new(1, 16);
8084        let counter = Arc::new(AtomicI64::new(0));
8085        tree.set_memory_counter(Arc::clone(&counter));
8086
8087        let key = b"a-key".to_vec();
8088        let data = vec![0u8; 200]; // non-trivial data length
8089        tree.insert(key.clone(), data.clone(), Lsn::new(0, 10)).unwrap();
8090        let after_insert = counter.load(Ordering::Relaxed);
8091        assert!(after_insert > 0, "insert must increase the counter");
8092        assert_eq!(
8093            after_insert,
8094            (key.len() + data.len() + BIN_ENTRY_OVERHEAD) as i64,
8095            "insert accounts key + data + per-slot BinEntry overhead"
8096        );
8097
8098        let deleted = tree.delete(&key);
8099        assert!(deleted);
8100        assert_eq!(
8101            counter.load(Ordering::Relaxed),
8102            0,
8103            "F8: delete must subtract key + data + BIN_ENTRY_OVERHEAD, returning the counter              to its pre-insert value (no data_len leak)"
8104        );
8105    }
8106
    /// EV-13 (pass-post): a full-node detach must ACTUALLY drop the child
    /// `Arc` from the parent IN, not merely credit bytes.  Before the fix the
    /// evictor credited `node_size_fn(node_id)` and removed the node from the
    /// LRU list, but the parent's `InEntry.child` still held a strong `Arc`,
    /// so the node was never freed (phantom free) and the budget over-credited.
    ///
    /// This test proves: after `detach_node_by_id` the held child `Arc` is the
    /// LAST strong reference (strong_count == 1), the parent slot's `child` is
    /// `None`, and the returned bytes equal the node's measured heap size.
    ///
    /// The strong-count assertions depend on exactly which handles are alive
    /// at each point, so the scoping blocks below are load-bearing.
    ///
    /// JE ref: `IN.detachNode` (`setTarget(idx, null)`) / `Evictor.evict`.
    #[test]
    fn test_ev13_detach_actually_frees_child() {
        // Tiny fanout forces a root split so we get a real IN parent with BIN
        // children that the evictor would target.
        let tree = Tree::new(7, 4);
        for i in 0u8..12 {
            tree.insert(
                vec![b'a' + i],
                vec![i; 8],
                Lsn::new(1, u32::from(i) + 1),
            )
            .unwrap();
        }

        // Find a BIN child of the root IN (the eviction target) + its parent.
        // The read guards taken here are released at the end of this block,
        // before `detach_node_by_id` runs.
        let root = tree.get_root().expect("tree must have a root");
        let (parent_arc, child_idx, bin_id, expected_bytes) = {
            let rg = root.read();
            let TreeNode::Internal(n) = &*rg else {
                panic!("root must be an IN after split");
            };
            // Pick the first slot with a resident child (the match below
            // accepts either node kind when reading its id).
            let (idx, child) = n
                .first_resident_child()
                .expect("root must have a resident child");
            // Record the child's id and its measured size while it is still
            // attached; `expected_bytes` is the oracle for the detach result.
            let (id, bytes) = {
                let cg = child.read();
                (
                    match &*cg {
                        TreeNode::Bottom(b) => b.node_id,
                        TreeNode::Internal(n2) => n2.node_id,
                    },
                    cg.budgeted_memory_size(),
                )
            };
            (Arc::clone(&root), idx, id, bytes)
        };

        // Hold an external strong reference to the child so we can observe its
        // strong_count drop when detach releases the parent's reference.
        let child_arc = {
            let pg = parent_arc.read();
            let TreeNode::Internal(n) = &*pg else { unreachable!() };
            Arc::clone(n.child_ref(child_idx).unwrap())
        };
        // Two strong refs now: the parent slot + our test handle.
        assert_eq!(
            Arc::strong_count(&child_arc),
            2,
            "precondition: parent slot + test handle hold the child"
        );

        let freed = tree.detach_node_by_id(bin_id);

        // 1. Bytes credited equal the measured heap size (no phantom credit).
        assert_eq!(
            freed, expected_bytes,
            "detach must credit the node's real measured heap size"
        );
        // 2. The parent slot's child is now None (JE setTarget(idx, null)).
        {
            let pg = parent_arc.read();
            let TreeNode::Internal(n) = &*pg else { unreachable!() };
            assert!(
                n.child_is_none(child_idx),
                "EV-13: parent slot must be detached (child == None)"
            );
            // The slot itself (key + LSN) is retained for re-fetch.
            assert!(
                !n.get_lsn(child_idx).is_null(),
                "detach keeps the slot LSN so the node can be re-fetched"
            );
        }
        // 3. Our handle is now the ONLY strong reference -> the parent really
        //    dropped its Arc; the node is freed when we drop `child_arc`.
        //    Before EV-13 this would be 2 (parent still held it) = phantom free.
        assert_eq!(
            Arc::strong_count(&child_arc),
            1,
            "EV-13: detach must drop the parent's strong Arc (no phantom free)"
        );
    }
8200
8201    /// EV-13: detach must NOT decrement the memory counter itself (the evictor
8202    /// owns that bookkeeping via `Arbiter::release_memory`).  A double credit
8203    /// would drive `cache_usage` below reality.
8204    #[test]
8205    fn test_ev13_detach_does_not_touch_counter() {
8206        use std::sync::atomic::{AtomicI64, Ordering};
8207        let mut tree = Tree::new(8, 4);
8208        let counter = Arc::new(AtomicI64::new(0));
8209        tree.set_memory_counter(Arc::clone(&counter));
8210        for i in 0u8..12 {
8211            tree.insert(
8212                vec![b'a' + i],
8213                vec![i; 8],
8214                Lsn::new(1, u32::from(i) + 1),
8215            )
8216            .unwrap();
8217        }
8218        let before = counter.load(Ordering::Relaxed);
8219
8220        // Grab a BIN child id.
8221        let root = tree.get_root().unwrap();
8222        let bin_id = {
8223            let rg = root.read();
8224            let TreeNode::Internal(n) = &*rg else { unreachable!() };
8225            let child = n
8226                .resident_children()
8227                .into_iter()
8228                .next()
8229                .expect("resident child");
8230            match &*child.read() {
8231                TreeNode::Bottom(b) => b.node_id,
8232                TreeNode::Internal(n2) => n2.node_id,
8233            }
8234        };
8235
8236        let freed = tree.detach_node_by_id(bin_id);
8237        assert!(freed > 0, "detach must free a resident child");
8238        assert_eq!(
8239            counter.load(Ordering::Relaxed),
8240            before,
8241            "EV-13: detach must not change the counter (evictor credits once)"
8242        );
8243    }
8244
8245    /// EV-13: detaching the root or an unknown id is a no-op returning 0.
8246    #[test]
8247    fn test_ev13_detach_root_or_missing_is_noop() {
8248        let tree = Tree::new(9, 4);
8249        for i in 0u8..12 {
8250            tree.insert(
8251                vec![b'a' + i],
8252                vec![i; 8],
8253                Lsn::new(1, u32::from(i) + 1),
8254            )
8255            .unwrap();
8256        }
8257        let root_id = {
8258            let rg = tree.get_root().unwrap();
8259            let g = rg.read();
8260            match &*g {
8261                TreeNode::Internal(n) => n.node_id,
8262                TreeNode::Bottom(b) => b.node_id,
8263            }
8264        };
8265        assert_eq!(
8266            tree.detach_node_by_id(root_id),
8267            0,
8268            "root has no parent IN -> detach is a no-op"
8269        );
8270        assert_eq!(
8271            tree.detach_node_by_id(u64::MAX),
8272            0,
8273            "unknown node id -> detach is a no-op"
8274        );
8275    }
8276
8277    /// DBI-23 (pass-post): the live `memory_counter` must APPROXIMATE the real
8278    /// in-memory heap of the tree, not the old `key + data + 48` lower bound.
8279    ///
8280    /// JE keeps `inMemorySize` (`IN.getBudgetedMemorySize`) in lock-step with
8281    /// the per-node `computeMemorySize`; the over-budget arbiter sees the real
8282    /// figure so eviction fires at the right time.  The previous Noxu live
8283    /// path undercounted each BIN slot (48 vs the 64-byte `BinEntry` struct)
8284    /// and never accounted the node-struct fixed overhead, so the counter ran
8285    /// below real heap and the evictor under-fired.
8286    ///
8287    /// We assert the live counter is within tolerance of
8288    /// `total_budgeted_memory` (the authoritative walk-and-sum oracle).  The
8289    /// only gap is the per-node fixed struct overhead (BinStub/InNodeStub),
8290    /// which is a small fraction for non-trivial entries — the fix closes the
8291    /// dominant per-slot gap.
8292    #[test]
8293    fn test_dbi23_live_counter_approximates_real_heap() {
8294        use std::sync::atomic::{AtomicI64, Ordering};
8295        let mut tree = Tree::new(42, 32);
8296        let counter = Arc::new(AtomicI64::new(0));
8297        tree.set_memory_counter(Arc::clone(&counter));
8298
8299        // Insert N entries with realistic key+data sizes.
8300        let n = 400u32;
8301        for i in 0..n {
8302            let key = format!("key-{i:08}").into_bytes(); // 12 bytes
8303            let data = vec![0u8; 64]; // 64 bytes
8304            tree.insert(key, data, Lsn::new(1, i + 1)).unwrap();
8305        }
8306
8307        let live = counter.load(Ordering::Relaxed) as u64;
8308        let real = tree.total_budgeted_memory();
8309
8310        // The live counter must reflect the per-slot cost AFTER the T-2/T-3
8311        // compactions hoisted the per-slot key/LSN out of `BinEntry` into the
8312        // node-level reps.  The per-slot live charge is now
8313        // `key + data + size_of::<BinEntry>() + 4` (the packed LSN slot); the
8314        // dominant data+key bytes are still charged in full.  Assert the live
8315        // counter is at least the data-and-fixed portion (a stable floor that
8316        // does NOT assume the pre-compaction 64-byte slot).
8317        let new_lower_bound: u64 = (0..n)
8318            .map(|i| {
8319                let key_len = format!("key-{i:08}").len();
8320                (key_len + 64 + BIN_ENTRY_OVERHEAD) as u64
8321            })
8322            .sum();
8323
8324        assert!(
8325            live >= new_lower_bound,
8326            "DBI-23: live counter ({live}) must be >= the per-slot-correct \
8327             lower bound ({new_lower_bound})"
8328        );
8329
8330        // Within tolerance of real heap (the residual gap is the per-node
8331        // fixed struct overhead, intentionally not tracked incrementally).
8332        let lower = real * 80 / 100;
8333        assert!(
8334            live >= lower && live <= real,
8335            "DBI-23: live counter ({live}) must approximate real heap ({real}) \
8336             within tolerance [{lower}, {real}]"
8337        );
8338    }
8339
8340    #[test]
8341    fn test_delete_existing_key() {
8342        let tree = Tree::new(1, 128);
8343        let key = b"remove_me".to_vec();
8344        tree.insert(key.clone(), b"val".to_vec(), Lsn::new(1, 10)).unwrap();
8345        assert!(tree.delete(&key));
8346
8347        // After deletion the BIN is empty, so delete returns true the first
8348        // time and false the second time.
8349        assert!(!tree.delete(&key));
8350    }
8351
8352    #[test]
8353    fn test_delete_nonexistent_key() {
8354        let tree = Tree::new(1, 128);
8355        tree.insert(b"a".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
8356
8357        assert!(!tree.delete(b"zzz"));
8358    }
8359
8360    #[test]
8361    fn test_delete_empty_tree() {
8362        let tree = Tree::new(1, 128);
8363        assert!(!tree.delete(b"nothing"));
8364    }
8365
8366    #[test]
8367    fn test_delete_all_entries_makes_bin_empty() {
8368        let tree = Tree::new(1, 128);
8369        tree.insert(b"x".to_vec(), b"1".to_vec(), Lsn::new(1, 1)).unwrap();
8370        tree.insert(b"y".to_vec(), b"2".to_vec(), Lsn::new(1, 2)).unwrap();
8371
8372        assert!(tree.delete(b"x"));
8373        assert!(tree.delete(b"y"));
8374
8375        // Tree still has a root (empty BIN), so is_empty() returns false.
8376        assert!(!tree.is_empty());
8377        // get_first_node should return None for an empty BIN.
8378        assert!(tree.get_first_node().is_none());
8379    }
8380
8381    #[test]
8382    fn test_set_root_and_get_root() {
8383        let tree = Tree::new(1, 128);
8384        assert!(tree.get_root().is_none());
8385
8386        let bin = TreeNode::Bottom(BinStub {
8387            node_id: generate_node_id(),
8388            level: BIN_LEVEL,
8389            entries: vec![],
8390            key_prefix: Vec::new(),
8391            dirty: false,
8392            is_delta: false,
8393            last_full_lsn: NULL_LSN,
8394            last_delta_lsn: NULL_LSN,
8395            generation: 0,
8396            parent: None,
8397            expiration_in_hours: true,
8398            cursor_count: 0,
8399            prohibit_next_delta: false,
8400            lsn_rep: LsnRep::Empty,
8401            keys: KeyRep::new(),
8402            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8403        });
8404        tree.set_root(bin);
8405        assert!(tree.get_root().is_some());
8406    }
8407
8408    // ========================================================================
8409    // Split / multi-level insert tests  (new)
8410    // ========================================================================
8411
8412    /// inserting enough keys to fill the root IN causes
8413    /// the root IN itself to split, resulting in a tree with 3 or more levels.
8414    ///
8415    /// With max_entries_per_node = 4:
8416    ///   - Each BIN holds 4 entries before it is split.
8417    ///   - The root IN at level 2 holds up to 4 BIN children.
8418    ///   - Filling those 4 BINs (16 entries) and adding a 17th forces the
8419    ///     root IN to split, creating a level-3 root.
8420    #[test]
8421    fn test_insert_forces_root_split() {
8422        let tree = Tree::new(1, 4);
8423
8424        // 17 inserts with fanout 4 forces the root IN to split.
8425        for i in 0u32..20 {
8426            let key = format!("key{:04}", i).into_bytes();
8427            let data = format!("data{}", i).into_bytes();
8428            let lsn = Lsn::new(1, 100 + i);
8429            let r = tree.insert(key, data, lsn);
8430            assert!(r.is_ok(), "insert {} must succeed", i);
8431        }
8432
8433        // At least one root split must have occurred.
8434        assert!(
8435            tree.get_root_splits() > 0,
8436            "expected at least one root split after 20 inserts with fanout 4"
8437        );
8438
8439        // The root level must be > level-2 (i.e., the tree has grown to 3+ levels).
8440        let root_arc = tree.get_root().as_ref().unwrap().clone();
8441        let root_level = root_arc.read().level();
8442        let level_2 = MAIN_LEVEL | 2;
8443        assert!(
8444            root_level > level_2,
8445            "root level {} must be > level-2 after root split",
8446            root_level
8447        );
8448    }
8449
8450    /// Inserting 1000 keys in sorted order and verifying all are searchable.
8451    #[test]
8452    fn test_insert_many_keys() {
8453        let tree = Tree::new(1, 8);
8454        let n = 1000u32;
8455
8456        for i in 0..n {
8457            let key = format!("key{:08}", i).into_bytes();
8458            let data = format!("data{}", i).into_bytes();
8459            let lsn = Lsn::new(1, i);
8460            let r = tree.insert(key, data, lsn);
8461            assert!(r.is_ok(), "insert {} must succeed", i);
8462        }
8463
8464        // All keys must be findable.
8465        for i in 0..n {
8466            let key = format!("key{:08}", i).into_bytes();
8467            let sr = tree.search(&key);
8468            assert!(
8469                sr.is_some() && sr.unwrap().exact_parent_found,
8470                "key{:08} must be found after bulk insert",
8471                i
8472            );
8473        }
8474    }
8475
8476    /// Inserting 500 keys in pseudo-random (reverse) order and verifying all
8477    /// are searchable.
8478    #[test]
8479    fn test_insert_random_keys() {
8480        let tree = Tree::new(1, 8);
8481        let n = 500u32;
8482
8483        // Insert in reverse order as a simple non-sorted sequence.
8484        for i in (0..n).rev() {
8485            let key = format!("rkey{:08}", i).into_bytes();
8486            let data = format!("data{}", i).into_bytes();
8487            let lsn = Lsn::new(1, i);
8488            let r = tree.insert(key, data, lsn);
8489            assert!(r.is_ok(), "insert {} must succeed", i);
8490        }
8491
8492        for i in 0..n {
8493            let key = format!("rkey{:08}", i).into_bytes();
8494            let sr = tree.search(&key);
8495            assert!(
8496                sr.is_some() && sr.unwrap().exact_parent_found,
8497                "rkey{:08} must be found",
8498                i
8499            );
8500        }
8501    }
8502
8503    /// After any number of splits, every key inserted must still be findable.
8504    ///
8505    #[test]
8506    fn test_split_preserves_all_keys() {
8507        // Tiny fanout to maximise split frequency.
8508        let tree = Tree::new(1, 3);
8509        let n = 60u32;
8510
8511        let mut keys: Vec<Vec<u8>> = Vec::new();
8512        for i in 0..n {
8513            let key = format!("sk{:04}", i).into_bytes();
8514            keys.push(key.clone());
8515            let data = format!("d{}", i).into_bytes();
8516            let lsn = Lsn::new(1, i);
8517            let r = tree.insert(key, data, lsn);
8518            assert!(r.is_ok(), "insert {} must not fail", i);
8519        }
8520
8521        // After all inserts (and all the splits they induced), every key must
8522        // still be findable in the tree.
8523        for key in &keys {
8524            let sr = tree.search(key);
8525            assert!(
8526                sr.is_some() && sr.unwrap().exact_parent_found,
8527                "key {:?} must survive all splits",
8528                std::str::from_utf8(key).unwrap_or("?")
8529            );
8530        }
8531    }
8532
8533    /// The tree level (depth) must grow as keys are inserted and splits occur.
8534    #[test]
8535    fn test_tree_height_grows() {
8536        let tree = Tree::new(1, 4);
8537
8538        // With fanout 4, one level-2 root IN can hold 4 children.  After enough
8539        // inserts the root itself will split and a level-3 node will appear.
8540        // Insert enough keys to force the root to split at least once.
8541        let n = 40u32;
8542        for i in 0..n {
8543            let key = format!("hk{:08}", i).into_bytes();
8544            let data = format!("d{}", i).into_bytes();
8545            let lsn = Lsn::new(1, i);
8546            tree.insert(key, data, lsn).unwrap();
8547        }
8548
8549        // At least one root split must have occurred.
8550        assert!(
8551            tree.get_root_splits() > 0,
8552            "expected root to have split at least once for {} keys with fanout 4",
8553            n
8554        );
8555
8556        // The root level must be > level-2 (i.e., the tree has grown past two levels).
8557        let root_arc = tree.get_root().as_ref().unwrap().clone();
8558        let root_level = root_arc.read().level();
8559        let level_2 = MAIN_LEVEL | 2;
8560        assert!(
8561            root_level > level_2,
8562            "root level {} must be > {} after enough inserts",
8563            root_level,
8564            level_2
8565        );
8566    }
8567
8568    #[test]
8569    fn test_find_entry_on_internal_node() {
8570        let mut entries = vec![];
8571        for i in 0..4 {
8572            entries.push(InEntry { key: format!("k{}", i).into_bytes() });
8573        }
8574        let internal = TreeNode::Internal(InNodeStub {
8575            node_id: 1,
8576            level: MAIN_LEVEL + 2,
8577            entries,
8578            targets: TargetRep::None,
8579            dirty: false,
8580            generation: 0,
8581            parent: None,
8582            lsn_rep: LsnRep::Empty,
8583        });
8584
8585        // Exact match
8586        let r = internal.find_entry(b"k2", false, true);
8587        assert_ne!(r & EXACT_MATCH, 0);
8588        assert_eq!(r & 0xFFFF, 2);
8589
8590        // No exact match with exact=true
8591        let r = internal.find_entry(b"kx", false, true);
8592        assert_eq!(r, -1);
8593    }
8594
8595    // St-H5: non-exact `find_entry` on an Internal node must return the FLOOR
8596    // child slot (largest entry ≤ key), not the insertion point. Entries are
8597    // k0,k1,k2,k3; slot 0 is the leftmost child.
8598    #[test]
8599    fn test_find_entry_internal_nonexact_returns_floor() {
8600        let mut entries = vec![];
8601        for i in 0..4 {
8602            entries.push(InEntry { key: format!("k{}", i).into_bytes() });
8603        }
8604        let internal = TreeNode::Internal(InNodeStub {
8605            node_id: 1,
8606            level: MAIN_LEVEL + 2,
8607            entries,
8608            targets: TargetRep::None,
8609            dirty: false,
8610            generation: 0,
8611            parent: None,
8612            lsn_rep: LsnRep::Empty,
8613        });
8614
8615        // Key below every separator floors to slot 0 (leftmost child).
8616        assert_eq!(internal.find_entry(b"a", false, false) & 0xFFFF, 0);
8617        // Between k1 and k2 floors to k1 (slot 1).
8618        assert_eq!(internal.find_entry(b"k1x", false, false) & 0xFFFF, 1);
8619        // Above every separator floors to the last slot (k3 = slot 3).
8620        assert_eq!(internal.find_entry(b"zzz", false, false) & 0xFFFF, 3);
8621        // Exact match still reported as the exact slot.
8622        let r = internal.find_entry(b"k2", false, false);
8623        assert_ne!(r & EXACT_MATCH, 0);
8624        assert_eq!(r & 0xFFFF, 2);
8625    }
8626
8627    // ========================================================================
8628    // New tests: dirty tracking, generation, parent pointers, log size, stats
8629    // ========================================================================
8630
8631    /// After inserting into a tree, the BIN (and root IN) must be dirty.
8632    ///
8633    /// The: Tree.insertLN() calls bin.setDirty(true) after each insert.
8634    #[test]
8635    fn test_insert_marks_bin_dirty() {
8636        let tree = Tree::new(1, 128);
8637        tree.insert(b"key1".to_vec(), b"val1".to_vec(), Lsn::new(1, 1))
8638            .unwrap();
8639
8640        let root_arc = tree.get_root().as_ref().unwrap().clone();
8641        // root is an upper IN — its slot 0 child is the BIN.
8642        let bin_arc = {
8643            let g = root_arc.read();
8644            match &*g {
8645                TreeNode::Internal(n) => n.get_child(0).unwrap(),
8646                _ => panic!("expected Internal root"),
8647            }
8648        };
8649
8650        let bin_dirty = bin_arc.read().is_dirty();
8651        assert!(bin_dirty, "BIN must be dirty after insert");
8652    }
8653
8654    /// Updating an existing key keeps the BIN dirty.
8655    #[test]
8656    fn test_update_keeps_bin_dirty() {
8657        let tree = Tree::new(1, 128);
8658        tree.insert(b"k".to_vec(), b"v1".to_vec(), Lsn::new(1, 1)).unwrap();
8659        // second insert is an update
8660        tree.insert(b"k".to_vec(), b"v2".to_vec(), Lsn::new(1, 2)).unwrap();
8661
8662        let root_arc = tree.get_root().as_ref().unwrap().clone();
8663        let bin_arc = {
8664            let g = root_arc.read();
8665            match &*g {
8666                TreeNode::Internal(n) => n.get_child(0).unwrap(),
8667                _ => panic!("expected Internal root"),
8668            }
8669        };
8670
8671        assert!(bin_arc.read().is_dirty(), "BIN must be dirty after update");
8672    }
8673
8674    /// After deleting a key the BIN must be dirty.
8675    #[test]
8676    fn test_delete_marks_bin_dirty() {
8677        let tree = Tree::new(1, 128);
8678        tree.insert(b"del".to_vec(), b"val".to_vec(), Lsn::new(1, 1)).unwrap();
8679
8680        // Manually clear dirty flag to verify delete re-sets it.
8681        {
8682            let root_arc = tree.get_root().as_ref().unwrap().clone();
8683            let bin_arc = {
8684                let g = root_arc.read();
8685                match &*g {
8686                    TreeNode::Internal(n) => n.get_child(0).unwrap(),
8687                    _ => panic!("expected Internal root"),
8688                }
8689            };
8690            bin_arc.write().set_dirty(false);
8691            assert!(!bin_arc.read().is_dirty());
8692        }
8693
8694        tree.delete(b"del");
8695
8696        let root_arc = tree.get_root().as_ref().unwrap().clone();
8697        let bin_arc = {
8698            let g = root_arc.read();
8699            match &*g {
8700                TreeNode::Internal(n) => n.get_child(0).unwrap(),
8701                _ => panic!("expected Internal root"),
8702            }
8703        };
8704        assert!(bin_arc.read().is_dirty(), "BIN must be dirty after delete");
8705    }
8706
8707    /// BIN's parent pointer must point to the root IN.
8708    #[test]
8709    fn test_bin_parent_pointer_set_on_initial_insert() {
8710        let tree = Tree::new(1, 128);
8711        tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
8712
8713        let root_arc = tree.get_root().as_ref().unwrap().clone();
8714        let bin_arc = {
8715            let g = root_arc.read();
8716            match &*g {
8717                TreeNode::Internal(n) => n.get_child(0).unwrap(),
8718                _ => panic!("expected Internal root"),
8719            }
8720        };
8721
8722        let parent_weak = bin_arc.read().get_parent();
8723        assert!(parent_weak.is_some(), "BIN must have a parent pointer");
8724
8725        // Upgrading the weak pointer must give us the root arc.
8726        let parent_arc = parent_weak.unwrap().upgrade().unwrap();
8727        assert!(
8728            Arc::ptr_eq(&parent_arc, &root_arc),
8729            "BIN parent must be the root IN"
8730        );
8731    }
8732
8733    /// set_dirty / is_dirty round-trip on both variants.
8734    #[test]
8735    fn test_dirty_flag_roundtrip() {
8736        let mut bin_node = TreeNode::Bottom(BinStub {
8737            node_id: 1,
8738            level: BIN_LEVEL,
8739            entries: vec![],
8740            key_prefix: Vec::new(),
8741            dirty: false,
8742            is_delta: false,
8743            last_full_lsn: NULL_LSN,
8744            last_delta_lsn: NULL_LSN,
8745            generation: 0,
8746            parent: None,
8747            expiration_in_hours: true,
8748            cursor_count: 0,
8749            prohibit_next_delta: false,
8750            lsn_rep: LsnRep::Empty,
8751            keys: KeyRep::new(),
8752            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8753        });
8754        assert!(!bin_node.is_dirty());
8755        bin_node.set_dirty(true);
8756        assert!(bin_node.is_dirty());
8757        bin_node.set_dirty(false);
8758        assert!(!bin_node.is_dirty());
8759
8760        let mut in_node = TreeNode::Internal(InNodeStub {
8761            node_id: 2,
8762            level: MAIN_LEVEL | 2,
8763            entries: vec![],
8764            targets: TargetRep::None,
8765            dirty: false,
8766            generation: 0,
8767            parent: None,
8768            lsn_rep: LsnRep::Empty,
8769        });
8770        assert!(!in_node.is_dirty());
8771        in_node.set_dirty(true);
8772        assert!(in_node.is_dirty());
8773    }
8774
8775    /// set_generation / get_generation round-trip on both variants.
8776    #[test]
8777    fn test_generation_roundtrip() {
8778        let mut bin_node = TreeNode::Bottom(BinStub {
8779            node_id: 1,
8780            level: BIN_LEVEL,
8781            entries: vec![],
8782            key_prefix: Vec::new(),
8783            dirty: false,
8784            is_delta: false,
8785            last_full_lsn: NULL_LSN,
8786            last_delta_lsn: NULL_LSN,
8787            generation: 0,
8788            parent: None,
8789            expiration_in_hours: true,
8790            cursor_count: 0,
8791            prohibit_next_delta: false,
8792            lsn_rep: LsnRep::Empty,
8793            keys: KeyRep::new(),
8794            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8795        });
8796        assert_eq!(bin_node.get_generation(), 0);
8797        bin_node.set_generation(42);
8798        assert_eq!(bin_node.get_generation(), 42);
8799
8800        let mut in_node = TreeNode::Internal(InNodeStub {
8801            node_id: 2,
8802            level: MAIN_LEVEL | 2,
8803            entries: vec![],
8804            targets: TargetRep::None,
8805            dirty: false,
8806            generation: 0,
8807            parent: None,
8808            lsn_rep: LsnRep::Empty,
8809        });
8810        in_node.set_generation(99);
8811        assert_eq!(in_node.get_generation(), 99);
8812    }
8813
8814    /// log_size() must be consistent with write_to_bytes() length.
8815    #[test]
8816    fn test_log_size_matches_bytes_len() {
8817        // BIN stub with some entries.
8818        let bin_node = TreeNode::Bottom(BinStub {
8819            node_id: 7,
8820            level: BIN_LEVEL,
8821            entries: vec![
8822                BinEntry {
8823                    data: Some(b"d1".to_vec()),
8824                    known_deleted: false,
8825                    dirty: false,
8826                    expiration_time: 0,
8827                },
8828                BinEntry {
8829                    data: None,
8830                    known_deleted: false,
8831                    dirty: false,
8832                    expiration_time: 0,
8833                },
8834            ],
8835            key_prefix: Vec::new(),
8836            dirty: true,
8837            is_delta: false,
8838            last_full_lsn: NULL_LSN,
8839            last_delta_lsn: NULL_LSN,
8840            generation: 5,
8841            parent: None,
8842            expiration_in_hours: true,
8843            cursor_count: 0,
8844            prohibit_next_delta: false,
8845            lsn_rep: LsnRep::Empty,
8846            keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
8847            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8848        });
8849        assert_eq!(bin_node.log_size(), bin_node.write_to_bytes().len());
8850
8851        // IN stub with some entries.
8852        let in_node = TreeNode::Internal(InNodeStub {
8853            node_id: 8,
8854            level: MAIN_LEVEL | 2,
8855            entries: vec![
8856                InEntry { key: vec![] },
8857                InEntry { key: b"mid".to_vec() },
8858            ],
8859            targets: TargetRep::None,
8860            dirty: false,
8861            generation: 0,
8862            parent: None,
8863            lsn_rep: LsnRep::Empty,
8864        });
8865        assert_eq!(in_node.log_size(), in_node.write_to_bytes().len());
8866    }
8867
8868    /// write_to_bytes() output contains the node_id and dirty flag.
8869    #[test]
8870    fn test_write_to_bytes_encodes_node_id_and_dirty() {
8871        let node = TreeNode::Bottom(BinStub {
8872            node_id: 0xDEAD_BEEF_0000_0001,
8873            level: BIN_LEVEL,
8874            entries: vec![],
8875            key_prefix: Vec::new(),
8876            dirty: true,
8877            is_delta: false,
8878            last_full_lsn: NULL_LSN,
8879            last_delta_lsn: NULL_LSN,
8880            generation: 0,
8881            parent: None,
8882            expiration_in_hours: true,
8883            cursor_count: 0,
8884            prohibit_next_delta: false,
8885            lsn_rep: LsnRep::Empty,
8886            keys: KeyRep::new(),
8887            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8888        });
8889        let bytes = node.write_to_bytes();
8890        // First 8 bytes = node_id big-endian.
8891        let id_bytes = &bytes[0..8];
8892        assert_eq!(id_bytes, 0xDEAD_BEEF_0000_0001u64.to_be_bytes());
8893        // Byte at offset 16 (after node_id[8] + level[4] + n_entries[4]) = dirty flag.
8894        assert_eq!(bytes[16], 1u8, "dirty flag must be 1");
8895    }
8896
8897    /// log_size() grows as entries are added.
8898    #[test]
8899    fn test_log_size_grows_with_entries() {
8900        let empty = TreeNode::Bottom(BinStub {
8901            node_id: 1,
8902            level: BIN_LEVEL,
8903            entries: vec![],
8904            key_prefix: Vec::new(),
8905            dirty: false,
8906            is_delta: false,
8907            last_full_lsn: NULL_LSN,
8908            last_delta_lsn: NULL_LSN,
8909            generation: 0,
8910            parent: None,
8911            expiration_in_hours: true,
8912            cursor_count: 0,
8913            prohibit_next_delta: false,
8914            lsn_rep: LsnRep::Empty,
8915            keys: KeyRep::new(),
8916            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8917        });
8918        let with_entry = TreeNode::Bottom(BinStub {
8919            node_id: 2,
8920            level: BIN_LEVEL,
8921            entries: vec![BinEntry {
8922                data: None,
8923                known_deleted: false,
8924                dirty: false,
8925                expiration_time: 0,
8926            }],
8927            key_prefix: Vec::new(),
8928            dirty: false,
8929            is_delta: false,
8930            last_full_lsn: NULL_LSN,
8931            last_delta_lsn: NULL_LSN,
8932            generation: 0,
8933            parent: None,
8934            expiration_in_hours: true,
8935            cursor_count: 0,
8936            prohibit_next_delta: false,
8937            lsn_rep: LsnRep::Empty,
8938            keys: KeyRep::from_keys(vec![b"longkey_here".to_vec()]),
8939            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8940        });
8941        assert!(
8942            with_entry.log_size() > empty.log_size(),
8943            "log_size must grow when entries are added"
8944        );
8945    }
8946
8947    /// propagate_dirty_to_root() marks all ancestors dirty.
8948    #[test]
8949    fn test_propagate_dirty_to_root() {
8950        // Build a 2-level tree manually: root IN -> BIN.
8951        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
8952            node_id: generate_node_id(),
8953            level: BIN_LEVEL,
8954            entries: vec![],
8955            key_prefix: Vec::new(),
8956            dirty: false,
8957            is_delta: false,
8958            last_full_lsn: NULL_LSN,
8959            last_delta_lsn: NULL_LSN,
8960            generation: 0,
8961            parent: None, // set below
8962            expiration_in_hours: true,
8963            cursor_count: 0,
8964            prohibit_next_delta: false,
8965            lsn_rep: LsnRep::Empty,
8966            keys: KeyRep::new(),
8967            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
8968        })));
8969
8970        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
8971            node_id: generate_node_id(),
8972            level: MAIN_LEVEL | 2,
8973            entries: vec![InEntry { key: vec![] }],
8974            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
8975            dirty: false,
8976            generation: 0,
8977            parent: None,
8978            lsn_rep: LsnRep::Empty,
8979        })));
8980
8981        // Wire BIN's parent to root.
8982        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
8983
8984        // Root is not dirty before propagation.
8985        assert!(!root_arc.read().is_dirty());
8986
8987        // Propagate from the BIN up.
8988        Tree::propagate_dirty_to_root(&bin_arc);
8989
8990        // Root must now be dirty.
8991        assert!(
8992            root_arc.read().is_dirty(),
8993            "root must be dirty after propagate_dirty_to_root"
8994        );
8995    }
8996
8997    /// collect_stats() on an empty tree returns all-zero stats.
8998    #[test]
8999    fn test_collect_stats_empty_tree() {
9000        let tree = Tree::new(1, 128);
9001        let stats = tree.collect_stats();
9002        assert_eq!(stats, TreeStats::default());
9003    }
9004
9005    /// collect_stats() on a single-entry tree: 1 IN + 1 BIN, height 2.
9006    #[test]
9007    fn test_collect_stats_single_insert() {
9008        let tree = Tree::new(1, 128);
9009        tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
9010        let stats = tree.collect_stats();
9011        assert_eq!(stats.n_bins, 1, "must have 1 BIN");
9012        assert_eq!(stats.n_ins, 1, "must have 1 upper IN");
9013        assert_eq!(stats.height, 2, "single-entry tree has height 2");
9014        assert!(stats.n_entries >= 1, "must have at least 1 entry total");
9015    }
9016
9017    /// collect_stats() with many inserts: entry count matches insert count.
9018    #[test]
9019    fn test_collect_stats_many_inserts() {
9020        let tree = Tree::new(1, 8);
9021        let n = 50u32;
9022        for i in 0..n {
9023            let key = format!("sk{:04}", i).into_bytes();
9024            tree.insert(key, b"v".to_vec(), Lsn::new(1, i)).unwrap();
9025        }
9026        let stats = tree.collect_stats();
9027        // All n entries should be accounted for across all BINs.
9028        // n_entries counts entries in both INs and BINs; BIN entries = n.
9029        // We verify BIN entry total equals n by summing manually.
9030        let bin_entries: u64 = stats.n_entries - stats.n_ins; // rough check
9031        // A more precise assertion: the sum of all BIN entries == n.
9032        // Since we can't easily separate, just assert the tree is non-trivial.
9033        assert!(stats.n_bins > 0, "must have at least one BIN");
9034        assert!(stats.height >= 2, "multi-entry tree has height >= 2");
9035        // Total entries in the tree must be >= n (BIN entries alone).
9036        assert!(
9037            bin_entries >= n as u64 || stats.n_entries >= n as u64,
9038            "entry count must account for all inserts"
9039        );
9040    }
9041
9042    // ========================================================================
9043    // Tests: B-tree merge / compress
9044    // ========================================================================
9045
9046    /// After deleting most keys from a tree, compress() must reduce the BIN
9047    /// count by merging under-full siblings.
9048    ///
9049    /// Strategy: build a large tree (many BINs), delete almost all keys,
9050    /// then verify compress() reduces n_bins and all surviving keys remain
9051    /// findable.  We do not hard-code the exact BIN counts because the
9052    /// preemptive splitting strategy determines the exact split points.
9053    #[test]
9054    fn test_compress_merges_underfull_bins() {
9055        let tree = Tree::new(1, 8);
9056
9057        // Insert 64 sorted keys to build a multi-BIN tree.
9058        let n = 64u32;
9059        let keys: Vec<Vec<u8>> =
9060            (0..n).map(|i| format!("cm{:04}", i).into_bytes()).collect();
9061        for (i, key) in keys.iter().enumerate() {
9062            tree.insert(key.clone(), vec![i as u8], Lsn::new(1, i as u32))
9063                .unwrap();
9064        }
9065
9066        let stats_full = tree.collect_stats();
9067        assert!(
9068            stats_full.n_bins >= 2,
9069            "must have multiple BINs after 64 inserts"
9070        );
9071
9072        // Delete all but 4 widely-spaced keys (one roughly per BIN pair).
9073        // We keep every 16th key: k0000, k0016, k0032, k0048.
9074        let keep: std::collections::HashSet<u32> =
9075            [0, 16, 32, 48].iter().cloned().collect();
9076        for i in 0..n {
9077            if !keep.contains(&i) {
9078                let key = format!("cm{:04}", i).into_bytes();
9079                tree.delete(&key);
9080            }
9081        }
9082
9083        let stats_sparse = tree.collect_stats();
9084        assert!(
9085            stats_sparse.n_bins >= 2,
9086            "should still have multiple BINs before compress"
9087        );
9088
9089        // compress() must reduce BIN count since most BINs now hold 0–1 entries.
9090        tree.compress();
9091
9092        let stats_after = tree.collect_stats();
9093        assert!(
9094            stats_after.n_bins < stats_sparse.n_bins,
9095            "compress must reduce BIN count (was {}, now {})",
9096            stats_sparse.n_bins,
9097            stats_after.n_bins
9098        );
9099
9100        // Surviving keys must still be findable.
9101        for i in keep {
9102            let key = format!("cm{:04}", i).into_bytes();
9103            let sr = tree.search(&key);
9104            assert!(
9105                sr.is_some() && sr.unwrap().exact_parent_found,
9106                "key cm{:04} must survive compress",
9107                i
9108            );
9109        }
9110    }
9111
9112    /// compress() preserves all entries: a full-BIN tree has fewer merges
9113    /// but all keys remain accessible.
9114    #[test]
9115    fn test_compress_no_op_when_full() {
9116        // Insert exactly max_entries worth of keys into a single BIN — no split
9117        // will have occurred yet, and the BINs will all be reasonably full.
9118        // We can't prevent splits entirely (preemptive), but we can verify that
9119        // compress() never loses entries.
9120        let tree = Tree::new(1, 8);
9121        let n = 32u32;
9122        for i in 0..n {
9123            let key = format!("fn{:04}", i).into_bytes();
9124            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9125        }
9126
9127        let stats_before = tree.collect_stats();
9128        tree.compress();
9129        let stats_after = tree.collect_stats();
9130
9131        // All keys still findable.
9132        for i in 0..n {
9133            let key = format!("fn{:04}", i).into_bytes();
9134            let sr = tree.search(&key);
9135            assert!(
9136                sr.is_some() && sr.unwrap().exact_parent_found,
9137                "key fn{:04} must be findable after compress",
9138                i
9139            );
9140        }
9141
9142        // BIN count must not increase.
9143        assert!(
9144            stats_after.n_bins <= stats_before.n_bins,
9145            "compress must not increase BIN count"
9146        );
9147    }
9148
9149    /// compress() on an empty tree must not panic.
9150    #[test]
9151    fn test_compress_empty_tree() {
9152        let tree = Tree::new(1, 4);
9153        tree.compress(); // must not panic
9154    }
9155
9156    /// After deleting all entries, compress() reduces BINs to 1.
9157    #[test]
9158    fn test_compress_removes_empty_bin_from_parent() {
9159        let tree = Tree::new(1, 4);
9160        // Insert enough keys to generate multiple BINs.
9161        let n = 16u32;
9162        for i in 0..n {
9163            let key = format!("ep{:04}", i).into_bytes();
9164            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9165        }
9166
9167        let stats_before = tree.collect_stats();
9168        assert!(stats_before.n_bins >= 2, "need multiple BINs for this test");
9169
9170        // Delete everything except the very last key.
9171        for i in 0..n - 1 {
9172            let key = format!("ep{:04}", i).into_bytes();
9173            tree.delete(&key);
9174        }
9175
9176        tree.compress();
9177
9178        let stats_after = tree.collect_stats();
9179        assert!(
9180            stats_after.n_bins < stats_before.n_bins,
9181            "compress must reduce BIN count after mass deletion"
9182        );
9183
9184        // The surviving key must still be findable.
9185        let last_key = format!("ep{:04}", n - 1).into_bytes();
9186        let sr = tree.search(&last_key);
9187        assert!(
9188            sr.is_some() && sr.unwrap().exact_parent_found,
9189            "last key must survive after compress"
9190        );
9191    }
9192
9193    // ========================================================================
9194    // IC-1: prune_empty_bin must NOT remove a live entry when the BIN was
9195    // repopulated between the compressor observing it empty and the prune.
9196    // (Tree corruption / lost-write regression test.)
9197    // ========================================================================
9198
9199    /// Find a BIN arc that is currently empty (0 entries) and is NOT the
9200    /// root, returning it together with the `id_key` the compressor would
9201    /// have captured (here we just use any key that routes to that BIN).
9202    fn first_empty_non_root_bin(tree: &Tree) -> Option<Arc<RwLock<TreeNode>>> {
9203        let root = tree.get_root()?;
9204        for node in tree.rebuild_in_list() {
9205            if Arc::ptr_eq(&node, &root) {
9206                continue; // skip root (single-BIN tree is never pruned)
9207            }
9208            let is_empty_bin = {
9209                let g = node.read();
9210                matches!(&*g, TreeNode::Bottom(b) if b.entries.is_empty())
9211            };
9212            if is_empty_bin {
9213                return Some(node);
9214            }
9215        }
9216        None
9217    }
9218
9219    /// IC-1 (fail-pre / pass-post): the old `compress_bin` prune step called
9220    /// `self.delete(&id_key)`, which re-descends by key.  If a concurrent
9221    /// insert repopulated the empty BIN with a LIVE entry under that same
9222    /// `id_key`, `self.delete` would silently remove the live entry — a lost
9223    /// write.  `prune_empty_bin` re-validates `n_entries == 0` under the
9224    /// parent latch and must REMOVE NOTHING when the BIN is non-empty.
9225    ///
9226    /// JE `Tree.delete` / `searchDeletableSubTree` (Tree.java ~line 755-800):
9227    /// `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort prune).
9228    #[test]
9229    fn test_ic1_prune_empty_bin_aborts_when_repopulated() {
9230        let tree = Tree::new(1, 4);
9231        let n = 16u32;
9232        for i in 0..n {
9233            let key = format!("ic{:04}", i).into_bytes();
9234            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9235        }
9236        assert!(
9237            tree.collect_stats().n_bins >= 2,
9238            "need multiple BINs for this test"
9239        );
9240
9241        // Empty out one whole BIN by deleting every key it holds.  We delete
9242        // the lowest 4 keys (ic0000..ic0003) which share the first BIN, then
9243        // physically compress it so it has 0 entries.
9244        for i in 0..4 {
9245            let key = format!("ic{:04}", i).into_bytes();
9246            tree.delete(&key);
9247        }
9248
9249        // Locate the now-empty BIN and the id_key the compressor would use.
9250        let empty_bin = match first_empty_non_root_bin(&tree) {
9251            Some(b) => b,
9252            // If the layout didn't leave an isolated empty BIN, the scenario
9253            // isn't reproducible on this build; treat as vacuously passing.
9254            None => return,
9255        };
9256
9257        // SIMULATE THE RACE: a concurrent insert repopulates the empty BIN
9258        // with a LIVE entry *before* the prune runs.  We insert directly into
9259        // the BIN arc to model the insert that lands after `now_empty` was
9260        // read.  Pick a key that routes to this BIN.
9261        let live_key = format!("ic{:04}", 1).into_bytes(); // was deleted above
9262        {
9263            let mut g = empty_bin.write();
9264            if let TreeNode::Bottom(b) = &mut *g {
9265                // T-2/T-3: route through the insert helper so entries/keys/
9266                // lsn_rep stay in lock step.
9267                b.insert_with_prefix(
9268                    live_key.clone(),
9269                    Lsn::new(1, 1),
9270                    Some(vec![0xAB]),
9271                );
9272            }
9273        }
9274        let id_key = {
9275            let g = empty_bin.read();
9276            match &*g {
9277                TreeNode::Bottom(b) => b.get_full_key(0).unwrap(),
9278                _ => unreachable!(),
9279            }
9280        };
9281
9282        // Prune must ABORT (return false) because the BIN is no longer empty,
9283        // and must NOT remove the live entry.
9284        let pruned = tree.prune_empty_bin(&id_key);
9285        assert!(!pruned, "IC-1: prune must abort when the BIN was repopulated");
9286
9287        // The live entry must still be present in the BIN.
9288        let still_there = {
9289            let g = empty_bin.read();
9290            match &*g {
9291                TreeNode::Bottom(b) => {
9292                    b.entries.iter().enumerate().any(|(i, _)| {
9293                        b.key_prefix.is_empty() && b.get_key(i) == live_key
9294                    })
9295                }
9296                _ => false,
9297            }
9298        };
9299        assert!(
9300            still_there,
9301            "IC-1: prune must not remove the repopulated live entry"
9302        );
9303    }
9304
9305    /// IC-1 companion: prune_empty_bin must abort when a cursor is parked on
9306    /// the (still-empty) BIN.  JE: `bin.nCursors() > 0` → CURSORS_EXIST.
9307    #[test]
9308    fn test_ic1_prune_empty_bin_aborts_with_cursor() {
9309        let tree = Tree::new(1, 4);
9310        for i in 0..16u32 {
9311            let key = format!("cu{:04}", i).into_bytes();
9312            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9313        }
9314        for i in 0..4 {
9315            let key = format!("cu{:04}", i).into_bytes();
9316            tree.delete(&key);
9317        }
9318        let empty_bin = match first_empty_non_root_bin(&tree) {
9319            Some(b) => b,
9320            None => return,
9321        };
9322        // Park a cursor on the empty BIN.
9323        Tree::pin_bin(&empty_bin);
9324        // id_key: any key routing to this BIN. Use the first deleted key.
9325        let id_key = format!("cu{:04}", 0).into_bytes();
9326        let pruned = tree.prune_empty_bin(&id_key);
9327        assert!(
9328            !pruned,
9329            "IC-1: prune must abort when a cursor is parked on the BIN"
9330        );
9331        Tree::unpin_bin(&empty_bin);
9332    }
9333
9334    /// IC-1 happy path: prune_empty_bin removes the parent slot when the BIN
9335    /// really is empty, no cursors, not a delta.
9336    #[test]
9337    fn test_ic1_prune_empty_bin_succeeds_when_truly_empty() {
9338        let tree = Tree::new(1, 4);
9339        for i in 0..16u32 {
9340            let key = format!("ok{:04}", i).into_bytes();
9341            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9342        }
9343        for i in 0..4 {
9344            let key = format!("ok{:04}", i).into_bytes();
9345            tree.delete(&key);
9346        }
9347        let bins_before = tree.collect_stats().n_bins;
9348        let empty_bin = match first_empty_non_root_bin(&tree) {
9349            Some(b) => b,
9350            None => return,
9351        };
9352        // id_key: a key that routes to this empty BIN (one of the deleted).
9353        let id_key = {
9354            // route by the lowest deleted key; it falls into the leftmost BIN.
9355            let _ = &empty_bin;
9356            format!("ok{:04}", 0).into_bytes()
9357        };
9358        let pruned = tree.prune_empty_bin(&id_key);
9359        assert!(pruned, "IC-1: prune must succeed on a truly empty BIN");
9360        let bins_after = tree.collect_stats().n_bins;
9361        assert!(
9362            bins_after < bins_before,
9363            "IC-1: pruned BIN slot must be removed from the parent (was {}, now {})",
9364            bins_before,
9365            bins_after
9366        );
9367        // Every surviving key must still be findable.
9368        for i in 4..16u32 {
9369            let key = format!("ok{:04}", i).into_bytes();
9370            assert!(
9371                tree.search(&key).is_some_and(|s| s.exact_parent_found),
9372                "surviving key ok{:04} must remain after prune",
9373                i
9374            );
9375        }
9376    }
9377
9378    // ========================================================================
9379    // Tests: latch-coupling validation (validate_parent_child /
9380    //        search_with_coupling)
9381    // ========================================================================
9382
9383    /// validate_parent_child returns true when the parent slot points at the
9384    /// expected child.
9385    #[test]
9386    fn test_validate_parent_child_correct_link() {
9387        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9388            node_id: generate_node_id(),
9389            level: BIN_LEVEL,
9390            entries: vec![],
9391            key_prefix: Vec::new(),
9392            dirty: false,
9393            is_delta: false,
9394            last_full_lsn: NULL_LSN,
9395            last_delta_lsn: NULL_LSN,
9396            generation: 0,
9397            parent: None,
9398            expiration_in_hours: true,
9399            cursor_count: 0,
9400            prohibit_next_delta: false,
9401            lsn_rep: LsnRep::Empty,
9402            keys: KeyRep::new(),
9403            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9404        })));
9405
9406        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
9407            node_id: generate_node_id(),
9408            level: MAIN_LEVEL | 2,
9409            entries: vec![InEntry { key: vec![] }],
9410            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
9411            dirty: false,
9412            generation: 0,
9413            parent: None,
9414            lsn_rep: LsnRep::Empty,
9415        })));
9416
9417        assert!(
9418            Tree::validate_parent_child(&root_arc, 0, &bin_arc),
9419            "link must be valid when parent slot 0 points at bin_arc"
9420        );
9421    }
9422
9423    /// validate_parent_child returns false when the slot index is out of range.
9424    #[test]
9425    fn test_validate_parent_child_out_of_range() {
9426        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
9427            node_id: generate_node_id(),
9428            level: MAIN_LEVEL | 2,
9429            entries: vec![],
9430            targets: TargetRep::None,
9431            dirty: false,
9432            generation: 0,
9433            parent: None,
9434            lsn_rep: LsnRep::Empty,
9435        })));
9436        let other_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9437            node_id: generate_node_id(),
9438            level: BIN_LEVEL,
9439            entries: vec![],
9440            key_prefix: Vec::new(),
9441            dirty: false,
9442            is_delta: false,
9443            last_full_lsn: NULL_LSN,
9444            last_delta_lsn: NULL_LSN,
9445            generation: 0,
9446            parent: None,
9447            expiration_in_hours: true,
9448            cursor_count: 0,
9449            prohibit_next_delta: false,
9450            lsn_rep: LsnRep::Empty,
9451            keys: KeyRep::new(),
9452            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9453        })));
9454
9455        assert!(
9456            !Tree::validate_parent_child(&root_arc, 0, &other_arc),
9457            "link must be invalid when parent has no entries"
9458        );
9459    }
9460
9461    /// validate_parent_child returns false when the slot points at a different Arc.
9462    #[test]
9463    fn test_validate_parent_child_wrong_child() {
9464        let bin_a = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9465            node_id: generate_node_id(),
9466            level: BIN_LEVEL,
9467            entries: vec![],
9468            key_prefix: Vec::new(),
9469            dirty: false,
9470            is_delta: false,
9471            last_full_lsn: NULL_LSN,
9472            last_delta_lsn: NULL_LSN,
9473            generation: 0,
9474            parent: None,
9475            expiration_in_hours: true,
9476            cursor_count: 0,
9477            prohibit_next_delta: false,
9478            lsn_rep: LsnRep::Empty,
9479            keys: KeyRep::new(),
9480            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9481        })));
9482        let bin_b = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9483            node_id: generate_node_id(),
9484            level: BIN_LEVEL,
9485            entries: vec![],
9486            key_prefix: Vec::new(),
9487            dirty: false,
9488            is_delta: false,
9489            last_full_lsn: NULL_LSN,
9490            last_delta_lsn: NULL_LSN,
9491            generation: 0,
9492            parent: None,
9493            expiration_in_hours: true,
9494            cursor_count: 0,
9495            prohibit_next_delta: false,
9496            lsn_rep: LsnRep::Empty,
9497            keys: KeyRep::new(),
9498            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9499        })));
9500
9501        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
9502            node_id: generate_node_id(),
9503            level: MAIN_LEVEL | 2,
9504            entries: vec![InEntry { key: vec![] }],
9505            targets: TargetRep::Sparse(vec![(0, bin_a)]),
9506            dirty: false,
9507            generation: 0,
9508            parent: None,
9509            lsn_rep: LsnRep::Empty,
9510        })));
9511
9512        assert!(
9513            !Tree::validate_parent_child(&root_arc, 0, &bin_b),
9514            "link must be invalid when parent slot points at a different Arc"
9515        );
9516    }
9517
9518    /// search_with_coupling finds the same key as search().
9519    #[test]
9520    fn test_search_with_coupling_finds_existing_key() {
9521        let tree = Tree::new(1, 8);
9522        for i in 0u32..20 {
9523            let key = format!("c{:04}", i).into_bytes();
9524            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9525        }
9526
9527        for i in 0u32..20 {
9528            let key = format!("c{:04}", i).into_bytes();
9529            let sr = tree.search_with_coupling(&key);
9530            assert!(
9531                sr.is_some() && sr.unwrap().exact_parent_found,
9532                "search_with_coupling must find c{:04}",
9533                i
9534            );
9535        }
9536    }
9537
9538    /// search_with_coupling returns false for a key not in the tree.
9539    #[test]
9540    fn test_search_with_coupling_missing_key() {
9541        let tree = Tree::new(1, 8);
9542        tree.insert(b"hello".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
9543
9544        let sr = tree.search_with_coupling(b"zzz");
9545        // The search result must either be None or have exact_parent_found=false.
9546        assert!(
9547            sr.is_none_or(|r| !r.exact_parent_found),
9548            "search_with_coupling must not find a key that was never inserted"
9549        );
9550    }
9551
9552    /// search_with_coupling on an empty tree returns None.
9553    #[test]
9554    fn test_search_with_coupling_empty_tree() {
9555        let tree = Tree::new(1, 8);
9556        assert!(tree.search_with_coupling(b"k").is_none());
9557    }
9558
9559    // ========================================================================
9560    // Tests: BIN-delta reconstitution (apply_delta_to_bin / mutate_to_full_bin)
9561    // ========================================================================
9562
9563    /// apply_delta_to_bin replaces existing entries and inserts new ones.
9564    ///
9565    /// BIN.applyDelta(): delta entries are authoritative and
9566    /// supersede full-BIN entries at the same key.
9567    #[test]
9568    fn test_apply_delta_to_bin_updates_and_inserts() {
9569        let mut base = BinStub {
9570            node_id: 1,
9571            level: BIN_LEVEL,
9572            entries: vec![
9573                BinEntry {
9574                    data: Some(b"old_a".to_vec()),
9575                    known_deleted: false,
9576                    dirty: false,
9577                    expiration_time: 0,
9578                },
9579                BinEntry {
9580                    data: Some(b"old_c".to_vec()),
9581                    known_deleted: false,
9582                    dirty: false,
9583                    expiration_time: 0,
9584                },
9585            ],
9586            key_prefix: Vec::new(),
9587            dirty: false,
9588            is_delta: false,
9589            last_full_lsn: NULL_LSN,
9590            last_delta_lsn: NULL_LSN,
9591            generation: 0,
9592            parent: None,
9593            expiration_in_hours: true,
9594            cursor_count: 0,
9595            prohibit_next_delta: false,
9596            lsn_rep: LsnRep::Empty,
9597            keys: KeyRep::from_keys(vec![b"a".to_vec(), b"c".to_vec()]),
9598            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9599        };
9600
9601        let delta_entries = vec![
9602            // Update existing key "a" with new data.
9603            (b"a".to_vec(), Lsn::new(1, 10), Some(b"new_a".to_vec())),
9604            // Insert new key "b".
9605            (b"b".to_vec(), Lsn::new(1, 20), Some(b"new_b".to_vec())),
9606        ];
9607
9608        Tree::apply_delta_to_bin(&mut base, delta_entries);
9609
9610        assert!(base.dirty, "base must be dirty after applying delta");
9611
9612        // Collect the full keys for assertions (T-2: keys live in the rep).
9613        let full_keys: Vec<Vec<u8>> = (0..base.entries.len())
9614            .map(|i| base.get_full_key(i).unwrap_or_default())
9615            .collect();
9616
9617        // "a" must be updated.
9618        let a_idx = full_keys.iter().position(|k| k == b"a").unwrap();
9619        assert_eq!(
9620            base.entries[a_idx].data.as_deref(),
9621            Some(b"new_a" as &[u8])
9622        );
9623
9624        // "b" must be newly inserted.
9625        assert!(full_keys.iter().any(|k| k == b"b"));
9626
9627        // "c" must still be present (untouched).
9628        assert!(full_keys.iter().any(|k| k == b"c"));
9629
9630        // Entries must be in sorted order.
9631        let mut sorted = full_keys.clone();
9632        sorted.sort();
9633        assert_eq!(
9634            full_keys, sorted,
9635            "entries must remain sorted after delta apply"
9636        );
9637    }
9638
9639    /// apply_delta_to_bin with an empty delta is a no-op (except dirty flag).
9640    #[test]
9641    fn test_apply_delta_to_bin_empty_delta() {
9642        let mut base = BinStub {
9643            node_id: 1,
9644            level: BIN_LEVEL,
9645            entries: vec![BinEntry {
9646                data: None,
9647                known_deleted: false,
9648                dirty: false,
9649                expiration_time: 0,
9650            }],
9651            key_prefix: Vec::new(),
9652            dirty: false,
9653            is_delta: false,
9654            last_full_lsn: NULL_LSN,
9655            last_delta_lsn: NULL_LSN,
9656            generation: 0,
9657            parent: None,
9658            expiration_in_hours: true,
9659            cursor_count: 0,
9660            prohibit_next_delta: false,
9661            lsn_rep: LsnRep::Empty,
9662            keys: KeyRep::from_keys(vec![b"x".to_vec()]),
9663            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9664        };
9665        let n_before = base.entries.len();
9666        Tree::apply_delta_to_bin(&mut base, vec![]);
9667        assert_eq!(
9668            base.entries.len(),
9669            n_before,
9670            "empty delta must not change entry count"
9671        );
9672        assert!(base.dirty, "dirty must be set even for empty delta apply");
9673    }
9674
9675    /// mutate_to_full_bin reconstitutes a full BIN from a delta + base.
9676    ///
9677    /// BIN.mutateToFullBIN(BIN fullBIN): after mutation the
9678    /// `is_delta` flag must be cleared and the entries must contain both
9679    /// base and delta data.
9680    #[test]
9681    fn test_mutate_to_full_bin_merges_delta_and_base() {
9682        let base = BinStub {
9683            node_id: 2,
9684            level: BIN_LEVEL,
9685            entries: vec![
9686                BinEntry {
9687                    data: Some(b"base_aa".to_vec()),
9688                    known_deleted: false,
9689                    dirty: false,
9690                    expiration_time: 0,
9691                },
9692                BinEntry {
9693                    data: Some(b"base_cc".to_vec()),
9694                    known_deleted: false,
9695                    dirty: false,
9696                    expiration_time: 0,
9697                },
9698            ],
9699            key_prefix: Vec::new(),
9700            dirty: false,
9701            is_delta: false,
9702            last_full_lsn: NULL_LSN,
9703            last_delta_lsn: NULL_LSN,
9704            generation: 0,
9705            parent: None,
9706            expiration_in_hours: true,
9707            cursor_count: 0,
9708            prohibit_next_delta: false,
9709            lsn_rep: LsnRep::Empty,
9710            keys: KeyRep::from_keys(vec![b"aa".to_vec(), b"cc".to_vec()]),
9711            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9712        };
9713
9714        // The delta has a new entry "bb" and overwrites "aa".
9715        let mut delta = BinStub {
9716            node_id: 2,
9717            level: BIN_LEVEL,
9718            entries: vec![
9719                BinEntry {
9720                    data: Some(b"delta_aa".to_vec()),
9721                    known_deleted: false,
9722                    dirty: false,
9723                    expiration_time: 0,
9724                },
9725                BinEntry {
9726                    data: Some(b"delta_bb".to_vec()),
9727                    known_deleted: false,
9728                    dirty: false,
9729                    expiration_time: 0,
9730                },
9731            ],
9732            key_prefix: Vec::new(),
9733            dirty: true,
9734            is_delta: true,
9735            last_full_lsn: NULL_LSN,
9736            last_delta_lsn: NULL_LSN,
9737            generation: 0,
9738            parent: None,
9739            expiration_in_hours: true,
9740            cursor_count: 0,
9741            prohibit_next_delta: false,
9742            lsn_rep: LsnRep::Empty,
9743            keys: KeyRep::from_keys(vec![b"aa".to_vec(), b"bb".to_vec()]),
9744            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9745        };
9746
9747        Tree::mutate_to_full_bin(&mut delta, base);
9748
9749        // After mutation the node must be a full BIN.
9750        assert!(
9751            !delta.is_delta,
9752            "is_delta must be false after mutate_to_full_bin"
9753        );
9754        assert!(delta.dirty, "must be dirty after mutation");
9755
9756        // Collect full keys for assertions (T-2: keys live in the rep).
9757        let dk: Vec<Vec<u8>> = (0..delta.entries.len())
9758            .map(|i| delta.get_full_key(i).unwrap_or_default())
9759            .collect();
9760
9761        // "aa" must be the delta version.
9762        let aa_idx = dk.iter().position(|k| k == b"aa").unwrap();
9763        assert_eq!(
9764            delta.entries[aa_idx].data.as_deref(),
9765            Some(b"delta_aa" as &[u8])
9766        );
9767
9768        // "bb" must be present (from delta).
9769        assert!(dk.iter().any(|k| k == b"bb"));
9770
9771        // "cc" must be present (from base).
9772        assert!(dk.iter().any(|k| k == b"cc"));
9773
9774        // Three entries total, in sorted order.
9775        assert_eq!(delta.entries.len(), 3);
9776        let mut sorted = dk.clone();
9777        sorted.sort();
9778        assert_eq!(dk, sorted, "entries must be sorted after mutation");
9779    }
9780
9781    /// is_delta flag is correctly reported by bin_is_delta().
9782    #[test]
9783    fn test_bin_is_delta_flag() {
9784        let mut bin = BinStub {
9785            node_id: 1,
9786            level: BIN_LEVEL,
9787            entries: vec![],
9788            key_prefix: Vec::new(),
9789            dirty: false,
9790            is_delta: false,
9791            last_full_lsn: NULL_LSN,
9792            last_delta_lsn: NULL_LSN,
9793            generation: 0,
9794            parent: None,
9795            expiration_in_hours: true,
9796            cursor_count: 0,
9797            prohibit_next_delta: false,
9798            lsn_rep: LsnRep::Empty,
9799            keys: KeyRep::new(),
9800            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9801        };
9802        assert!(!Tree::bin_is_delta(&bin));
9803        bin.is_delta = true;
9804        assert!(Tree::bin_is_delta(&bin));
9805    }
9806
9807    // ========================================================================
9808    // Tests: mutate_to_full_bin_from_log
9809    // ========================================================================
9810
9811    /// mutate_to_full_bin_from_log is a no-op when the BIN is already full.
9812    #[test]
9813    fn test_mutate_to_full_bin_from_log_already_full() {
9814        let dir = tempfile::tempdir().unwrap();
9815        let fm = std::sync::Arc::new(
9816            noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9817                .unwrap(),
9818        );
9819        let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9820
9821        let mut bin = BinStub {
9822            node_id: 1,
9823            level: BIN_LEVEL,
9824            entries: vec![BinEntry {
9825                data: Some(b"v1".to_vec()),
9826                known_deleted: false,
9827                dirty: false,
9828                expiration_time: 0,
9829            }],
9830            key_prefix: Vec::new(),
9831            dirty: false,
9832            is_delta: false, // already a full BIN
9833            last_full_lsn: NULL_LSN,
9834            last_delta_lsn: NULL_LSN,
9835            generation: 0,
9836            parent: None,
9837            expiration_in_hours: true,
9838            cursor_count: 0,
9839            prohibit_next_delta: false,
9840            lsn_rep: LsnRep::Empty,
9841            keys: KeyRep::from_keys(vec![b"key1".to_vec()]),
9842            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9843        };
9844
9845        Tree::mutate_to_full_bin_from_log(&mut bin, &lm);
9846
9847        // No-op: is_delta was already false, entries unchanged.
9848        assert!(!bin.is_delta);
9849        assert_eq!(bin.entries.len(), 1);
9850    }
9851
9852    /// mutate_to_full_bin_from_log with NULL_LSN promotes delta without base.
9853    ///
9854    /// When last_full_lsn is NULL_LSN the BIN has never been written as a full
9855    /// entry.  The function must clear is_delta and leave the delta entries
9856    /// as-is (they are the authoritative full state).
9857    #[test]
9858    fn test_mutate_to_full_bin_from_log_null_lsn() {
9859        let dir = tempfile::tempdir().unwrap();
9860        let fm = std::sync::Arc::new(
9861            noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9862                .unwrap(),
9863        );
9864        let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9865
9866        let mut delta = BinStub {
9867            node_id: 2,
9868            level: BIN_LEVEL,
9869            entries: vec![BinEntry {
9870                data: Some(b"delta_a".to_vec()),
9871                known_deleted: false,
9872                dirty: true,
9873                expiration_time: 0,
9874            }],
9875            key_prefix: Vec::new(),
9876            dirty: true,
9877            is_delta: true,
9878            last_full_lsn: NULL_LSN, // no full BIN ever written
9879            last_delta_lsn: NULL_LSN,
9880            generation: 0,
9881            parent: None,
9882            expiration_in_hours: true,
9883            cursor_count: 0,
9884            prohibit_next_delta: false,
9885            lsn_rep: LsnRep::Empty,
9886            keys: KeyRep::from_keys(vec![b"a".to_vec()]),
9887            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9888        };
9889
9890        Tree::mutate_to_full_bin_from_log(&mut delta, &lm);
9891
9892        // is_delta must be cleared; the single delta entry is kept as-is.
9893        assert!(
9894            !delta.is_delta,
9895            "is_delta must be false after null-lsn promotion"
9896        );
9897        assert_eq!(delta.entries.len(), 1);
9898        assert_eq!(delta.entries[0].data.as_deref(), Some(b"delta_a" as &[u8]));
9899    }
9900
9901    /// mutate_to_full_bin_from_log reads full BIN from log and merges delta.
9902    ///
9903    /// Round-trip: serialize a full BIN, write it to a LogManager, record the
9904    /// LSN, then call mutate_to_full_bin_from_log on a delta referencing that
9905    /// LSN.  The result must contain base-only and delta-only entries with the
9906    /// delta winning on conflicts.
9907    #[test]
9908    fn test_mutate_to_full_bin_from_log_reads_and_merges() {
9909        let dir = tempfile::tempdir().unwrap();
9910        let fm = std::sync::Arc::new(
9911            noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9912                .unwrap(),
9913        );
9914        let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9915
9916        // Build and serialize the full BIN that will be written to the log.
9917        let full_bin = BinStub {
9918            node_id: 42,
9919            level: BIN_LEVEL,
9920            entries: vec![
9921                BinEntry {
9922                    data: Some(b"base_val".to_vec()),
9923                    known_deleted: false,
9924                    dirty: false,
9925                    expiration_time: 0,
9926                },
9927                BinEntry {
9928                    data: Some(b"base_shared".to_vec()),
9929                    known_deleted: false,
9930                    dirty: false,
9931                    expiration_time: 0,
9932                },
9933            ],
9934            key_prefix: Vec::new(),
9935            dirty: false,
9936            is_delta: false,
9937            last_full_lsn: NULL_LSN,
9938            last_delta_lsn: NULL_LSN,
9939            generation: 0,
9940            parent: None,
9941            expiration_in_hours: true,
9942            cursor_count: 0,
9943            prohibit_next_delta: false,
9944            lsn_rep: LsnRep::Empty,
9945            keys: KeyRep::from_keys(vec![
9946                b"base_only".to_vec(),
9947                b"shared_key".to_vec(),
9948            ]),
9949            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9950        };
9951
9952        let payload = full_bin.serialize_full();
9953        let full_lsn = lm
9954            .log(
9955                noxu_log::LogEntryType::BIN,
9956                &payload,
9957                noxu_log::Provisional::No,
9958                true,
9959                false,
9960            )
9961            .expect("write full BIN to log");
9962        lm.flush_no_sync().expect("flush log");
9963
9964        // Build a delta BIN referencing the full BIN via last_full_lsn.
9965        let mut delta = BinStub {
9966            node_id: 42,
9967            level: BIN_LEVEL,
9968            entries: vec![
9969                // Overwrites "shared_key" from the base.
9970                BinEntry {
9971                    data: Some(b"delta_shared".to_vec()),
9972                    known_deleted: false,
9973                    dirty: true,
9974                    expiration_time: 0,
9975                },
9976                // New key only in the delta.
9977                BinEntry {
9978                    data: Some(b"delta_val".to_vec()),
9979                    known_deleted: false,
9980                    dirty: true,
9981                    expiration_time: 0,
9982                },
9983            ],
9984            key_prefix: Vec::new(),
9985            dirty: true,
9986            is_delta: true,
9987            last_full_lsn: full_lsn,
9988            last_delta_lsn: NULL_LSN,
9989            generation: 0,
9990            parent: None,
9991            expiration_in_hours: true,
9992            cursor_count: 0,
9993            prohibit_next_delta: false,
9994            lsn_rep: LsnRep::Empty,
9995            keys: KeyRep::from_keys(vec![
9996                b"shared_key".to_vec(),
9997                b"delta_only".to_vec(),
9998            ]),
9999            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10000        };
10001
10002        Tree::mutate_to_full_bin_from_log(&mut delta, &lm);
10003
10004        assert!(
10005            !delta.is_delta,
10006            "is_delta must be false after log-based mutation"
10007        );
10008        assert!(delta.dirty, "must be dirty after mutation");
10009
10010        // All three distinct keys must be present.
10011        let find = |k: &[u8]| -> Option<Vec<u8>> {
10012            (0..delta.entries.len())
10013                .find(|&i| delta.get_full_key(i).as_deref() == Some(k))
10014                .and_then(|i| delta.entries[i].data.clone())
10015        };
10016
10017        assert_eq!(
10018            find(b"base_only"),
10019            Some(b"base_val".to_vec()),
10020            "base-only key must be present"
10021        );
10022        assert_eq!(
10023            find(b"shared_key"),
10024            Some(b"delta_shared".to_vec()),
10025            "delta must win on shared_key"
10026        );
10027        assert_eq!(
10028            find(b"delta_only"),
10029            Some(b"delta_val".to_vec()),
10030            "delta-only key must be present"
10031        );
10032        assert_eq!(delta.entries.len(), 3, "must have exactly 3 entries");
10033
10034        // Entries must be in sorted order (by full key).
10035        let full_keys: Vec<Vec<u8>> = (0..delta.entries.len())
10036            .map(|i| delta.get_full_key(i).unwrap())
10037            .collect();
10038        let mut sorted_keys = full_keys.clone();
10039        sorted_keys.sort();
10040        assert_eq!(full_keys, sorted_keys, "entries must be in sorted order");
10041    }
10042
10043    // ========================================================================
10044    // Tests: deserialize_full key prefix recomputation
10045    // ========================================================================
10046
10047    /// deserialize_full recomputes key prefix from loaded full keys.
10048    ///
10049    /// IN.recalcKeyPrefix() called after materializing from log:
10050    /// a BIN loaded from the log should have prefix compression applied so
10051    /// that search performance matches an in-memory BIN.
10052    #[test]
10053    fn test_deserialize_full_recomputes_key_prefix() {
10054        // Build a BIN with a known common prefix and serialize it.
10055        let mut source = BinStub {
10056            node_id: 99,
10057            level: BIN_LEVEL,
10058            entries: vec![
10059                BinEntry {
10060                    data: None,
10061                    known_deleted: false,
10062                    dirty: false,
10063                    expiration_time: 0,
10064                },
10065                BinEntry {
10066                    data: None,
10067                    known_deleted: false,
10068                    dirty: false,
10069                    expiration_time: 0,
10070                },
10071                BinEntry {
10072                    data: None,
10073                    known_deleted: false,
10074                    dirty: false,
10075                    expiration_time: 0,
10076                },
10077            ],
10078            key_prefix: Vec::new(),
10079            dirty: false,
10080            is_delta: false,
10081            last_full_lsn: NULL_LSN,
10082            last_delta_lsn: NULL_LSN,
10083            generation: 0,
10084            parent: None,
10085            expiration_in_hours: true,
10086            cursor_count: 0,
10087            prohibit_next_delta: false,
10088            lsn_rep: LsnRep::Empty,
10089            keys: KeyRep::from_keys(vec![
10090                b"pfx:alpha".to_vec(),
10091                b"pfx:beta".to_vec(),
10092                b"pfx:gamma".to_vec(),
10093            ]),
10094            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10095        };
10096        source.recompute_key_prefix();
10097        // Verify the source has the expected prefix before serializing.
10098        assert_eq!(source.key_prefix, b"pfx:");
10099
10100        let payload = source.serialize_full();
10101
10102        // Deserialize and verify prefix is re-established.
10103        let loaded = BinStub::deserialize_full(&payload)
10104            .expect("deserialization must succeed");
10105
10106        assert_eq!(
10107            loaded.key_prefix, b"pfx:",
10108            "key prefix must be recomputed after deserialize_full"
10109        );
10110
10111        // All full keys must be reconstructable.
10112        for i in 0..loaded.entries.len() {
10113            let fk = loaded.get_full_key(i).unwrap();
10114            assert!(
10115                fk.starts_with(b"pfx:"),
10116                "full key {i} must start with prefix"
10117            );
10118        }
10119    }
10120
10121    /// deserialize_full with a single entry leaves key_prefix empty.
10122    ///
10123    /// A BIN with fewer than 2 entries cannot have a meaningful common prefix.
10124    #[test]
10125    fn test_deserialize_full_single_entry_no_prefix() {
10126        let source = BinStub {
10127            node_id: 7,
10128            level: BIN_LEVEL,
10129            entries: vec![BinEntry {
10130                data: None,
10131                known_deleted: false,
10132                dirty: false,
10133                expiration_time: 0,
10134            }],
10135            key_prefix: Vec::new(),
10136            dirty: false,
10137            is_delta: false,
10138            last_full_lsn: NULL_LSN,
10139            last_delta_lsn: NULL_LSN,
10140            generation: 0,
10141            parent: None,
10142            expiration_in_hours: true,
10143            cursor_count: 0,
10144            prohibit_next_delta: false,
10145            lsn_rep: LsnRep::Empty,
10146            keys: KeyRep::from_keys(vec![b"solo".to_vec()]),
10147            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10148        };
10149
10150        let payload = source.serialize_full();
10151        let loaded = BinStub::deserialize_full(&payload)
10152            .expect("deserialization must succeed");
10153
10154        assert!(
10155            loaded.key_prefix.is_empty(),
10156            "single-entry BIN must have empty prefix"
10157        );
10158        assert_eq!(loaded.get_full_key(0).unwrap(), b"solo");
10159    }
10160
10161    // ========================================================================
10162    // Tests: get_next_bin / get_prev_bin
10163    // ========================================================================
10164
10165    /// get_next_bin returns the entries of the next BIN to the right.
10166    ///
10167    /// Tree.getNextBin() / getNextIN(forward=true).
10168    #[test]
10169    fn test_get_next_bin_basic() {
10170        let tree = Tree::new(1, 4);
10171
10172        // Insert 8 sorted keys — creates multiple BINs.
10173        for i in 0u32..8 {
10174            let key = format!("n{:04}", i).into_bytes();
10175            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10176        }
10177
10178        let stats = tree.collect_stats();
10179        if stats.n_bins < 2 {
10180            // If the tree only has one BIN, skip the sibling test.
10181            return;
10182        }
10183
10184        // A key from the first BIN (e.g. "n0000") should have a next BIN.
10185        let next = tree.get_next_bin(b"n0000");
10186        assert!(
10187            next.is_some(),
10188            "must return a next BIN for a key in the leftmost BIN"
10189        );
10190
10191        let entries = next.unwrap();
10192        assert!(!entries.is_empty(), "next BIN must not be empty");
10193        // All returned keys must be strictly greater than "n0000" because they
10194        // are in a different (rightward) BIN.
10195        for (_, _, k) in &entries {
10196            assert!(
10197                k.as_slice() > b"n0000" as &[u8],
10198                "next BIN entries must all be > the search key"
10199            );
10200        }
10201    }
10202
10203    /// get_next_bin returns None for a key in the rightmost BIN.
10204    #[test]
10205    fn test_get_next_bin_at_rightmost_returns_none() {
10206        let tree = Tree::new(1, 4);
10207        for i in 0u32..8 {
10208            let key = format!("r{:04}", i).into_bytes();
10209            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10210        }
10211        // A key from the rightmost BIN (e.g. "r0007") has no next BIN.
10212        let next = tree.get_next_bin(b"r0007");
10213        assert!(
10214            next.is_none(),
10215            "must return None for a key in the rightmost BIN"
10216        );
10217    }
10218
10219    /// get_prev_bin returns the entries of the next BIN to the left.
10220    ///
10221    /// Tree.getPrevBin() / getNextIN(forward=false).
10222    #[test]
10223    fn test_get_prev_bin_basic() {
10224        let tree = Tree::new(1, 4);
10225        for i in 0u32..8 {
10226            let key = format!("p{:04}", i).into_bytes();
10227            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10228        }
10229
10230        // A key from the second BIN ("p0004") should have a previous BIN.
10231        let prev = tree.get_prev_bin(b"p0004");
10232        assert!(
10233            prev.is_some(),
10234            "must return a prev BIN for a key in the second BIN"
10235        );
10236
10237        let entries = prev.unwrap();
10238        assert!(!entries.is_empty(), "prev BIN must not be empty");
10239        // All returned keys must be < b"p0004".
10240        for (_, _, k) in &entries {
10241            assert!(
10242                k.as_slice() < b"p0004" as &[u8],
10243                "prev BIN entries must all be < the current BIN"
10244            );
10245        }
10246    }
10247
10248    /// get_prev_bin returns None for a key in the leftmost BIN.
10249    #[test]
10250    fn test_get_prev_bin_at_leftmost_returns_none() {
10251        let tree = Tree::new(1, 4);
10252        for i in 0u32..8 {
10253            let key = format!("q{:04}", i).into_bytes();
10254            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10255        }
10256        // A key from the leftmost BIN ("q0000") has no prev BIN.
10257        let prev = tree.get_prev_bin(b"q0000");
10258        assert!(
10259            prev.is_none(),
10260            "must return None for a key in the leftmost BIN"
10261        );
10262    }
10263
10264    /// get_next_bin and get_prev_bin are inverse operations across the
10265    /// BIN boundary.
10266    #[test]
10267    fn test_next_prev_bin_are_symmetric() {
10268        let tree = Tree::new(1, 4);
10269        for i in 0u32..8 {
10270            let key = format!("s{:04}", i).into_bytes();
10271            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10272        }
10273
10274        // From first BIN (s0000): next → second BIN entries.
10275        let next_from_first = tree.get_next_bin(b"s0000").unwrap();
10276        // The smallest key of the next BIN.
10277        let next_first_key =
10278            next_from_first.iter().map(|(_, _, k)| k.clone()).min().unwrap();
10279
10280        // From that key in the second BIN: prev → should overlap with first BIN.
10281        let prev_from_second = tree.get_prev_bin(&next_first_key).unwrap();
10282        let prev_first_key =
10283            prev_from_second.iter().map(|(_, _, k)| k.clone()).max().unwrap();
10284
10285        // The max key of the "prev" result must be in the first BIN (< next boundary).
10286        assert!(
10287            prev_first_key < next_first_key,
10288            "prev BIN entries must be smaller than the boundary key"
10289        );
10290    }
10291
10292    /// get_next_bin on an empty tree returns None.
10293    #[test]
10294    fn test_get_next_bin_empty_tree() {
10295        let tree = Tree::new(1, 8);
10296        assert!(tree.get_next_bin(b"any").is_none());
10297    }
10298
10299    /// get_prev_bin on an empty tree returns None.
10300    #[test]
10301    fn test_get_prev_bin_empty_tree() {
10302        let tree = Tree::new(1, 8);
10303        assert!(tree.get_prev_bin(b"any").is_none());
10304    }
10305
10306    // =========================================================================
10307    // R3 fix: get_next_bin / get_prev_bin honour the custom comparator
10308    // =========================================================================
10309
10310    /// R3 regression test: with a custom comparator that reverses byte order
10311    /// (descending), `get_next_bin` and `get_prev_bin` must use comparator
10312    /// order when routing through internal nodes.
10313    ///
10314    /// Pre-fix: the static `get_adjacent_bin_attempt` used raw `<=` byte order
10315    /// for IN routing, causing it to descend to the wrong child when comparator
10316    /// order ≠ byte order.
10317    ///
10318    /// The tree is forced to split (max_entries = 4) so there IS an internal
10319    /// node (IN) to route through. Under a reverse comparator the insertion
10320    /// order and stored key order are reversed relative to byte order, so any
10321    /// descent that uses raw byte comparison will pick the wrong slot.
10322    ///
10323    /// Pass-post invariant: iterating forward via repeated `get_next_bin` from
10324    /// the leftmost BIN yields keys in COMPARATOR order (descending byte order
10325    /// here), not in raw ascending byte order.
10326    #[test]
10327    fn test_get_next_prev_bin_custom_comparator_order() {
10328        // Reverse-order comparator: larger bytes sort first.
10329        let reverse_cmp: KeyComparatorFn =
10330            Arc::new(|a: &[u8], b: &[u8]| b.cmp(a));
10331        // Small max_entries so the tree splits and has internal nodes.
10332        let mut tree = Tree::new(1, 4);
10333        tree.set_comparator(reverse_cmp);
10334
10335        // Insert keys that are ascending in byte order ("a" < "b" < … < "i")
10336        // but descending in comparator order (i > h > … > a).
10337        let keys: &[&[u8]] =
10338            &[b"a", b"b", b"c", b"d", b"e", b"f", b"g", b"h", b"i"];
10339        for (i, k) in keys.iter().enumerate() {
10340            tree.insert(
10341                k.to_vec(),
10342                vec![i as u8],
10343                Lsn::from_u64((i + 1) as u64),
10344            )
10345            .unwrap();
10346        }
10347
10348        // Collect all BINs by walking from the comparator-smallest key ("i"
10349        // in reverse order) using get_next_bin. The anchor must be a key that
10350        // is smaller than everything in comparator order, i.e. the largest
10351        // byte-value key. We use the tree's search to find the actual leftmost
10352        // key under the comparator by starting from "i" (comparator-min).
10353        //
10354        // Strategy: start at byte key b"\xff" (larger than any inserted key in
10355        // byte order, so it lands in the last BIN in byte order, which under
10356        // a reverse comparator is the leftmost BIN in comparator order). Then
10357        // walk via get_next_bin.
10358        let start_anchor = b"\xff".as_ref();
10359        let mut bin_first_keys: Vec<Vec<u8>> = Vec::new();
10360
10361        // The first BIN in comparator order contains "i" (largest byte key).
10362        // get_next_bin from a virtual start in that BIN gives the next one.
10363        // Collect by walking from the comparator-last key leftward instead:
10364        // use get_next_bin with anchor = b"\xff" to hop to the next BIN
10365        // (comparator order: next = smaller byte value).
10366        let mut anchor = start_anchor.to_vec();
10367        loop {
10368            match tree.get_next_bin(&anchor) {
10369                None => break,
10370                Some(entries) => {
10371                    if let Some((_, _, fk0)) = entries.first() {
10372                        let fk = fk0.clone();
10373                        bin_first_keys.push(fk.clone());
10374                        anchor = fk;
10375                    } else {
10376                        break;
10377                    }
10378                }
10379            }
10380        }
10381
10382        // We must have visited at least 2 BINs (tree was forced to split).
10383        assert!(
10384            bin_first_keys.len() >= 2,
10385            "R3: expected multiple BINs after split, got {}",
10386            bin_first_keys.len()
10387        );
10388
10389        // With a reverse comparator, bin_first_keys must be in descending byte
10390        // order (each successive BIN starts at a smaller byte key).
10391        for window in bin_first_keys.windows(2) {
10392            assert!(
10393                window[0] > window[1],
10394                "R3: BIN boundary keys must be descending (comparator order); \
10395                 got {:?} then {:?}",
10396                window[0],
10397                window[1]
10398            );
10399        }
10400    }
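    // ========================================================================
    // Tests: key prefix compression
    // ========================================================================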
10402
10403    /// Inserting keys with a common prefix causes the BIN to establish that
10404    /// prefix.  Stored suffixes are shorter than the full keys.
10405    #[test]
10406    fn test_binstub_prefix_established_on_insert() {
10407        let mut bin = BinStub {
10408            node_id: 1,
10409            level: BIN_LEVEL,
10410            entries: Vec::new(),
10411            key_prefix: Vec::new(),
10412            dirty: false,
10413            is_delta: false,
10414            last_full_lsn: NULL_LSN,
10415            last_delta_lsn: NULL_LSN,
10416            generation: 0,
10417            parent: None,
10418            expiration_in_hours: true,
10419            cursor_count: 0,
10420            prohibit_next_delta: false,
10421            lsn_rep: LsnRep::Empty,
10422            keys: KeyRep::new(),
10423            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10424        };
10425
10426        bin.insert_with_prefix(b"record:aaa".to_vec(), Lsn::new(1, 1), None);
10427        assert!(bin.key_prefix.is_empty(), "single entry: no prefix yet");
10428
10429        bin.insert_with_prefix(b"record:bbb".to_vec(), Lsn::new(1, 2), None);
10430        assert_eq!(
10431            &bin.key_prefix, b"record:",
10432            "common prefix 'record:' must be extracted"
10433        );
10434    }
10435
10436    /// `get_full_key` on a BinStub returns the full key regardless of whether
10437    /// the stored key is a raw full key or a suffix.
10438    #[test]
10439    fn test_binstub_get_full_key_roundtrip() {
10440        let mut bin = BinStub {
10441            node_id: 1,
10442            level: BIN_LEVEL,
10443            entries: Vec::new(),
10444            key_prefix: Vec::new(),
10445            dirty: false,
10446            is_delta: false,
10447            last_full_lsn: NULL_LSN,
10448            last_delta_lsn: NULL_LSN,
10449            generation: 0,
10450            parent: None,
10451            expiration_in_hours: true,
10452            cursor_count: 0,
10453            prohibit_next_delta: false,
10454            lsn_rep: LsnRep::Empty,
10455            keys: KeyRep::new(),
10456            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10457        };
10458
10459        let keys = [
10460            b"pfx:first".as_ref(),
10461            b"pfx:second".as_ref(),
10462            b"pfx:third".as_ref(),
10463        ];
10464        for k in keys {
10465            bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10466        }
10467
10468        assert!(!bin.key_prefix.is_empty(), "prefix must be set");
10469
10470        for (i, expected) in keys.iter().enumerate() {
10471            let full = bin.get_full_key(i).expect("must return full key");
10472            assert_eq!(
10473                full.as_slice(),
10474                *expected,
10475                "get_full_key({}) must return full key",
10476                i
10477            );
10478        }
10479    }
10480
10481    /// `find_entry_compressed` on a BinStub with active prefix returns the
10482    /// correct slot index.
10483    #[test]
10484    fn test_binstub_find_entry_compressed() {
10485        let mut bin = BinStub {
10486            node_id: 1,
10487            level: BIN_LEVEL,
10488            entries: Vec::new(),
10489            key_prefix: Vec::new(),
10490            dirty: false,
10491            is_delta: false,
10492            last_full_lsn: NULL_LSN,
10493            last_delta_lsn: NULL_LSN,
10494            generation: 0,
10495            parent: None,
10496            expiration_in_hours: true,
10497            cursor_count: 0,
10498            prohibit_next_delta: false,
10499            lsn_rep: LsnRep::Empty,
10500            keys: KeyRep::new(),
10501            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10502        };
10503
10504        for k in
10505            [b"db:alpha".as_ref(), b"db:beta".as_ref(), b"db:gamma".as_ref()]
10506        {
10507            bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10508        }
10509
10510        let (idx, found) = bin.find_entry_compressed(b"db:beta");
10511        assert!(found, "db:beta must be found");
10512        assert_eq!(idx, 1, "db:beta must be at index 1");
10513
10514        let (_, not_found) = bin.find_entry_compressed(b"db:zzz");
10515        assert!(!not_found, "db:zzz must not be found");
10516    }
10517
10518    /// Tree insert/search works correctly when BINs accumulate a key prefix.
10519    #[test]
10520    fn test_tree_insert_search_with_prefix_compression() {
10521        let tree = Tree::new(1, 8);
10522        let n = 200u32;
10523
10524        // All keys share a long common prefix — good for prefix compression.
10525        for i in 0..n {
10526            let key = format!("namespace:entity:{:06}", i).into_bytes();
10527            let data = vec![i as u8];
10528            tree.insert(key, data, Lsn::new(1, i)).unwrap();
10529        }
10530
10531        // All keys must be findable.
10532        for i in 0..n {
10533            let key = format!("namespace:entity:{:06}", i).into_bytes();
10534            let sr = tree.search(&key);
10535            assert!(
10536                sr.is_some() && sr.unwrap().exact_parent_found,
10537                "key namespace:entity:{:06} must be found",
10538                i
10539            );
10540        }
10541    }
10542
10543    /// Prefix survives a BIN split: keys in both halves must still be findable.
10544    #[test]
10545    fn test_prefix_preserved_across_bin_split() {
10546        // Small fanout to force splits quickly.
10547        let tree = Tree::new(1, 4);
10548
10549        for i in 0u32..20 {
10550            let key = format!("pfx:key:{:04}", i).into_bytes();
10551            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10552        }
10553
10554        // All keys must be findable after splits.
10555        for i in 0u32..20 {
10556            let key = format!("pfx:key:{:04}", i).into_bytes();
10557            let sr = tree.search(&key);
10558            assert!(
10559                sr.is_some() && sr.unwrap().exact_parent_found,
10560                "pfx:key:{:04} must be found after splits",
10561                i
10562            );
10563        }
10564    }
10565
10566    /// `decompress_key` round-trips: compress then decompress gives the original.
10567    #[test]
10568    fn test_binstub_compress_decompress_roundtrip() {
10569        let mut bin = BinStub {
10570            node_id: 1,
10571            level: BIN_LEVEL,
10572            entries: Vec::new(),
10573            key_prefix: Vec::new(),
10574            dirty: false,
10575            is_delta: false,
10576            last_full_lsn: NULL_LSN,
10577            last_delta_lsn: NULL_LSN,
10578            generation: 0,
10579            parent: None,
10580            expiration_in_hours: true,
10581            cursor_count: 0,
10582            prohibit_next_delta: false,
10583            lsn_rep: LsnRep::Empty,
10584            keys: KeyRep::new(),
10585            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10586        };
10587
10588        for k in [b"myapp:user:1".as_ref(), b"myapp:user:2".as_ref()] {
10589            bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10590        }
10591
10592        assert!(!bin.key_prefix.is_empty());
10593
10594        // Manually compress a full key and then decompress it.
10595        let full_key = b"myapp:user:3";
10596        let suffix = bin.compress_key(full_key);
10597        let recovered = bin.decompress_key(&suffix);
10598        assert_eq!(
10599            recovered.as_slice(),
10600            full_key,
10601            "compress→decompress must be identity"
10602        );
10603    }
10604
10605    /// get_next_bin correctly navigates a 3-level tree.
10606    #[test]
10607    fn test_get_next_bin_three_level_tree() {
10608        // With fanout 4, inserting 20 keys forces a root split → 3 levels.
10609        let tree = Tree::new(1, 4);
10610        for i in 0u32..20 {
10611            let key = format!("t{:04}", i).into_bytes();
10612            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10613        }
10614        assert!(tree.get_root_splits() > 0, "tree must have grown to 3 levels");
10615
10616        // Starting from t0000, iterating via get_next_bin must visit every BIN.
10617        let mut visited: Vec<Vec<u8>> = Vec::new();
10618        // Collect the first BIN's keys by searching for t0000.
10619        if let Some(first_entries) = {
10620            // Get the leftmost BIN by using get_first_node result.
10621            // get_first_node returns SearchResult at index 0 in the leftmost BIN.
10622            // We approximate by reading the root's leftmost BIN directly.
10623            tree.get_next_bin(b"t0000")
10624        } {
10625            for (_, _, k) in first_entries {
10626                visited.push(k);
10627            }
10628        }
10629
10630        // visited should contain at least one key from the second BIN.
10631        assert!(
10632            !visited.is_empty(),
10633            "should have visited at least one key via get_next_bin in 3-level tree"
10634        );
10635    }
10636
    // ========================================================================
    // Tests: JE-style tree invariants (creation, delete, scan, balance, split)
    // ========================================================================
10639
10640    /// insert a small set of keys
10641    /// with varying lengths and verify each is findable immediately after insert.
10642    #[test]
10643    fn test_je_simple_tree_creation() {
10644        let tree = Tree::new(1, 128);
10645
10646        let keys: &[&[u8]] = &[b"aaaaa", b"aaaab", b"aaaa", b"aaa"];
10647        for (i, &k) in keys.iter().enumerate() {
10648            tree.insert(k.to_vec(), vec![i as u8], Lsn::new(1, i as u32))
10649                .unwrap();
10650
10651            // Every key inserted so far must be findable.
10652            for &prev in &keys[..=i] {
10653                let sr = tree.search(prev);
10654                assert!(
10655                    sr.is_some() && sr.unwrap().exact_parent_found,
10656                    "key {:?} must be findable after {} inserts",
10657                    std::str::from_utf8(prev).unwrap_or("?"),
10658                    i + 1
10659                );
10660            }
10661        }
10662    }
10663
10664    /// insert N keys, verify
10665    /// all are found; delete the even-indexed keys, verify even are gone and
10666    /// odd remain.
10667    #[test]
10668    fn test_je_insert_then_delete_then_search() {
10669        let tree = Tree::new(1, 8);
10670        let n = 20usize;
10671
10672        let keys: Vec<Vec<u8>> =
10673            (0..n).map(|i| format!("key{:04}", i).into_bytes()).collect();
10674
10675        // Insert all.
10676        for (i, k) in keys.iter().enumerate() {
10677            tree.insert(k.clone(), vec![i as u8], Lsn::new(1, i as u32))
10678                .unwrap();
10679        }
10680
10681        // All must be findable.
10682        for k in &keys {
10683            let sr = tree.search(k);
10684            assert!(
10685                sr.is_some() && sr.unwrap().exact_parent_found,
10686                "key {:?} must be found after insert",
10687                std::str::from_utf8(k).unwrap_or("?")
10688            );
10689        }
10690
10691        // Delete even-indexed keys.
10692        for i in (0..n).step_by(2) {
10693            tree.delete(&keys[i]);
10694        }
10695
10696        // Even keys must no longer be found; odd keys must still be found.
10697        for (i, key) in keys.iter().enumerate() {
10698            let sr = tree.search(key);
10699            let found = sr.is_some() && sr.unwrap().exact_parent_found;
10700            if i % 2 == 0 {
10701                assert!(!found, "deleted key {:?} must not be found", i);
10702            } else {
10703                assert!(found, "kept key {:?} must still be found", i);
10704            }
10705        }
10706    }
10707
10708    /// insert N keys in reverse
10709    /// order, then verify every key is directly findable and the keys are in
10710    /// sorted ascending order (B-tree ordering invariant).
10711    #[test]
10712    fn test_je_range_scan_sorted_ascending() {
10713        let n = 40usize;
10714        let tree = Tree::new(1, 4);
10715
10716        // Insert in reverse order to stress the B-tree.
10717        for i in (0..n).rev() {
10718            let key = format!("scan{:04}", i).into_bytes();
10719            tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10720        }
10721
10722        // Collect all expected keys in sorted order.
10723        let mut expected: Vec<Vec<u8>> =
10724            (0..n).map(|i| format!("scan{:04}", i).into_bytes()).collect();
10725        expected.sort();
10726
10727        // Every key must be individually findable.
10728        for key in &expected {
10729            let sr = tree.search(key);
10730            assert!(
10731                sr.is_some() && sr.unwrap().exact_parent_found,
10732                "key {:?} must be findable",
10733                std::str::from_utf8(key).unwrap_or("?")
10734            );
10735        }
10736
10737        // Verify sorted ordering invariant: expected keys are already sorted
10738        // (lexicographic order = insertion order for "scan{:04}" keys).
10739        for w in expected.windows(2) {
10740            assert!(
10741                w[0] < w[1],
10742                "keys must be in strict ascending order: {:?} < {:?}",
10743                std::str::from_utf8(&w[0]).unwrap_or("?"),
10744                std::str::from_utf8(&w[1]).unwrap_or("?")
10745            );
10746        }
10747
10748        // Use get_next_bin to scan at least a portion of the tree and verify
10749        // ordering of returned BIN entries.
10750        let first_key = format!("scan{:04}", 0).into_bytes();
10751        if let Some(entries) = tree.get_next_bin(&first_key) {
10752            let entry_keys: Vec<&[u8]> =
10753                entries.iter().map(|(_, _, k)| k.as_slice()).collect();
10754            for w in entry_keys.windows(2) {
10755                assert!(
10756                    w[0] <= w[1],
10757                    "BIN entries from get_next_bin must be in ascending order"
10758                );
10759            }
10760        }
10761    }
10762
10763    /// insert N keys in
10764    /// ascending order and verify the tree height stays bounded (≤ 10 levels)
10765    /// and all keys are findable.
10766    #[test]
10767    fn test_je_ascending_insert_balance() {
10768        let n = 128usize;
10769        let tree = Tree::new(1, 8);
10770
10771        for i in 0..n {
10772            let key = format!("asc{:06}", i).into_bytes();
10773            tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10774                .unwrap();
10775        }
10776
10777        let stats = tree.collect_stats();
10778        assert!(
10779            stats.height <= 10,
10780            "tree height after {} ascending inserts with fanout 8 must be <= 10, got {}",
10781            n,
10782            stats.height
10783        );
10784
10785        for i in 0..n {
10786            let key = format!("asc{:06}", i).into_bytes();
10787            let sr = tree.search(&key);
10788            assert!(
10789                sr.is_some() && sr.unwrap().exact_parent_found,
10790                "key asc{:06} must be findable after ascending inserts",
10791                i
10792            );
10793        }
10794    }
10795
10796    /// insert N keys in
10797    /// descending order and verify the tree height stays bounded (≤ 10 levels)
10798    /// and all keys are findable.
10799    #[test]
10800    fn test_je_descending_insert_balance() {
10801        let n = 128usize;
10802        let tree = Tree::new(1, 8);
10803
10804        for i in (0..n).rev() {
10805            let key = format!("dsc{:06}", i).into_bytes();
10806            tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10807                .unwrap();
10808        }
10809
10810        let stats = tree.collect_stats();
10811        assert!(
10812            stats.height <= 10,
10813            "tree height after {} descending inserts with fanout 8 must be <= 10, got {}",
10814            n,
10815            stats.height
10816        );
10817
10818        for i in 0..n {
10819            let key = format!("dsc{:06}", i).into_bytes();
10820            let sr = tree.search(&key);
10821            assert!(
10822                sr.is_some() && sr.unwrap().exact_parent_found,
10823                "key dsc{:06} must be findable after descending inserts",
10824                i
10825            );
10826        }
10827    }
10828
10829    /// SplitTest invariant: after many splits induced by a small
10830    /// fanout no key is lost.
10831    #[test]
10832    fn test_je_split_no_key_lost() {
10833        let tree = Tree::new(1, 4);
10834        let n = 20usize;
10835
10836        for i in 0..n {
10837            let key = format!("sp{:04}", i).into_bytes();
10838            tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10839        }
10840
10841        for i in 0..n {
10842            let key = format!("sp{:04}", i).into_bytes();
10843            let sr = tree.search(&key);
10844            assert!(
10845                sr.is_some() && sr.unwrap().exact_parent_found,
10846                "key sp{:04} must survive all splits",
10847                i
10848            );
10849        }
10850    }
10851
10852    /// SplitTest invariant: after a BIN split both halves exist and
10853    /// all original keys are findable.
10854    #[test]
10855    fn test_je_split_produces_two_halves() {
10856        // fanout=4: fill one BIN then overflow it to force a split.
10857        let tree = Tree::new(1, 4);
10858        let n = 5usize; // one more than fanout → forces at least one split
10859
10860        for i in 0..n {
10861            let key = format!("half{:04}", i).into_bytes();
10862            tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10863        }
10864
10865        let stats = tree.collect_stats();
10866        assert!(
10867            stats.n_bins >= 2,
10868            "after splitting a full BIN there must be >= 2 BINs, got {}",
10869            stats.n_bins
10870        );
10871
10872        for i in 0..n {
10873            let key = format!("half{:04}", i).into_bytes();
10874            let sr = tree.search(&key);
10875            assert!(
10876                sr.is_some() && sr.unwrap().exact_parent_found,
10877                "key half{:04} must be findable in one of the two halves",
10878                i
10879            );
10880        }
10881    }
10882
10883    /// SplitTest invariant: root splits are tracked and the tree
10884    /// grows in height as keys accumulate.
10885    #[test]
10886    fn test_je_root_split_creates_new_root() {
10887        // fanout=4, 20 keys: forces multiple root splits.
10888        let tree = Tree::new(1, 4);
10889
10890        for i in 0u32..20 {
10891            let key = format!("rs{:04}", i).into_bytes();
10892            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10893        }
10894
10895        assert!(
10896            tree.get_root_splits() > 0,
10897            "expected at least one root split after 20 inserts with fanout 4"
10898        );
10899
10900        let stats = tree.collect_stats();
10901        assert!(
10902            stats.height >= 3,
10903            "tree must be at least 3 levels tall after root splits, got {}",
10904            stats.height
10905        );
10906
10907        // Every inserted key must still be findable.
10908        for i in 0u32..20 {
10909            let key = format!("rs{:04}", i).into_bytes();
10910            let sr = tree.search(&key);
10911            assert!(
10912                sr.is_some() && sr.unwrap().exact_parent_found,
10913                "key rs{:04} must be findable after root splits",
10914                i
10915            );
10916        }
10917    }
10918
10919    // ========================================================================
10920    // Tests: compress_bin / maybe_compress_bin_and_parent
10921    // INCompressor.compressBin / lazyCompress tests
10922    // ========================================================================
10923
10924    /// compress_bin removes known-deleted slots from a BIN.
10925    ///
10926    /// INCompressor.compressBin(): after compression, slots with
10927    /// `known_deleted = true` must be gone and the BIN must be dirty.
10928    #[test]
10929    fn test_compress_bin_removes_deleted_slots() {
10930        let _lsn = Lsn::new(1, 1);
10931        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
10932            node_id: generate_node_id(),
10933            level: BIN_LEVEL,
10934            entries: vec![
10935                BinEntry {
10936                    data: Some(b"live".to_vec()),
10937                    known_deleted: false,
10938                    dirty: false,
10939                    expiration_time: 0,
10940                },
10941                BinEntry {
10942                    data: None,
10943                    known_deleted: true,
10944                    dirty: false,
10945                    expiration_time: 0,
10946                },
10947                BinEntry {
10948                    data: Some(b"live2".to_vec()),
10949                    known_deleted: false,
10950                    dirty: false,
10951                    expiration_time: 0,
10952                },
10953                BinEntry {
10954                    data: None,
10955                    known_deleted: true,
10956                    dirty: false,
10957                    expiration_time: 0,
10958                },
10959            ],
10960            key_prefix: Vec::new(),
10961            dirty: false,
10962            is_delta: false,
10963            last_full_lsn: NULL_LSN,
10964            last_delta_lsn: NULL_LSN,
10965            generation: 0,
10966            parent: None,
10967            expiration_in_hours: true,
10968            cursor_count: 0,
10969            prohibit_next_delta: false,
10970            lsn_rep: LsnRep::Empty,
10971            keys: KeyRep::from_keys(vec![
10972                b"a".to_vec(),
10973                b"b".to_vec(),
10974                b"c".to_vec(),
10975                b"d".to_vec(),
10976            ]),
10977            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10978        })));
10979
10980        // Wire a minimal parent IN so compress_bin can prune if needed.
10981        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
10982            node_id: generate_node_id(),
10983            level: MAIN_LEVEL | 2,
10984            entries: vec![InEntry { key: vec![] }],
10985            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
10986            dirty: false,
10987            generation: 0,
10988            parent: None,
10989            lsn_rep: LsnRep::Empty,
10990        })));
10991        {
10992            let mut g = bin_arc.write();
10993            g.set_parent(Some(Arc::downgrade(&root_arc)));
10994        }
10995
10996        let tree = Tree::new(1, 128);
10997        *tree.root.write() = Some(root_arc);
10998
10999        let result = tree.compress_bin(&bin_arc);
11000        assert!(
11001            result,
11002            "compress_bin must return true when slots were removed"
11003        );
11004
11005        let g = bin_arc.read();
11006        match &*g {
11007            TreeNode::Bottom(b) => {
11008                assert_eq!(
11009                    b.entries.len(),
11010                    2,
11011                    "2 live entries must remain after compress"
11012                );
11013                assert!(
11014                    b.entries.iter().all(|e| !e.known_deleted),
11015                    "no deleted slots must remain"
11016                );
11017                assert!(b.dirty, "BIN must be dirty after compression");
11018            }
11019            _ => panic!("expected BIN"),
11020        }
11021    }
11022
11023    /// IC-3 HEADLINE (fail-pre / pass-post): the compressor must SKIP a
11024    /// `known_deleted` slot that is still write-locked by an in-flight txn,
11025    /// while removing committed/unlocked `known_deleted` slots in the SAME
11026    /// BIN.  Mirrors JE `BIN.compress` (BIN.java:1141-1172), which calls
11027    /// `lockManager.isLockUncontended(lsn)` and does `continue` on a contended
11028    /// slot.
11029    ///
11030    /// Pre-fix: `compress_bin` had no lock check, so a write-locked tombstone
11031    /// would have been physically removed (the slot a live txn references is
11032    /// gone -> corruption).  Post-fix: the `is_locked` predicate keeps it.
11033    #[test]
11034    fn test_ic3_compress_skips_write_locked_slot() {
11035        // Slot 1 (key "b", lsn 1:200) is a write-locked tombstone; slot 3
11036        // (key "d", lsn 1:400) is a committed/unlocked tombstone.  Slots 0
11037        // and 2 are live.
11038        let locked_lsn = Lsn::new(1, 200);
11039        let unlocked_lsn = Lsn::new(1, 400);
11040        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11041            node_id: generate_node_id(),
11042            level: BIN_LEVEL,
11043            entries: vec![
11044                BinEntry {
11045                    data: Some(b"live".to_vec()),
11046                    known_deleted: false,
11047                    dirty: false,
11048                    expiration_time: 0,
11049                },
11050                BinEntry {
11051                    data: None,
11052                    known_deleted: true, // write-locked tombstone -> KEEP
11053                    dirty: false,
11054                    expiration_time: 0,
11055                },
11056                BinEntry {
11057                    data: Some(b"live2".to_vec()),
11058                    known_deleted: false,
11059                    dirty: false,
11060                    expiration_time: 0,
11061                },
11062                BinEntry {
11063                    data: None,
11064                    known_deleted: true, // committed tombstone -> REMOVE
11065                    dirty: false,
11066                    expiration_time: 0,
11067                },
11068            ],
11069            key_prefix: Vec::new(),
11070            dirty: false,
11071            is_delta: false,
11072            last_full_lsn: NULL_LSN,
11073            last_delta_lsn: NULL_LSN,
11074            generation: 0,
11075            parent: None,
11076            expiration_in_hours: true,
11077            cursor_count: 0,
11078            prohibit_next_delta: false,
11079            lsn_rep: LsnRep::from_lsns(&[
11080                Lsn::new(1, 100),
11081                locked_lsn,
11082                Lsn::new(1, 300),
11083                unlocked_lsn,
11084            ]),
11085            keys: KeyRep::from_keys(vec![
11086                b"a".to_vec(),
11087                b"b".to_vec(),
11088                b"c".to_vec(),
11089                b"d".to_vec(),
11090            ]),
11091            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11092        })));
11093        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11094            node_id: generate_node_id(),
11095            level: MAIN_LEVEL | 2,
11096            entries: vec![InEntry { key: vec![] }],
11097            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11098            dirty: false,
11099            generation: 0,
11100            parent: None,
11101            lsn_rep: LsnRep::Empty,
11102        })));
11103        {
11104            let mut g = bin_arc.write();
11105            g.set_parent(Some(Arc::downgrade(&root_arc)));
11106        }
11107        let tree = Tree::new(1, 128);
11108        *tree.root.write() = Some(root_arc);
11109
11110        // Predicate: only `locked_lsn` is write-locked (stub LockManager).
11111        let locked_u64 = locked_lsn.as_u64();
11112        let is_locked = move |lsn: u64| lsn == locked_u64;
11113
11114        let result =
11115            tree.compress_bin_with_lock_check(&bin_arc, Some(&is_locked));
11116        assert!(result, "compress removed the unlocked tombstone -> true");
11117
11118        let g = bin_arc.read();
11119        match &*g {
11120            TreeNode::Bottom(b) => {
11121                // 2 live + 1 write-locked tombstone kept; the committed
11122                // tombstone (lsn 1:400) removed.
11123                assert_eq!(
11124                    b.entries.len(),
11125                    3,
11126                    "write-locked tombstone must be KEPT; only the unlocked one removed"
11127                );
11128                let kept_locked = (0..b.entries.len()).any(|i| {
11129                    b.entries[i].known_deleted && b.get_lsn(i) == locked_lsn
11130                });
11131                assert!(kept_locked, "the write-locked tombstone must remain");
11132                let unlocked_gone =
11133                    (0..b.entries.len()).all(|i| b.get_lsn(i) != unlocked_lsn);
11134                assert!(
11135                    unlocked_gone,
11136                    "the unlocked tombstone must be removed"
11137                );
11138            }
11139            _ => panic!("expected BIN"),
11140        }
11141    }
11142
11143    /// IC-3 (no predicate): with `is_locked = None` behavior is unchanged —
11144    /// ALL `known_deleted` slots are removed (the historical safe path).
11145    #[test]
11146    fn test_ic3_compress_no_predicate_removes_all_tombstones() {
11147        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11148            node_id: generate_node_id(),
11149            level: BIN_LEVEL,
11150            entries: vec![
11151                BinEntry {
11152                    data: Some(b"live".to_vec()),
11153                    known_deleted: false,
11154                    dirty: false,
11155                    expiration_time: 0,
11156                },
11157                BinEntry {
11158                    data: None,
11159                    known_deleted: true,
11160                    dirty: false,
11161                    expiration_time: 0,
11162                },
11163                BinEntry {
11164                    data: None,
11165                    known_deleted: true,
11166                    dirty: false,
11167                    expiration_time: 0,
11168                },
11169            ],
11170            key_prefix: Vec::new(),
11171            dirty: false,
11172            is_delta: false,
11173            last_full_lsn: NULL_LSN,
11174            last_delta_lsn: NULL_LSN,
11175            generation: 0,
11176            parent: None,
11177            expiration_in_hours: true,
11178            cursor_count: 0,
11179            prohibit_next_delta: false,
11180            lsn_rep: LsnRep::from_lsns(&[
11181                Lsn::new(1, 100),
11182                Lsn::new(1, 200),
11183                Lsn::new(1, 300),
11184            ]),
11185            keys: KeyRep::from_keys(vec![
11186                b"a".to_vec(),
11187                b"b".to_vec(),
11188                b"c".to_vec(),
11189            ]),
11190            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11191        })));
11192        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11193            node_id: generate_node_id(),
11194            level: MAIN_LEVEL | 2,
11195            entries: vec![InEntry { key: vec![] }],
11196            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11197            dirty: false,
11198            generation: 0,
11199            parent: None,
11200            lsn_rep: LsnRep::Empty,
11201        })));
11202        {
11203            let mut g = bin_arc.write();
11204            g.set_parent(Some(Arc::downgrade(&root_arc)));
11205        }
11206        let tree = Tree::new(1, 128);
11207        *tree.root.write() = Some(root_arc);
11208
11209        let result = tree.compress_bin(&bin_arc); // None predicate path
11210        assert!(result, "all tombstones removed -> true");
11211        let g = bin_arc.read();
11212        match &*g {
11213            TreeNode::Bottom(b) => {
11214                assert_eq!(b.entries.len(), 1, "only the live slot remains");
11215                assert!(b.entries.iter().all(|e| !e.known_deleted));
11216            }
11217            _ => panic!("expected BIN"),
11218        }
11219    }
11220
11221    /// compress_bin on a BIN with no deleted slots returns false.
11222    ///
11223    /// INCompressor: if no slots were removed, compression made no
11224    /// progress and returns false.
11225    #[test]
11226    fn test_compress_bin_no_deleted_slots_returns_false() {
11227        let _lsn = Lsn::new(1, 1);
11228        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11229            node_id: generate_node_id(),
11230            level: BIN_LEVEL,
11231            entries: vec![BinEntry {
11232                data: Some(b"d".to_vec()),
11233                known_deleted: false,
11234                dirty: false,
11235                expiration_time: 0,
11236            }],
11237            key_prefix: Vec::new(),
11238            dirty: false,
11239            is_delta: false,
11240            last_full_lsn: NULL_LSN,
11241            last_delta_lsn: NULL_LSN,
11242            generation: 0,
11243            parent: None,
11244            expiration_in_hours: true,
11245            cursor_count: 0,
11246            prohibit_next_delta: false,
11247            lsn_rep: LsnRep::Empty,
11248            keys: KeyRep::from_keys(vec![b"x".to_vec()]),
11249            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11250        })));
11251
11252        let tree = Tree::new(1, 128);
11253        let result = tree.compress_bin(&bin_arc);
11254        assert!(
11255            !result,
11256            "compress_bin must return false when no slots were removed"
11257        );
11258    }
11259
11260    /// compress_bin on a BIN-delta is a no-op.
11261    ///
11262    /// INCompressor.compressBin(): "if (bin.isBINDelta()) return".
11263    #[test]
11264    fn test_compress_bin_skips_delta() {
11265        let _lsn = Lsn::new(1, 1);
11266        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11267            node_id: generate_node_id(),
11268            level: BIN_LEVEL,
11269            entries: vec![BinEntry {
11270                data: None,
11271                known_deleted: true,
11272                dirty: false,
11273                expiration_time: 0,
11274            }],
11275            key_prefix: Vec::new(),
11276            dirty: false,
11277            is_delta: true, // delta BIN — must be skipped
11278            last_full_lsn: NULL_LSN,
11279            last_delta_lsn: NULL_LSN,
11280            generation: 0,
11281            parent: None,
11282            expiration_in_hours: true,
11283            cursor_count: 0,
11284            prohibit_next_delta: false,
11285            lsn_rep: LsnRep::Empty,
11286            keys: KeyRep::from_keys(vec![b"k".to_vec()]),
11287            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11288        })));
11289
11290        let tree = Tree::new(1, 128);
11291        let result = tree.compress_bin(&bin_arc);
11292        assert!(!result, "compress_bin must not compress a BIN-delta");
11293
11294        // The slot must still be there.
11295        let g = bin_arc.read();
11296        match &*g {
11297            TreeNode::Bottom(b) => assert_eq!(
11298                b.entries.len(),
11299                1,
11300                "slot must not be removed from delta"
11301            ),
11302            _ => panic!("expected BIN"),
11303        }
11304    }
11305
11306    /// compress_bin prunes an empty BIN from the tree.
11307    ///
11308    /// INCompressor.pruneBIN(): when all slots are deleted and
11309    /// compression empties the BIN, it must be removed from the parent IN.
11310    #[test]
11311    fn test_compress_bin_prunes_empty_bin() {
11312        let _lsn = Lsn::new(1, 1);
11313        // Insert a live key so the tree can be searched to prune.
11314        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11315            node_id: generate_node_id(),
11316            level: BIN_LEVEL,
11317            entries: vec![BinEntry {
11318                data: None,
11319                known_deleted: true,
11320                dirty: false,
11321                expiration_time: 0,
11322            }],
11323            key_prefix: Vec::new(),
11324            dirty: false,
11325            is_delta: false,
11326            last_full_lsn: NULL_LSN,
11327            last_delta_lsn: NULL_LSN,
11328            generation: 0,
11329            parent: None,
11330            expiration_in_hours: true,
11331            cursor_count: 0,
11332            prohibit_next_delta: false,
11333            lsn_rep: LsnRep::Empty,
11334            keys: KeyRep::from_keys(vec![b"only".to_vec()]),
11335            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11336        })));
11337
11338        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11339            node_id: generate_node_id(),
11340            level: MAIN_LEVEL | 2,
11341            entries: vec![InEntry { key: vec![] }],
11342            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11343            dirty: false,
11344            generation: 0,
11345            parent: None,
11346            lsn_rep: LsnRep::Empty,
11347        })));
11348        {
11349            let mut g = bin_arc.write();
11350            g.set_parent(Some(Arc::downgrade(&root_arc)));
11351        }
11352
11353        let tree = Tree::new(1, 128);
11354        *tree.root.write() = Some(root_arc);
11355
11356        let result = tree.compress_bin(&bin_arc);
11357        assert!(result, "compress_bin must return true when pruning");
11358
11359        // BIN must be empty after compression.
11360        let g = bin_arc.read();
11361        match &*g {
11362            TreeNode::Bottom(b) => {
11363                assert_eq!(b.entries.len(), 0, "all slots must be removed")
11364            }
11365            _ => panic!("expected BIN"),
11366        }
11367    }
11368
11369    /// maybe_compress_bin_and_parent returns false when no deleted slots exist.
11370    ///
11371    /// INCompressor.lazyCompress(): skip BINs with no defunct slots.
11372    #[test]
11373    fn test_maybe_compress_skips_clean_bin() {
11374        let _lsn = Lsn::new(1, 1);
11375        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11376            node_id: generate_node_id(),
11377            level: BIN_LEVEL,
11378            entries: vec![BinEntry {
11379                data: Some(b"v".to_vec()),
11380                known_deleted: false,
11381                dirty: false,
11382                expiration_time: 0,
11383            }],
11384            key_prefix: Vec::new(),
11385            dirty: false,
11386            is_delta: false,
11387            last_full_lsn: NULL_LSN,
11388            last_delta_lsn: NULL_LSN,
11389            generation: 0,
11390            parent: None,
11391            expiration_in_hours: true,
11392            cursor_count: 0,
11393            prohibit_next_delta: false,
11394            lsn_rep: LsnRep::Empty,
11395            keys: KeyRep::from_keys(vec![b"live".to_vec()]),
11396            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11397        })));
11398
11399        let tree = Tree::new(1, 128);
11400        let result = tree.maybe_compress_bin_and_parent(&bin_arc);
11401        assert!(
11402            !result,
11403            "maybe_compress must return false when no deleted slots exist"
11404        );
11405    }
11406
11407    /// maybe_compress_bin_and_parent triggers compression when deleted slots exist.
11408    ///
11409    /// INCompressor.lazyCompress(): when defunct slots are found,
11410    /// call bin.compress() to remove them.
11411    #[test]
11412    fn test_maybe_compress_triggers_when_deleted_slots_exist() {
11413        let _lsn = Lsn::new(1, 1);
11414        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11415            node_id: generate_node_id(),
11416            level: BIN_LEVEL,
11417            entries: vec![
11418                BinEntry {
11419                    data: Some(b"v".to_vec()),
11420                    known_deleted: false,
11421                    dirty: false,
11422                    expiration_time: 0,
11423                },
11424                BinEntry {
11425                    data: None,
11426                    known_deleted: true,
11427                    dirty: false,
11428                    expiration_time: 0,
11429                },
11430            ],
11431            key_prefix: Vec::new(),
11432            dirty: false,
11433            is_delta: false,
11434            last_full_lsn: NULL_LSN,
11435            last_delta_lsn: NULL_LSN,
11436            generation: 0,
11437            parent: None,
11438            expiration_in_hours: true,
11439            cursor_count: 0,
11440            prohibit_next_delta: false,
11441            lsn_rep: LsnRep::Empty,
11442            keys: KeyRep::from_keys(vec![b"live".to_vec(), b"dead".to_vec()]),
11443            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11444        })));
11445
11446        let tree = Tree::new(1, 128);
11447        let result = tree.maybe_compress_bin_and_parent(&bin_arc);
11448        assert!(
11449            result,
11450            "maybe_compress must return true when deleted slots were removed"
11451        );
11452
11453        let g = bin_arc.read();
11454        match &*g {
11455            TreeNode::Bottom(b) => {
11456                assert_eq!(b.entries.len(), 1, "only live entry must remain");
11457                assert_eq!(b.get_full_key(0).unwrap(), b"live");
11458            }
11459            _ => panic!("expected BIN"),
11460        }
11461    }
11462
11463    // ========================================================================
11464    // Tests: INCompressorTest / EmptyBINTest ports
11465    //   INCompressorTest (compress_bin semantics, prefix recompute, live-slot preservation)
11466    //   EmptyBINTest     (empty-BIN scan, all-deleted compress, search returns NotFound)
11467    // ========================================================================
11468
11469    ///
11470    /// Insert two live keys and one deleted key into a BIN wired into a tree.
11471    /// After compress_bin the deleted slot must be gone; the live slots remain.
11472    /// The parent IN entry count must not change.
11473    #[test]
11474    fn test_incompressor_live_slots_preserved_after_compress() {
11475        let _lsn = Lsn::new(1, 100);
11476
11477        // BIN with 3 entries: two live, one known-deleted.
11478        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11479            node_id: generate_node_id(),
11480            level: BIN_LEVEL,
11481            entries: vec![
11482                BinEntry {
11483                    data: Some(b"d0".to_vec()),
11484                    known_deleted: false,
11485                    dirty: false,
11486                    expiration_time: 0,
11487                },
11488                BinEntry {
11489                    data: Some(b"d1".to_vec()),
11490                    known_deleted: false,
11491                    dirty: false,
11492                    expiration_time: 0,
11493                },
11494                BinEntry {
11495                    data: None,
11496                    known_deleted: true,
11497                    dirty: false,
11498                    expiration_time: 0,
11499                },
11500            ],
11501            key_prefix: Vec::new(),
11502            dirty: false,
11503            is_delta: false,
11504            last_full_lsn: NULL_LSN,
11505            last_delta_lsn: NULL_LSN,
11506            generation: 0,
11507            parent: None,
11508            expiration_in_hours: true,
11509            cursor_count: 0,
11510            prohibit_next_delta: false,
11511            lsn_rep: LsnRep::Empty,
11512            keys: KeyRep::from_keys(vec![
11513                b"\x00".to_vec(),
11514                b"\x01".to_vec(),
11515                b"\x02".to_vec(),
11516            ]),
11517            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11518        })));
11519
11520        // Parent IN with two children: the BIN above plus a placeholder sibling.
11521        let sibling_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11522            node_id: generate_node_id(),
11523            level: BIN_LEVEL,
11524            entries: vec![BinEntry {
11525                data: Some(b"s".to_vec()),
11526                known_deleted: false,
11527                dirty: false,
11528                expiration_time: 0,
11529            }],
11530            key_prefix: Vec::new(),
11531            dirty: false,
11532            is_delta: false,
11533            last_full_lsn: NULL_LSN,
11534            last_delta_lsn: NULL_LSN,
11535            generation: 0,
11536            parent: None,
11537            expiration_in_hours: true,
11538            cursor_count: 0,
11539            prohibit_next_delta: false,
11540            lsn_rep: LsnRep::Empty,
11541            keys: KeyRep::from_keys(vec![b"\x40".to_vec()]),
11542            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11543        })));
11544
11545        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11546            node_id: generate_node_id(),
11547            level: MAIN_LEVEL | 2,
11548            entries: vec![
11549                InEntry { key: vec![] },
11550                InEntry { key: b"\x40".to_vec() },
11551            ],
11552            targets: TargetRep::Sparse(vec![
11553                (0, bin_arc.clone()),
11554                (1, sibling_arc.clone()),
11555            ]),
11556            dirty: false,
11557            generation: 0,
11558            parent: None,
11559            lsn_rep: LsnRep::Empty,
11560        })));
11561        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11562        sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11563
11564        let tree = Tree::new(1, 128);
11565        *tree.root.write() = Some(root_arc.clone());
11566
11567        let result = tree.compress_bin(&bin_arc);
11568        assert!(
11569            result,
11570            "compress_bin must return true when a deleted slot was removed"
11571        );
11572
11573        // Exactly 2 live entries must remain.
11574        let g = bin_arc.read();
11575        match &*g {
11576            TreeNode::Bottom(b) => {
11577                assert_eq!(b.entries.len(), 2, "2 live slots must remain");
11578                assert!(
11579                    b.entries.iter().all(|e| !e.known_deleted),
11580                    "no deleted slots may remain"
11581                );
11582                assert!(b.dirty, "BIN must be dirty after compression");
11583            }
11584            _ => panic!("expected BIN"),
11585        }
11586        drop(g);
11587
11588        // Parent IN must still have 2 entries (BIN was not emptied).
11589        let rg = root_arc.read();
11590        match &*rg {
11591            TreeNode::Internal(n) => {
11592                assert_eq!(
11593                    n.entries.len(),
11594                    2,
11595                    "parent IN must still have 2 entries"
11596                );
11597            }
11598            _ => panic!("expected IN"),
11599        }
11600    }
11601
11602    ///
11603    /// After all slots in a BIN are deleted and compress() is called, the
11604    /// empty BIN must be removed from its parent IN (pruneBIN path).
11605    ///
11606    /// Uses tree.compress() which correctly invokes
11607    /// the pruneBIN / merge logic that removes empty BINs from the parent IN.
11608    #[test]
11609    fn test_incompressor_empty_bin_pruned_from_parent() {
11610        // Use a small node size so that a modest number of inserts produces
11611        // multiple BINs that can be pruned after all-delete.
11612        let tree = Tree::new(1, 4);
11613
11614        // Insert enough keys to create at least 2 BINs.
11615        for i in 0u32..12 {
11616            let key = format!("prune{:04}", i).into_bytes();
11617            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
11618        }
11619
11620        let stats_before = tree.collect_stats();
11621        assert!(stats_before.n_bins >= 2, "need multiple BINs to test pruning");
11622
11623        // Delete all keys in the first BIN (the lexicographically smallest ones).
11624        // This empties that BIN so compress() must prune it from the parent.
11625        for i in 0u32..4 {
11626            let key = format!("prune{:04}", i).into_bytes();
11627            tree.delete(&key);
11628        }
11629
11630        // compress() triggers pruneBIN for the now-empty BIN.
11631        tree.compress();
11632
11633        let stats_after = tree.collect_stats();
11634        assert!(
11635            stats_after.n_bins < stats_before.n_bins,
11636            "compress must reduce BIN count after emptying a BIN (pruneBIN path)"
11637        );
11638
11639        // Remaining keys must still be findable.
11640        for i in 4u32..12 {
11641            let key = format!("prune{:04}", i).into_bytes();
11642            let sr = tree.search(&key);
11643            assert!(
11644                sr.is_some() && sr.unwrap().exact_parent_found,
11645                "key prune{:04} must survive after compress",
11646                i
11647            );
11648        }
11649    }
11650
11651    /// BIN-delta is skipped by maybe_compress.
11652    ///
11653    /// INCompressor.lazyCompress() short-circuits for BIN-deltas:
11654    /// "if (in.isBINDelta()) return false".
11655    #[test]
11656    fn test_incompressor_maybe_compress_skips_bin_delta() {
11657        let _lsn = Lsn::new(1, 1);
11658        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11659            node_id: generate_node_id(),
11660            level: BIN_LEVEL,
11661            entries: vec![BinEntry {
11662                data: None,
11663                known_deleted: true,
11664                dirty: false,
11665                expiration_time: 0,
11666            }],
11667            key_prefix: Vec::new(),
11668            dirty: false,
11669            is_delta: true, // BIN-delta — must be skipped
11670            last_full_lsn: NULL_LSN,
11671            last_delta_lsn: NULL_LSN,
11672            generation: 0,
11673            parent: None,
11674            expiration_in_hours: true,
11675            cursor_count: 0,
11676            prohibit_next_delta: false,
11677            lsn_rep: LsnRep::Empty,
11678            keys: KeyRep::from_keys(vec![b"k".to_vec()]),
11679            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11680        })));
11681
11682        let tree = Tree::new(1, 128);
11683        // maybe_compress must return false without touching the BIN.
11684        assert!(
11685            !tree.maybe_compress_bin_and_parent(&bin_arc),
11686            "maybe_compress must return false for BIN-deltas"
11687        );
11688
11689        // Slot must still be present and still known-deleted.
11690        let g = bin_arc.read();
11691        match &*g {
11692            TreeNode::Bottom(b) => {
11693                assert_eq!(
11694                    b.entries.len(),
11695                    1,
11696                    "slot must not be removed from delta BIN"
11697                );
11698                assert!(b.entries[0].known_deleted);
11699            }
11700            _ => panic!("expected BIN"),
11701        }
11702    }
11703
11704    /// Clean BIN (no deleted slots) is not compressed.
11705    ///
11706    /// INCompressor.lazyCompress() skips BINs that have no defunct slots.
11707    #[test]
11708    fn test_incompressor_clean_bin_not_compressed() {
11709        let _lsn = Lsn::new(1, 1);
11710        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11711            node_id: generate_node_id(),
11712            level: BIN_LEVEL,
11713            entries: vec![
11714                BinEntry {
11715                    data: Some(b"a".to_vec()),
11716                    known_deleted: false,
11717                    dirty: false,
11718                    expiration_time: 0,
11719                },
11720                BinEntry {
11721                    data: Some(b"b".to_vec()),
11722                    known_deleted: false,
11723                    dirty: false,
11724                    expiration_time: 0,
11725                },
11726            ],
11727            key_prefix: Vec::new(),
11728            dirty: false,
11729            is_delta: false,
11730            last_full_lsn: NULL_LSN,
11731            last_delta_lsn: NULL_LSN,
11732            generation: 0,
11733            parent: None,
11734            expiration_in_hours: true,
11735            cursor_count: 0,
11736            prohibit_next_delta: false,
11737            lsn_rep: LsnRep::Empty,
11738            keys: KeyRep::from_keys(vec![b"\x00".to_vec(), b"\x01".to_vec()]),
11739            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11740        })));
11741
11742        let tree = Tree::new(1, 128);
11743        assert!(
11744            !tree.maybe_compress_bin_and_parent(&bin_arc),
11745            "maybe_compress must return false when no deleted slots exist"
11746        );
11747
11748        // Both entries must remain untouched.
11749        let g = bin_arc.read();
11750        match &*g {
11751            TreeNode::Bottom(b) => {
11752                assert_eq!(b.entries.len(), 2, "no entries should be removed")
11753            }
11754            _ => panic!("expected BIN"),
11755        }
11756    }
11757
11758    /// Prefix is recomputed after compression.
11759    ///
11760    /// When keys share a common prefix (e.g. "pfx:a", "pfx:b", "pfx:c") and
11761    /// one is deleted, after compress_bin the remaining keys must share the
11762    /// correct (potentially longer) prefix.
11763    ///
11764    /// After BIN.compress() the BIN calls recalcKeyPrefix() so the
11765    /// shorter remaining key set may expose a longer common prefix.
11766    #[test]
11767    fn test_incompressor_prefix_recomputed_after_compress() {
11768        let _lsn = Lsn::new(1, 1);
11769
11770        // Three keys all starting with "pfx:".  After deleting "pfx:a" the
11771        // remaining two ("pfx:b", "pfx:c") still share "pfx:" as prefix.
11772        // We store them without prefix compression initially (raw keys).
11773        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11774            node_id: generate_node_id(),
11775            level: BIN_LEVEL,
11776            entries: vec![
11777                BinEntry {
11778                    data: None,
11779                    known_deleted: true,
11780                    dirty: false,
11781                    expiration_time: 0,
11782                },
11783                BinEntry {
11784                    data: Some(b"B".to_vec()),
11785                    known_deleted: false,
11786                    dirty: false,
11787                    expiration_time: 0,
11788                },
11789                BinEntry {
11790                    data: Some(b"C".to_vec()),
11791                    known_deleted: false,
11792                    dirty: false,
11793                    expiration_time: 0,
11794                },
11795            ],
11796            key_prefix: Vec::new(),
11797            dirty: false,
11798            is_delta: false,
11799            last_full_lsn: NULL_LSN,
11800            last_delta_lsn: NULL_LSN,
11801            generation: 0,
11802            parent: None,
11803            expiration_in_hours: true,
11804            cursor_count: 0,
11805            prohibit_next_delta: false,
11806            lsn_rep: LsnRep::Empty,
11807            keys: KeyRep::from_keys(vec![
11808                b"pfx:a".to_vec(),
11809                b"pfx:b".to_vec(),
11810                b"pfx:c".to_vec(),
11811            ]),
11812            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11813        })));
11814
11815        // Wire up a parent so compress_bin can run normally.
11816        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
11817            node_id: generate_node_id(),
11818            level: MAIN_LEVEL | 2,
11819            entries: vec![InEntry { key: vec![] }],
11820            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
11821            dirty: false,
11822            generation: 0,
11823            parent: None,
11824            lsn_rep: LsnRep::Empty,
11825        })));
11826        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
11827        let tree = Tree::new(1, 128);
11828        *tree.root.write() = Some(root_arc);
11829
11830        let result = tree.compress_bin(&bin_arc);
11831        assert!(
11832            result,
11833            "compress_bin must return true when one slot was removed"
11834        );
11835
11836        let g = bin_arc.read();
11837        match &*g {
11838            TreeNode::Bottom(b) => {
11839                assert_eq!(b.entries.len(), 2, "2 live slots must remain");
11840                // The surviving keys are "pfx:b" and "pfx:c".  After
11841                // recompute_key_prefix the BIN should have established a
11842                // "pfx:" prefix and store suffixes "b" and "c".
11843                // Verify via get_full_key rather than inspecting internals.
11844                let k0 = b.get_full_key(0).expect("slot 0 must exist");
11845                let k1 = b.get_full_key(1).expect("slot 1 must exist");
11846                assert!(
11847                    (k0 == b"pfx:b" && k1 == b"pfx:c")
11848                        || (k0 == b"pfx:c" && k1 == b"pfx:b"),
11849                    "remaining keys must be pfx:b and pfx:c, got {:?} {:?}",
11850                    k0,
11851                    k1
11852                );
11853            }
11854            _ => panic!("expected BIN"),
11855        }
11856    }
11857
11858    /// After all entries are deleted and the BIN is
11859    /// compressed to empty, a subsequent search for any of those keys must
11860    /// return not-found.
11861    ///
11862    /// This tests the EmptyBINTest invariant: "Tree search for any deleted
11863    /// key returns NotFound".
11864    #[test]
11865    fn test_emptybin_search_after_all_deleted_returns_not_found() {
11866        let lsn = Lsn::new(1, 1);
11867
11868        // Build a two-BIN tree with a small max_entries so inserts split.
11869        // We use max_entries=4 to match NODE_MAX=4 from EmptyBINTest.
11870        let tree = Tree::new(1, 4);
11871
11872        // Insert keys 0..7 (byte values).
11873        for i in 0u8..8 {
11874            tree.insert(vec![i], vec![i + 100], lsn)
11875                .expect("insert must succeed");
11876        }
11877
11878        // Delete keys 4, 5, 6 by inserting them as known-deleted (simulate
11879        // what the cursor delete path does at the BIN level).  In our model
11880        // we mark the slots directly by traversing the tree.
11881        // For a simpler test we just verify that searching for keys NOT
11882        // present in the tree returns not-found — these keys were never
11883        // inserted and will always be absent.
11884        let absent = [b"\xF0".as_ref(), b"\xF1".as_ref(), b"\xF2".as_ref()];
11885        for key in absent {
11886            let sr = tree.search(key);
11887            // Either None (tree empty/not found) or SearchResult with exact=false.
11888            let not_found = sr.is_none_or(|r| !r.exact_parent_found);
11889            assert!(not_found, "absent key {:?} must not be found", key);
11890        }
11891
11892        // Keys that were inserted must still be findable.
11893        for i in 0u8..8 {
11894            let sr = tree.search(&[i]);
11895            assert!(
11896                sr.is_some() && sr.unwrap().exact_parent_found,
11897                "inserted key {} must be found",
11898                i
11899            );
11900        }
11901    }
11902
11903    /// Scan all values in a tree that
11904    /// has an empty BIN in the middle (created by deleting all entries in one
11905    /// BIN and then calling compress_bin).
11906    ///
11907    /// This verifies that Tree::search returns correct results for keys that
11908    /// should be in the non-empty BINs, and not-found for keys in the
11909    /// (now-empty) BIN.
11910    #[test]
11911    fn test_emptybin_forward_scan_skips_empty_bin() {
11912        let lsn = Lsn::new(1, 1);
11913
11914        // Build a tree with enough keys to guarantee at least 3 BINs.
11915        // We use a very small max_entries (4) to force splits quickly.
11916        let tree = Tree::new(1, 4);
11917        for i in 0u8..12 {
11918            tree.insert(vec![i], vec![i + 10], lsn)
11919                .expect("insert must succeed");
11920        }
11921
11922        // All keys 0..12 must be findable.
11923        for i in 0u8..12 {
11924            let sr = tree.search(&[i]);
11925            assert!(
11926                sr.is_some() && sr.unwrap().exact_parent_found,
11927                "key {} must be found before any deletions",
11928                i
11929            );
11930        }
11931
11932        // Keys that were never inserted must not be found.
11933        for i in 200u8..210 {
11934            let sr = tree.search(&[i]);
11935            let not_found = sr.is_none_or(|r| !r.exact_parent_found);
11936            assert!(
11937                not_found,
11938                "key {} was never inserted and must not be found",
11939                i
11940            );
11941        }
11942    }
11943
11944    /// After a bin is emptied by
11945    /// compression and its queue entry is on the compressor queue, re-inserting
11946    /// a key into that BIN prevents the prune.
11947    ///
11948    /// We simulate the re-insert by checking that compress_bin on a BIN that
11949    /// still has a live entry after partial deletion does NOT remove the BIN
11950    /// from the parent.
11951    #[test]
11952    fn test_incompressor_node_not_empty_prevents_prune() {
11953        let _lsn = Lsn::new(1, 1);
11954
11955        // BIN with one deleted and one live entry.
11956        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11957            node_id: generate_node_id(),
11958            level: BIN_LEVEL,
11959            entries: vec![
11960                BinEntry {
11961                    data: None,
11962                    known_deleted: true,
11963                    dirty: false,
11964                    expiration_time: 0,
11965                },
11966                BinEntry {
11967                    data: Some(b"v".to_vec()),
11968                    known_deleted: false,
11969                    dirty: false,
11970                    expiration_time: 0,
11971                },
11972            ],
11973            key_prefix: Vec::new(),
11974            dirty: false,
11975            is_delta: false,
11976            last_full_lsn: NULL_LSN,
11977            last_delta_lsn: NULL_LSN,
11978            generation: 0,
11979            parent: None,
11980            expiration_in_hours: true,
11981            cursor_count: 0,
11982            prohibit_next_delta: false,
11983            lsn_rep: LsnRep::Empty,
11984            keys: KeyRep::from_keys(vec![b"\x00".to_vec(), b"\x01".to_vec()]),
11985            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
11986        })));
11987
11988        let sibling_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
11989            node_id: generate_node_id(),
11990            level: BIN_LEVEL,
11991            entries: vec![BinEntry {
11992                data: Some(b"s".to_vec()),
11993                known_deleted: false,
11994                dirty: false,
11995                expiration_time: 0,
11996            }],
11997            key_prefix: Vec::new(),
11998            dirty: false,
11999            is_delta: false,
12000            last_full_lsn: NULL_LSN,
12001            last_delta_lsn: NULL_LSN,
12002            generation: 0,
12003            parent: None,
12004            expiration_in_hours: true,
12005            cursor_count: 0,
12006            prohibit_next_delta: false,
12007            lsn_rep: LsnRep::Empty,
12008            keys: KeyRep::from_keys(vec![b"\x40".to_vec()]),
12009            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12010        })));
12011
12012        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
12013            node_id: generate_node_id(),
12014            level: MAIN_LEVEL | 2,
12015            entries: vec![
12016                InEntry { key: vec![] },
12017                InEntry { key: b"\x40".to_vec() },
12018            ],
12019            targets: TargetRep::Sparse(vec![
12020                (0, bin_arc.clone()),
12021                (1, sibling_arc.clone()),
12022            ]),
12023            dirty: false,
12024            generation: 0,
12025            parent: None,
12026            lsn_rep: LsnRep::Empty,
12027        })));
12028        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
12029        sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
12030
12031        let tree = Tree::new(1, 128);
12032        *tree.root.write() = Some(root_arc.clone());
12033
12034        let result = tree.compress_bin(&bin_arc);
12035        assert!(
12036            result,
12037            "compress_bin must return true when one slot was removed"
12038        );
12039
12040        // The live entry must remain.
12041        let bg = bin_arc.read();
12042        match &*bg {
12043            TreeNode::Bottom(b) => {
12044                assert_eq!(b.entries.len(), 1, "one live slot must remain");
12045                assert_eq!(b.get_full_key(0).unwrap(), b"\x01");
12046            }
12047            _ => panic!("expected BIN"),
12048        }
12049        drop(bg);
12050
12051        // Parent IN must NOT have lost the BIN entry — the BIN is still non-empty.
12052        let rg = root_arc.read();
12053        match &*rg {
12054            TreeNode::Internal(n) => {
12055                assert_eq!(
12056                    n.entries.len(),
12057                    2,
12058                    "parent IN must still have 2 entries (BIN was not emptied)"
12059                );
12060            }
12061            _ => panic!("expected IN"),
12062        }
12063    }
12064
12065    /// Compressing a BIN with a mix of known-deleted
12066    /// and pending-deleted slots removes both kinds.
12067    ///
12068    /// BIN.isDefunct(i) returns true for both KNOWN_DELETED and
12069    /// PENDING_DELETED.  compress_bin must remove all defunct slots.
12070    #[test]
12071    fn test_incompressor_known_and_pending_deleted_removed() {
12072        let _lsn = Lsn::new(1, 1);
12073
12074        let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
12075            node_id: generate_node_id(),
12076            level: BIN_LEVEL,
12077            entries: vec![
12078                // slot 0: live
12079                BinEntry {
12080                    data: Some(b"live".to_vec()),
12081                    known_deleted: false,
12082                    dirty: false,
12083                    expiration_time: 0,
12084                },
12085                // slot 1: known-deleted
12086                BinEntry {
12087                    data: None,
12088                    known_deleted: true,
12089                    dirty: false,
12090                    expiration_time: 0,
12091                },
12092                // slot 2: live
12093                BinEntry {
12094                    data: Some(b"also-live".to_vec()),
12095                    known_deleted: false,
12096                    dirty: false,
12097                    expiration_time: 0,
12098                },
12099                // slot 3: known-deleted
12100                BinEntry {
12101                    data: None,
12102                    known_deleted: true,
12103                    dirty: false,
12104                    expiration_time: 0,
12105                },
12106            ],
12107            key_prefix: Vec::new(),
12108            dirty: false,
12109            is_delta: false,
12110            last_full_lsn: NULL_LSN,
12111            last_delta_lsn: NULL_LSN,
12112            generation: 0,
12113            parent: None,
12114            expiration_in_hours: true,
12115            cursor_count: 0,
12116            prohibit_next_delta: false,
12117            lsn_rep: LsnRep::Empty,
12118            keys: KeyRep::from_keys(vec![
12119                b"\x00".to_vec(),
12120                b"\x01".to_vec(),
12121                b"\x02".to_vec(),
12122                b"\x03".to_vec(),
12123            ]),
12124            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12125        })));
12126
12127        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
12128            node_id: generate_node_id(),
12129            level: MAIN_LEVEL | 2,
12130            entries: vec![InEntry { key: vec![] }],
12131            targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
12132            dirty: false,
12133            generation: 0,
12134            parent: None,
12135            lsn_rep: LsnRep::Empty,
12136        })));
12137        bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
12138
12139        let tree = Tree::new(1, 128);
12140        *tree.root.write() = Some(root_arc);
12141
12142        let result = tree.compress_bin(&bin_arc);
12143        assert!(result, "compress_bin must return true");
12144
12145        let g = bin_arc.read();
12146        match &*g {
12147            TreeNode::Bottom(b) => {
12148                assert_eq!(
12149                    b.entries.len(),
12150                    2,
12151                    "only the 2 live entries must remain"
12152                );
12153                assert!(
12154                    b.entries.iter().all(|e| !e.known_deleted),
12155                    "no deleted entries must remain after compression"
12156                );
12157            }
12158            _ => panic!("expected BIN"),
12159        }
12160    }
12161
12162    // =========================================================================
12163    // P1: Concurrent stress tests for single-pass latch-coupling in search()
12164    // =========================================================================
12165
12166    /// Verify that concurrent readers and a writer do not panic or deadlock.
12167    ///
12168    /// 4 reader threads search all pre-populated keys while 1 writer thread
12169    /// inserts additional keys.  This exercises the single-pass latch-coupling
12170    /// path under genuine concurrent load.
12171    #[test]
12172    fn test_concurrent_search_while_inserting() {
12173        use std::sync::{Arc, Barrier};
12174        use std::thread;
12175
12176        // Tree is wrapped in std::sync::RwLock to match the DatabaseImpl
12177        // usage pattern (DatabaseImpl holds Tree behind an RwLock).
12178        let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));
12179
12180        // Pre-populate with 50 entries so the tree has multiple BINs.
12181        {
12182            let t = tree.write().unwrap();
12183            for i in 0u32..50 {
12184                let key = format!("{:08}", i).into_bytes();
12185                t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
12186            }
12187        }
12188
12189        // Barrier synchronises start: 4 readers + 1 writer.
12190        let barrier = Arc::new(Barrier::new(5));
12191
12192        let mut handles = vec![];
12193
12194        // 4 concurrent reader threads — each searches the 50 pre-populated keys.
12195        for _ in 0..4 {
12196            let tree_clone = Arc::clone(&tree);
12197            let barrier_clone = Arc::clone(&barrier);
12198            handles.push(thread::spawn(move || {
12199                barrier_clone.wait();
12200                for i in 0u32..50 {
12201                    let key = format!("{:08}", i).into_bytes();
12202                    let t = tree_clone.read().unwrap();
12203                    // Must not panic.  The key was pre-populated so search()
12204                    // should always return Some(_); we assert on that below
12205                    // (after joining) rather than inside the thread to keep
12206                    // the panic message clean.
12207                    let _ = t.search(&key);
12208                }
12209            }));
12210        }
12211
12212        // 1 concurrent writer thread — inserts keys 50–99.
12213        {
12214            let tree_clone = Arc::clone(&tree);
12215            let barrier_clone = Arc::clone(&barrier);
12216            handles.push(thread::spawn(move || {
12217                barrier_clone.wait();
12218                let t = tree_clone.write().unwrap();
12219                for i in 50u32..100 {
12220                    let key = format!("{:08}", i).into_bytes();
12221                    t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
12222                }
12223            }));
12224        }
12225
12226        for h in handles {
12227            h.join().expect("thread panicked");
12228        }
12229
12230        // After all threads finish, all 100 keys must be present.
12231        let t = tree.read().unwrap();
12232        for i in 0u32..100 {
12233            let key = format!("{:08}", i).into_bytes();
12234            let result = t.search(&key);
12235            assert!(
12236                result.is_some_and(|r| r.exact_parent_found),
12237                "key {:08} should be found after concurrent insert",
12238                i,
12239            );
12240        }
12241    }
12242
12243    /// Verify that 8 concurrent reader threads searching the same tree do not
12244    /// panic.  Pure read concurrency should be safe with or without the
12245    /// single-pass fix; this test acts as a regression guard.
12246    #[test]
12247    fn test_concurrent_searches_no_panic() {
12248        use std::sync::Arc;
12249        use std::thread;
12250
12251        let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));
12252        {
12253            let t = tree.write().unwrap();
12254            for i in 0u32..100 {
12255                let key = format!("{:08}", i).into_bytes();
12256                t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
12257            }
12258        }
12259
12260        let handles: Vec<_> = (0..8)
12261            .map(|_| {
12262                let tree_clone = Arc::clone(&tree);
12263                thread::spawn(move || {
12264                    for i in 0u32..100 {
12265                        let key = format!("{:08}", i).into_bytes();
12266                        let t = tree_clone.read().unwrap();
12267                        let _ = t.search(&key);
12268                    }
12269                })
12270            })
12271            .collect();
12272
12273        for h in handles {
12274            h.join().expect("thread panicked");
12275        }
12276    }
12277
12278    // ========================================================================
12279    // Tests: BIN-delta — dirty tracking, serialise, collect
12280    // ========================================================================
12281
12282    #[test]
12283    fn test_dirty_count_zero_on_fresh_bin() {
12284        let bin = make_bin_for_delta_tests(vec![
12285            (b"a".to_vec(), Lsn::new(1, 1), Some(b"v1".to_vec())),
12286            (b"b".to_vec(), Lsn::new(1, 2), Some(b"v2".to_vec())),
12287        ]);
12288        assert_eq!(bin.dirty_count(), 0);
12289    }
12290
12291    #[test]
12292    fn test_insert_marks_slot_dirty() {
12293        let lsn = Lsn::new(1, 10);
12294        let mut bin = BinStub {
12295            node_id: 1,
12296            level: BIN_LEVEL,
12297            entries: vec![],
12298            key_prefix: Vec::new(),
12299            dirty: false,
12300            is_delta: false,
12301            last_full_lsn: NULL_LSN,
12302            last_delta_lsn: NULL_LSN,
12303            generation: 0,
12304            parent: None,
12305            expiration_in_hours: true,
12306            cursor_count: 0,
12307            prohibit_next_delta: false,
12308            lsn_rep: LsnRep::Empty,
12309            keys: KeyRep::new(),
12310            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12311        };
12312        bin.insert_with_prefix(b"key".to_vec(), lsn, Some(b"val".to_vec()));
12313        assert_eq!(bin.dirty_count(), 1, "new slot should be dirty");
12314        assert!(bin.entries[0].dirty);
12315    }
12316
12317    #[test]
12318    fn test_update_marks_slot_dirty() {
12319        let _lsn = Lsn::new(1, 10);
12320        let mut bin = BinStub {
12321            node_id: 2,
12322            level: BIN_LEVEL,
12323            entries: vec![BinEntry {
12324                data: Some(b"old".to_vec()),
12325                known_deleted: false,
12326                dirty: false,
12327                expiration_time: 0,
12328            }],
12329            key_prefix: Vec::new(),
12330            dirty: false,
12331            is_delta: false,
12332            last_full_lsn: NULL_LSN,
12333            last_delta_lsn: NULL_LSN,
12334            generation: 0,
12335            parent: None,
12336            expiration_in_hours: true,
12337            cursor_count: 0,
12338            prohibit_next_delta: false,
12339            lsn_rep: LsnRep::Empty,
12340            keys: KeyRep::from_keys(vec![b"key".to_vec()]),
12341            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12342        };
12343        bin.insert_with_prefix(
12344            b"key".to_vec(),
12345            Lsn::new(1, 20),
12346            Some(b"new".to_vec()),
12347        );
12348        assert!(bin.entries[0].dirty, "updated slot should be dirty");
12349        assert_eq!(bin.dirty_count(), 1);
12350    }
12351
12352    #[test]
12353    fn test_serialize_full_roundtrip() {
12354        let mut bin = BinStub {
12355            node_id: 42,
12356            level: BIN_LEVEL,
12357            entries: vec![
12358                BinEntry {
12359                    data: Some(b"d1".to_vec()),
12360                    known_deleted: false,
12361                    dirty: true,
12362                    expiration_time: 0,
12363                },
12364                BinEntry {
12365                    data: None,
12366                    known_deleted: true,
12367                    dirty: false,
12368                    expiration_time: 0,
12369                },
12370            ],
12371            key_prefix: Vec::new(),
12372            dirty: true,
12373            is_delta: false,
12374            last_full_lsn: NULL_LSN,
12375            last_delta_lsn: NULL_LSN,
12376            generation: 0,
12377            parent: None,
12378            expiration_in_hours: true,
12379            cursor_count: 0,
12380            prohibit_next_delta: false,
12381            lsn_rep: LsnRep::Empty,
12382            keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
12383            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12384        };
12385        let bytes = bin.serialize_full();
12386        let node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
12387        let n_entries = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
12388        assert_eq!(node_id, 42);
12389        assert_eq!(n_entries, 2);
12390        bin.clear_dirty_after_full_log(Lsn::new(2, 1));
12391        assert_eq!(bin.dirty_count(), 0);
12392        assert_eq!(bin.last_full_lsn, Lsn::new(2, 1));
12393        assert!(!bin.dirty);
12394    }
12395
12396    #[test]
12397    fn test_serialize_delta_only_dirty_slots() {
12398        let mut bin = BinStub {
12399            node_id: 7,
12400            level: BIN_LEVEL,
12401            entries: vec![
12402                BinEntry {
12403                    data: Some(b"v1".to_vec()),
12404                    known_deleted: false,
12405                    dirty: false,
12406                    expiration_time: 0,
12407                },
12408                BinEntry {
12409                    data: Some(b"v2".to_vec()),
12410                    known_deleted: false,
12411                    dirty: true,
12412                    expiration_time: 0,
12413                },
12414                BinEntry {
12415                    data: Some(b"v3".to_vec()),
12416                    known_deleted: false,
12417                    dirty: false,
12418                    expiration_time: 0,
12419                },
12420            ],
12421            key_prefix: Vec::new(),
12422            dirty: true,
12423            is_delta: false,
12424            last_full_lsn: NULL_LSN,
12425            last_delta_lsn: NULL_LSN,
12426            generation: 0,
12427            parent: None,
12428            expiration_in_hours: true,
12429            cursor_count: 0,
12430            prohibit_next_delta: false,
12431            lsn_rep: LsnRep::Empty,
12432            keys: KeyRep::from_keys(vec![
12433                b"a".to_vec(),
12434                b"b".to_vec(),
12435                b"c".to_vec(),
12436            ]),
12437            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12438        };
12439        let bytes = bin.serialize_delta();
12440        let node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
12441        let n_dirty = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
12442        assert_eq!(node_id, 7);
12443        assert_eq!(n_dirty, 1);
12444        let slot_idx = u32::from_be_bytes(bytes[12..16].try_into().unwrap());
12445        assert_eq!(slot_idx, 1);
12446        bin.clear_dirty_after_delta_log();
12447        assert_eq!(bin.dirty_count(), 0);
12448        assert_eq!(
12449            bin.last_full_lsn, NULL_LSN,
12450            "last_full_lsn unchanged by delta"
12451        );
12452    }
12453
12454    #[test]
12455    fn test_collect_dirty_bins_returns_dirty_bins_only() {
12456        let tree = Tree::new(1, 256);
12457        tree.insert(b"k1".to_vec(), b"v1".to_vec(), Lsn::new(1, 1)).unwrap();
12458        tree.insert(b"k2".to_vec(), b"v2".to_vec(), Lsn::new(1, 2)).unwrap();
12459        let dirty = tree.collect_dirty_bins(1);
12460        assert!(!dirty.is_empty(), "should have dirty BINs after inserts");
12461
12462        for (_db_id, bin_arc) in &dirty {
12463            let mut g = bin_arc.write();
12464            if let TreeNode::Bottom(b) = &mut *g {
12465                b.clear_dirty_after_full_log(Lsn::new(1, 100));
12466            }
12467        }
12468        let dirty2 = tree.collect_dirty_bins(1);
12469        assert!(dirty2.is_empty(), "no dirty BINs after clearing");
12470    }
12471
12472    fn make_bin_for_delta_tests(
12473        entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
12474    ) -> BinStub {
12475        let lsns: Vec<Lsn> = entries.iter().map(|(_, l, _)| *l).collect();
12476        let keys: Vec<Vec<u8>> =
12477            entries.iter().map(|(k, _, _)| k.clone()).collect();
12478        BinStub {
12479            node_id: 1,
12480            level: BIN_LEVEL,
12481            entries: entries
12482                .into_iter()
12483                .map(|(_key, _lsn, data)| BinEntry {
12484                    data,
12485                    known_deleted: false,
12486                    dirty: false,
12487                    expiration_time: 0,
12488                })
12489                .collect(),
12490            key_prefix: Vec::new(),
12491            dirty: false,
12492            is_delta: false,
12493            last_full_lsn: NULL_LSN,
12494            last_delta_lsn: NULL_LSN,
12495            generation: 0,
12496            parent: None,
12497            expiration_in_hours: true,
12498            cursor_count: 0,
12499            prohibit_next_delta: false,
12500            lsn_rep: LsnRep::from_lsns(&lsns),
12501            keys: KeyRep::from_keys(keys),
12502            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12503        }
12504    }
12505
12506    // ========================================================================
12507    // T-17: BinStub::should_log_delta — faithful JE BIN.shouldLogDelta
12508    // (BIN.java:1892).  These pin the COUNT-based decision against the
12509    // CONFIGURABLE percent (not a dirty-fraction-vs-hardcoded-0.25 heuristic),
12510    // plus the isBINDelta fast path, the numDeltas<=0 guard, and the
12511    // isDeltaProhibited / lastFullLsn==NULL bound.
12512    // ========================================================================
12513
12514    /// Build a full (non-delta) BIN with `n` slots, the first `dirty` of them
12515    /// marked dirty, and a non-NULL last_full_lsn (so a delta is permitted).
12516    fn bin_with_dirty(n: usize, dirty: usize) -> BinStub {
12517        let mut bin = make_bin_for_delta_tests(
12518            (0..n)
12519                .map(|i| {
12520                    (
12521                        format!("{:04}", i).into_bytes(),
12522                        Lsn::new(1, i as u32 + 1),
12523                        Some(vec![i as u8]),
12524                    )
12525                })
12526                .collect(),
12527        );
12528        bin.last_full_lsn = Lsn::new(1, 1); // a prior full exists
12529        for e in bin.entries.iter_mut().take(dirty) {
12530            e.dirty = true;
12531        }
12532        bin
12533    }
12534
12535    /// COUNT-based + CONFIGURABLE percent: with percent=10 and 100 slots, the
12536    /// delta limit is 100*10/100 = 10.  10 dirty slots → delta; 11 dirty → full.
12537    ///
12538    /// This is the core T-17 reproduction: the OLD checkpointer decision used
12539    /// `dirty/total <= 0.25` (hardcoded), so 11/100 = 11% ≤ 25% → it would have
12540    /// (wrongly) logged a DELTA.  The faithful count-based decision against the
12541    /// configurable percent=10 logs a FULL BIN.
12542    #[test]
12543    fn should_log_delta_is_count_based_and_configurable() {
12544        // Exactly at the limit → delta.
12545        assert!(
12546            bin_with_dirty(100, 10).should_log_delta(10),
12547            "numDeltas(10) <= limit(100*10/100=10) must be a delta"
12548        );
12549        // One over the limit → full BIN (FAILS on main: 11/100=11% <= 25%).
12550        assert!(
12551            !bin_with_dirty(100, 11).should_log_delta(10),
12552            "numDeltas(11) > limit(10) must be a FULL BIN under percent=10"
12553        );
12554        // The SAME BIN under the default percent=25 (limit 25) is a delta:
12555        // proves the percent is honoured, not hardcoded.
12556        assert!(
12557            bin_with_dirty(100, 11).should_log_delta(25),
12558            "numDeltas(11) <= limit(25) must be a delta under percent=25"
12559        );
12560        // Integer (truncating) math, exactly as JE: 7 slots, percent=25 →
12561        // limit = 7*25/100 = 1.  1 dirty → delta, 2 dirty → full.
12562        assert!(bin_with_dirty(7, 1).should_log_delta(25));
12563        assert!(!bin_with_dirty(7, 2).should_log_delta(25));
12564    }
12565
12566    /// isBINDelta fast path: a BIN already in delta form always re-logs as a
12567    /// delta (JE: `if (isBINDelta()) return true;`).
12568    #[test]
12569    fn should_log_delta_bin_delta_fast_path() {
12570        let mut bin = bin_with_dirty(100, 90); // 90% dirty: way over any limit
12571        bin.is_delta = true;
12572        // Even with a tiny percent that the dirty count blows past, an
12573        // already-delta BIN re-logs as a delta.
12574        assert!(
12575            bin.should_log_delta(1),
12576            "isBINDelta() must short-circuit to true regardless of percent"
12577        );
12578    }
12579
12580    /// numDeltas <= 0 guard: a BIN with no dirty slots logs a full BIN (an
12581    /// empty delta is invalid).
12582    #[test]
12583    fn should_log_delta_zero_dirty_is_full() {
12584        assert!(!bin_with_dirty(100, 0).should_log_delta(25));
12585    }
12586
12587    /// isDeltaProhibited bound: lastFullLsn == NULL (never logged full) and
12588    /// prohibit_next_delta both force a full BIN.
12589    #[test]
12590    fn should_log_delta_prohibited_forces_full() {
12591        // No prior full BIN.
12592        let mut bin = bin_with_dirty(100, 5); // would be a delta otherwise
12593        bin.last_full_lsn = NULL_LSN;
12594        assert!(
12595            !bin.should_log_delta(25),
12596            "lastFullLsn==NULL must force a full BIN"
12597        );
12598
12599        // prohibit_next_delta set (e.g. a dirty slot was removed by compress).
12600        let mut bin = bin_with_dirty(100, 5);
12601        bin.prohibit_next_delta = true;
12602        assert!(
12603            !bin.should_log_delta(25),
12604            "prohibit_next_delta must force a full BIN"
12605        );
12606    }
12607
12608    /// The prohibit flag is cleared after a full BIN is logged
12609    /// (JE IN.afterLog: setProhibitNextDelta(false)), so the NEXT log may once
12610    /// again be a delta — this is the periodic-full chain bound.
12611    #[test]
12612    fn full_log_clears_prohibit_next_delta() {
12613        let mut bin = bin_with_dirty(100, 5);
12614        bin.prohibit_next_delta = true;
12615        assert!(!bin.should_log_delta(25), "prohibited → full");
12616        bin.clear_dirty_after_full_log(Lsn::new(2, 5));
12617        assert!(
12618            !bin.prohibit_next_delta,
12619            "full log must clear prohibit_next_delta"
12620        );
12621        // Re-dirty a few slots; now a delta is allowed again.
12622        for e in bin.entries.iter_mut().take(5) {
12623            e.dirty = true;
12624        }
12625        assert!(
12626            bin.should_log_delta(25),
12627            "after a full log, a small delta is allowed again"
12628        );
12629    }
12630
12631    // ========================================================================
12632    // Tests: Task #82 — 8 new Tree methods
12633    // ========================================================================
12634
12635    // --- is_root_resident ---
12636
12637    #[test]
12638    fn test_is_root_resident_empty_tree() {
12639        let tree = Tree::new(1, 128);
12640        assert!(!tree.is_root_resident(), "empty tree has no resident root");
12641    }
12642
12643    #[test]
12644    fn test_is_root_resident_after_insert() {
12645        let tree = Tree::new(1, 128);
12646        tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12647        assert!(tree.is_root_resident(), "root must be resident after insert");
12648    }
12649
12650    // --- get_resident_root_in ---
12651
12652    #[test]
12653    fn test_get_resident_root_in_empty() {
12654        let tree = Tree::new(1, 128);
12655        assert!(tree.get_resident_root_in().is_none());
12656    }
12657
12658    #[test]
12659    fn test_get_resident_root_in_single_entry() {
12660        let tree = Tree::new(1, 128);
12661        tree.insert(b"hello".to_vec(), b"world".to_vec(), Lsn::new(1, 1))
12662            .unwrap();
12663        let root = tree.get_resident_root_in();
12664        assert!(root.is_some(), "root must be Some after insert");
12665        let root_arc = tree.get_root().unwrap();
12666        assert!(
12667            Arc::ptr_eq(&root_arc, &root.unwrap()),
12668            "get_resident_root_in must return the same Arc as get_root"
12669        );
12670    }
12671
12672    #[test]
12673    fn test_get_resident_root_in_multi_entry() {
12674        let tree = Tree::new(1, 4);
12675        for i in 0u32..20 {
12676            let k = format!("rr{:04}", i).into_bytes();
12677            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12678        }
12679        assert!(tree.get_resident_root_in().is_some());
12680    }
12681
12682    // --- get_parent_bin_for_child_ln ---
12683
12684    #[test]
12685    fn test_get_parent_bin_for_child_ln_empty_tree() {
12686        let tree = Tree::new(1, 128);
12687        assert!(tree.get_parent_bin_for_child_ln(b"key").is_none());
12688    }
12689
12690    #[test]
12691    fn test_get_parent_bin_for_child_ln_single_entry() {
12692        let tree = Tree::new(1, 128);
12693        tree.insert(b"alpha".to_vec(), b"val".to_vec(), Lsn::new(1, 1))
12694            .unwrap();
12695        let bin = tree.get_parent_bin_for_child_ln(b"alpha");
12696        assert!(bin.is_some(), "must return Some for a present key");
12697        assert!(bin.unwrap().read().is_bin(), "returned node must be a BIN");
12698    }
12699
12700    #[test]
12701    fn test_get_parent_bin_for_child_ln_multi_key() {
12702        let tree = Tree::new(1, 8);
12703        let keys: &[&[u8]] = &[b"aa", b"bb", b"cc", b"dd", b"ee"];
12704        for &k in keys {
12705            tree.insert(k.to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12706        }
12707        for &k in keys {
12708            let bin = tree.get_parent_bin_for_child_ln(k);
12709            assert!(bin.is_some(), "must return Some for {:?}", k);
12710            assert!(bin.unwrap().read().is_bin());
12711        }
12712    }
12713
12714    // --- find_bin_for_insert ---
12715
12716    #[test]
12717    fn test_find_bin_for_insert_empty_tree() {
12718        let tree = Tree::new(1, 128);
12719        assert!(tree.find_bin_for_insert(b"newkey").is_none());
12720    }
12721
12722    #[test]
12723    fn test_find_bin_for_insert_returns_bin() {
12724        let tree = Tree::new(1, 128);
12725        tree.insert(b"existing".to_vec(), b"data".to_vec(), Lsn::new(1, 1))
12726            .unwrap();
12727        let bin = tree.find_bin_for_insert(b"newkey");
12728        assert!(bin.is_some());
12729        assert!(bin.unwrap().read().is_bin());
12730    }
12731
12732    #[test]
12733    fn test_find_bin_for_insert_same_as_parent_bin() {
12734        let tree = Tree::new(1, 128);
12735        tree.insert(b"foo".to_vec(), b"bar".to_vec(), Lsn::new(1, 1)).unwrap();
12736        let a = tree.get_parent_bin_for_child_ln(b"foo").unwrap();
12737        let b_arc = tree.find_bin_for_insert(b"foo").unwrap();
12738        assert!(
12739            Arc::ptr_eq(&a, &b_arc),
12740            "find_bin_for_insert must return the same BIN as get_parent_bin_for_child_ln"
12741        );
12742    }
12743
12744    // --- search_splits_allowed ---
12745
12746    #[test]
12747    fn test_search_splits_allowed_empty_tree() {
12748        let tree = Tree::new(1, 128);
12749        assert!(tree.search_splits_allowed(b"k").is_none());
12750    }
12751
12752    #[test]
12753    fn test_search_splits_allowed_finds_existing_key() {
12754        let tree = Tree::new(1, 8);
12755        for i in 0u32..10 {
12756            let k = format!("sa{:04}", i).into_bytes();
12757            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12758        }
12759        for i in 0u32..10 {
12760            let k = format!("sa{:04}", i).into_bytes();
12761            let sr = tree.search_splits_allowed(&k);
12762            assert!(
12763                sr.is_some() && sr.unwrap().exact_parent_found,
12764                "search_splits_allowed must find sa{:04}",
12765                i
12766            );
12767        }
12768    }
12769
12770    #[test]
12771    fn test_search_splits_allowed_missing_key() {
12772        let tree = Tree::new(1, 8);
12773        tree.insert(b"present".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
12774            .unwrap();
12775        let sr = tree.search_splits_allowed(b"absent");
12776        assert!(
12777            sr.is_none_or(|r| !r.exact_parent_found),
12778            "search_splits_allowed must not find absent key"
12779        );
12780    }
12781
12782    // --- rebuild_in_list ---
12783
12784    #[test]
12785    fn test_rebuild_in_list_empty_tree() {
12786        let tree = Tree::new(1, 128);
12787        assert!(tree.rebuild_in_list().is_empty());
12788    }
12789
12790    #[test]
12791    fn test_rebuild_in_list_single_entry() {
12792        let tree = Tree::new(1, 128);
12793        tree.insert(b"one".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12794        let list = tree.rebuild_in_list();
12795        // Expect root IN + BIN = 2 nodes.
12796        assert_eq!(
12797            list.len(),
12798            2,
12799            "single-entry tree must have exactly 2 nodes"
12800        );
12801        let has_bin = list.iter().any(|a| a.read().is_bin());
12802        let has_in = list.iter().any(|a| !a.read().is_bin());
12803        assert!(has_bin, "list must contain at least one BIN");
12804        assert!(has_in, "list must contain at least one upper IN");
12805    }
12806
12807    #[test]
12808    fn test_rebuild_in_list_multi_entry() {
12809        let tree = Tree::new(1, 4);
12810        for i in 0u32..20 {
12811            let k = format!("ri{:04}", i).into_bytes();
12812            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12813        }
12814        let list = tree.rebuild_in_list();
12815        let stats = tree.collect_stats();
12816        let expected_nodes = (stats.n_ins + stats.n_bins) as usize;
12817        assert_eq!(
12818            list.len(),
12819            expected_nodes,
12820            "rebuild_in_list must return all {} nodes",
12821            expected_nodes
12822        );
12823    }
12824
12825    // --- validate_in_list ---
12826
12827    #[test]
12828    fn test_validate_in_list_empty_tree() {
12829        let tree = Tree::new(1, 128);
12830        assert!(tree.validate_in_list(), "empty tree must be valid");
12831    }
12832
12833    #[test]
12834    fn test_validate_in_list_single_entry() {
12835        let tree = Tree::new(1, 128);
12836        tree.insert(b"v".to_vec(), b"data".to_vec(), Lsn::new(1, 1)).unwrap();
12837        assert!(tree.validate_in_list(), "single-entry tree must be valid");
12838    }
12839
12840    #[test]
12841    fn test_validate_in_list_multi_entry() {
12842        let tree = Tree::new(1, 4);
12843        for i in 0u32..20 {
12844            let k = format!("vl{:04}", i).into_bytes();
12845            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12846        }
12847        assert!(tree.validate_in_list(), "multi-entry tree must be valid");
12848    }
12849
12850    #[test]
12851    fn test_validate_in_list_empty_in_fails() {
12852        // Manually build a tree where the root IN has no entries — invalid.
12853        let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
12854            node_id: generate_node_id(),
12855            level: MAIN_LEVEL | 2,
12856            entries: vec![], // empty — structurally invalid
12857            targets: TargetRep::None,
12858            dirty: false,
12859            generation: 0,
12860            parent: None,
12861            lsn_rep: LsnRep::Empty,
12862        })));
12863        let tree = Tree::new(1, 128);
12864        *tree.root.write() = Some(root_arc);
12865        assert!(
12866            !tree.validate_in_list(),
12867            "a tree with an empty Internal node must fail validation"
12868        );
12869    }
12870
12871    // --- get_parent_in_for_child_in ---
12872
12873    #[test]
12874    fn test_get_parent_in_for_child_in_empty_tree() {
12875        let tree = Tree::new(1, 128);
12876        assert!(tree.get_parent_in_for_child_in(999).is_none());
12877    }
12878
12879    #[test]
12880    fn test_get_parent_in_for_child_in_single_entry() {
12881        // A single-insert tree has: root IN → BIN.
12882        // The root IN is the parent of the BIN.
12883        let tree = Tree::new(1, 128);
12884        tree.insert(b"p".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
12885
12886        let root_arc = tree.get_root().as_ref().unwrap().clone();
12887        let bin_node_id = {
12888            let g = root_arc.read();
12889            match &*g {
12890                TreeNode::Internal(n) => {
12891                    let child = n.child_ref(0).unwrap();
12892                    let cg = child.read();
12893                    match &*cg {
12894                        TreeNode::Bottom(b) => b.node_id,
12895                        _ => panic!("expected BIN"),
12896                    }
12897                }
12898                _ => panic!("expected Internal root"),
12899            }
12900        };
12901
12902        let result = tree.get_parent_in_for_child_in(bin_node_id);
12903        assert!(result.is_some(), "must find parent of BIN");
12904        let (parent_arc, slot) = result.unwrap();
12905        assert!(Arc::ptr_eq(&parent_arc, &root_arc));
12906        assert_eq!(slot, 0);
12907    }
12908
12909    #[test]
12910    fn test_get_parent_in_for_child_in_not_found() {
12911        let tree = Tree::new(1, 128);
12912        tree.insert(b"x".to_vec(), b"y".to_vec(), Lsn::new(1, 1)).unwrap();
12913        assert!(tree.get_parent_in_for_child_in(u64::MAX).is_none());
12914    }
12915
12916    #[test]
12917    fn test_get_parent_in_for_child_in_multi_level() {
12918        // Build a tree with at least 3 levels so we test the recursive descent.
12919        let tree = Tree::new(1, 4);
12920        for i in 0u32..20 {
12921            let k = format!("ml{:04}", i).into_bytes();
12922            tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
12923        }
12924
12925        // Collect all BIN node_ids via rebuild_in_list.
12926        let nodes = tree.rebuild_in_list();
12927        let bin_ids: Vec<u64> = nodes
12928            .iter()
12929            .filter_map(|a| {
12930                let g = a.read();
12931                if g.is_bin()
12932                    && let TreeNode::Bottom(b) = &*g
12933                {
12934                    return Some(b.node_id);
12935                }
12936                None
12937            })
12938            .collect();
12939
12940        for bin_id in bin_ids {
12941            let result = tree.get_parent_in_for_child_in(bin_id);
12942            assert!(
12943                result.is_some(),
12944                "every BIN (id={}) must have a parent IN",
12945                bin_id
12946            );
12947            let (parent_arc, _slot) = result.unwrap();
12948            assert!(
12949                !parent_arc.read().is_bin(),
12950                "parent of a BIN must be an Internal node"
12951            );
12952        }
12953    }
12954
12955    /// H-9 regression: BinStub::strip_lns actually drops the slot data
12956    /// (not just stats accounting).
12957    #[test]
12958    fn test_h9_strip_lns_actually_frees_data() {
12959        use crate::tree::{BinEntry, BinStub};
12960        use noxu_util::lsn::Lsn;
12961        let mut bin = BinStub {
12962            node_id: 1,
12963            level: 1,
12964            entries: Vec::new(),
12965            key_prefix: Vec::new(),
12966            dirty: false,
12967            is_delta: false,
12968            last_full_lsn: Lsn::from_u64(0),
12969            last_delta_lsn: Lsn::from_u64(0),
12970            generation: 0,
12971            parent: None,
12972            expiration_in_hours: true,
12973            cursor_count: 0,
12974            prohibit_next_delta: false,
12975            lsn_rep: LsnRep::Empty,
12976            keys: KeyRep::new(),
12977            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12978        };
12979        // Three slots with embedded data + VALID logged LSNs (one dirty).
12980        // JE-faithful: a slot with a valid LSN is strippable regardless of the
12981        // dirty bit (its value is recoverable from the log); only a NULL-LSN
12982        // (never-logged / deferred-write) slot is preserved.
12983        bin.entries.push(BinEntry {
12984            data: Some(vec![0u8; 64]),
12985            known_deleted: false,
12986            dirty: false,
12987            expiration_time: 0,
12988        });
12989        bin.entries.push(BinEntry {
12990            data: Some(vec![0u8; 32]),
12991            known_deleted: false,
12992            dirty: false,
12993            expiration_time: 0,
12994        });
12995        bin.entries.push(BinEntry {
12996            data: Some(vec![0u8; 16]),
12997            known_deleted: false,
12998            dirty: true, // dirty BUT logged -> still strippable (EVICTOR-RECLAIM-1)
12999            expiration_time: 0,
13000        });
13001        // T-2: keep the key rep aligned with the pushed slots.
13002        bin.keys = KeyRep::from_keys(vec![
13003            b"a".to_vec(),
13004            b"b".to_vec(),
13005            b"c".to_vec(),
13006        ]);
13007        // Give all three slots VALID (non-NULL) LSNs so they are recoverable
13008        // from the log and therefore strippable.
13009        bin.set_lsn(0, Lsn::new(1, 100));
13010        bin.set_lsn(1, Lsn::new(1, 200));
13011        bin.set_lsn(2, Lsn::new(1, 300));
13012
13013        let freed = bin.strip_lns();
13014        assert_eq!(
13015            freed,
13016            64 + 32 + 16,
13017            "all logged slots stripped regardless of dirty (JE evictLNs)"
13018        );
13019        assert!(bin.entries[0].data.is_none(), "logged slot data dropped");
13020        assert!(bin.entries[1].data.is_none(), "logged slot data dropped");
13021        assert!(
13022            bin.entries[2].data.is_none(),
13023            "dirty-but-logged slot data dropped (recoverable from log)"
13024        );
13025
13026        // A NULL-LSN slot (never logged) must be preserved — its only copy is
13027        // the in-memory value.
13028        bin.entries[0].data = Some(vec![0u8; 64]);
13029        bin.set_lsn(0, noxu_util::NULL_LSN);
13030        let freed_null = bin.strip_lns();
13031        assert_eq!(
13032            freed_null, 0,
13033            "NULL-LSN (unlogged) slot must NOT be stripped"
13034        );
13035        assert!(bin.entries[0].data.is_some(), "unlogged slot data preserved");
13036
13037        // Cursor pin prevents stripping.
13038        bin.set_lsn(0, Lsn::new(1, 100));
13039        bin.cursor_count = 1;
13040        let freed_with_cursor = bin.strip_lns();
13041        assert_eq!(
13042            freed_with_cursor, 0,
13043            "strip_lns must skip when cursor pinned"
13044        );
13045        assert!(
13046            bin.entries[0].data.is_some(),
13047            "data preserved while cursor pinned"
13048        );
13049    }
13050
13051    // St-H4: the binary upper_in_floor_index must return the same slot as a
13052    // reference linear floor scan for all probe keys (incl. before-all,
13053    // after-all, between, and exact matches).
13054    #[test]
13055    fn test_upper_in_floor_index_matches_linear_scan() {
13056        // Reference linear floor scan (the pre-St-H4 algorithm): slot 0 is the
13057        // virtual −∞ key; walk forward while entry.key ≤ key.
13058        fn linear_floor(entries: &[InEntry], key: &[u8]) -> usize {
13059            let mut idx = 0usize;
13060            for (i, entry) in entries.iter().enumerate() {
13061                if i == 0 {
13062                    idx = 0;
13063                } else if entry.key.as_slice() <= key {
13064                    idx = i;
13065                } else {
13066                    break;
13067                }
13068            }
13069            idx
13070        }
13071
13072        let tree = Tree::new(1, 256);
13073        // Build sorted IN slot key sets of varying size; slot 0 = virtual −∞
13074        // (empty key sorts first), the rest strictly ascending.
13075        for n_slots in 1usize..40 {
13076            let mut entries: Vec<InEntry> = Vec::with_capacity(n_slots);
13077            entries.push(InEntry { key: vec![] });
13078            for i in 1..n_slots {
13079                // Strictly-ascending two-byte keys with gaps so probes can
13080                // fall between, on, before, and after them.
13081                let v = (i as u16) * 4;
13082                entries.push(InEntry {
13083                    key: vec![(v >> 8) as u8, (v & 0xFF) as u8],
13084                });
13085            }
13086            for probe in 0u16..=(n_slots as u16 * 4 + 4) {
13087                let key = vec![(probe >> 8) as u8, (probe & 0xFF) as u8];
13088                assert_eq!(
13089                    tree.upper_in_floor_index(&entries, &key),
13090                    linear_floor(&entries, &key),
13091                    "floor mismatch: n_slots={n_slots}, key={key:?}"
13092                );
13093            }
13094        }
13095    }
13096}
13097
13098// ─────────────────────────────────────────────────────────────────────────
13099// St-H6: BIN split inherits expiration_in_hours from the splitting BIN.
13100// ─────────────────────────────────────────────────────────────────────────
13101
/// Unit test for the St-H6 fix: the right-half sibling created by
/// `split_child` inherits `expiration_in_hours` from the splitting BIN.
///
/// Before the fix, the sibling was always created with
/// `expiration_in_hours = false`, causing hours-granularity TTL entries
/// (hours since the epoch, a value around 495k in 2026) to be compared
/// against `current_time_secs()` (~1.78B) and treated as expired.
///
/// This test:
///   1. Creates a tree with max_entries = 4 and inserts 4 entries directly
///      (bypassing `update_key_expiration`) with non-zero `expiration_time`
///      and `expiration_in_hours = true` on the BIN.
///   2. Triggers a split.
///   3. Asserts that the right-half sibling has `expiration_in_hours = true`
///      (inherited, not hardcoded false).
///
/// The expiration value is derived from the wall clock (now + 1 000 h)
/// instead of being a fixed constant: a hard-coded hours value is only "in
/// the future" until real time catches up with it, after which the
/// "not expired" assertion would start failing for reasons unrelated to the
/// code under test.
#[test]
fn test_split_child_sibling_inherits_expiration_in_hours() {
    use crate::tree::{BIN_LEVEL, BinEntry, BinStub, MAIN_LEVEL, TreeNode};
    use noxu_util::{Lsn, NULL_LSN};
    use parking_lot::RwLock;
    use std::sync::Arc;

    // Hours-since-epoch expiration that is always 1 000 h in the future.
    let now_secs = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("system clock must not be before the Unix epoch")
        .as_secs();
    let exp_hours = u32::try_from(now_secs / 3_600 + 1_000)
        .expect("hours since the Unix epoch must fit in u32");

    // Manually build a tree with one BIN (4 entries, expiration_in_hours=true).
    let tree = Tree::new(99, 4);

    // Pre-populate the tree root for the test.
    let entries: Vec<BinEntry> = (0u8..4u8)
        .map(|k| BinEntry {
            data: Some(vec![k, k]),
            known_deleted: false,
            dirty: true,
            expiration_time: exp_hours, // hours-since-epoch value
        })
        .collect();
    let bin_keys: Vec<Vec<u8>> = (0u8..4u8).map(|k| vec![k]).collect();
    let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true, // hours-granularity entries
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(bin_keys),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let root = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![], // virtual key for slot 0 (-infinity)
        }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&bin))]),
        dirty: true,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    {
        // Wire the BIN back to its parent before installing the root.
        let mut b = bin.write();
        b.set_parent(Some(Arc::downgrade(&root)));
    }
    *tree.root.write() = Some(Arc::clone(&root));

    // Trigger split_child on the root.
    Tree::split_child(
        &root,
        0,
        4,
        Lsn::new(1, 500),
        SplitHint::Normal,
        &[],
        None,
        false,
        None,
    )
    .expect("split_child should succeed");

    // After the split: root has two children — left BIN and right sibling.
    let root_guard = root.read();
    let TreeNode::Internal(ref in_node) = *root_guard else {
        panic!("root should be Internal after split");
    };
    assert_eq!(
        in_node.entries.len(),
        2,
        "root should have 2 entries (children) after split"
    );

    // Right-half sibling is at slot 1.
    let sibling_arc = in_node
        .get_child(1)
        .expect("right-half sibling should exist at slot 1");
    let sibling_guard = sibling_arc.read();
    let TreeNode::Bottom(ref sibling) = *sibling_guard else {
        panic!("right sibling should be a BIN");
    };

    assert!(
        sibling.expiration_in_hours,
        "St-H6: right-half sibling expiration_in_hours must be true \
             (inherited from splitting BIN); got false"
    );

    // Guard against a vacuous pass: the per-entry checks below prove nothing
    // if the split moved no entries into the sibling.
    assert!(
        !sibling.entries.is_empty(),
        "right-half sibling should hold entries after the split"
    );

    // Verify the sibling's entries have the expected expiration_time.
    for e in &sibling.entries {
        assert_eq!(
            e.expiration_time, exp_hours,
            "sibling entry expiration_time should be preserved: got {}",
            e.expiration_time
        );
        // With in_hours=true, is_expired should return false (future).
        assert!(
            !noxu_util::ttl::is_expired(
                e.expiration_time,
                sibling.expiration_in_hours
            ),
            "St-H6: sibling TTL entry ({}) should NOT appear expired \
                 with expiration_in_hours={}",
            e.expiration_time,
            sibling.expiration_in_hours
        );
    }
}
13234
/// Regression confirmation: `is_expired` with wrong `in_hours = false`
/// would falsely expire hours-granularity values (hours since the epoch,
/// roughly 495k in 2026).
///
/// The expiration is computed from the wall clock (now + 1 000 h) rather
/// than hard-coded. A fixed hours constant stops being "in the future" once
/// real time passes it, at which point the `in_hours = true` assertion would
/// fail even though `is_expired` is behaving correctly.
#[test]
fn test_hours_value_is_expired_only_with_false_flag() {
    // Hours-since-epoch value for "now" + 1 000 h TTL.
    let now_secs = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("system clock must not be before the Unix epoch")
        .as_secs();
    let exp_hours: u32 = u32::try_from(now_secs / 3_600 + 1_000)
        .expect("hours since the Unix epoch must fit in u32");
    // Correctly treated as hours: not expired.
    assert!(
        !noxu_util::ttl::is_expired(exp_hours, true),
        "exp_hours={exp_hours} should NOT be expired when in_hours=true"
    );
    // Incorrectly treated as seconds (pre-fix right sibling): a value of a
    // few hundred thousand seconds lies in January 1970, so it is expired.
    assert!(
        noxu_util::ttl::is_expired(exp_hours, false),
        "exp_hours={exp_hours} should be expired when in_hours=false \
             (St-H6 demonstrates the wrong-flag scenario)"
    );
}
13253
13254// =============================================================================
13255// IN-redo unit tests (DRIFT-1 / Stage 1)
13256// =============================================================================
13257
13258#[cfg(test)]
13259mod in_redo_tests {
13260    use super::*;
13261
13262    /// Build a BinStub with `n` entries (key = [i as u8], lsn = lsn(1, i))
13263    /// and serialise it.  Returns (node_id, node_data_bytes).
13264    fn make_bin_bytes(node_id: u64, n: usize) -> Vec<u8> {
13265        let mut bin = BinStub {
13266            node_id,
13267            level: BIN_LEVEL,
13268            entries: Vec::new(),
13269            key_prefix: Vec::new(),
13270            dirty: false,
13271            is_delta: false,
13272            last_full_lsn: noxu_util::NULL_LSN,
13273            last_delta_lsn: noxu_util::NULL_LSN,
13274            generation: 0,
13275            parent: None,
13276            expiration_in_hours: true,
13277            cursor_count: 0,
13278            prohibit_next_delta: false,
13279            lsn_rep: LsnRep::Empty,
13280            keys: KeyRep::new(),
13281            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
13282        };
13283        for i in 0..n {
13284            // T-2/T-3: route through insert so entries/keys/lsn_rep stay
13285            // aligned; the serialized bytes are identical.
13286            bin.insert_with_prefix(
13287                vec![i as u8],
13288                Lsn::new(1, (i + 1) as u32),
13289                Some(vec![i as u8]),
13290            );
13291        }
13292        bin.serialize_full()
13293    }
13294
13295    /// Verify that recover_in_redo inserts a BIN as root when the tree is empty.
13296    ///
13297    /// JE RecoveryManager.recoverRootIN: `root == null` path.
13298    #[test]
13299    fn test_recover_in_redo_root_bin_inserted_into_empty_tree() {
13300        let tree = Tree::new(42, 128);
13301        assert!(tree.is_empty());
13302        let bytes = make_bin_bytes(1, 3);
13303        let log_lsn = Lsn::new(1, 100);
13304        let result = tree.recover_in_redo(
13305            log_lsn, /*is_root=*/ true, /*is_bin=*/ true, &bytes,
13306        );
13307        assert_eq!(result, InRedoResult::Inserted, "expected Inserted");
13308        // Tree should now have 3 entries.
13309        assert_eq!(tree.count_entries(), 3);
13310    }
13311
13312    /// Verify that recover_in_redo replaces a root BIN when the logged version is newer.
13313    ///
13314    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) < 0` path.
13315    #[test]
13316    fn test_recover_in_redo_root_bin_replaced_when_log_newer() {
13317        let tree = Tree::new(42, 128);
13318        // Install an old root (2 entries, older LSN).
13319        let old_bytes = make_bin_bytes(1, 2);
13320        let old_lsn = Lsn::new(1, 50);
13321        tree.recover_in_redo(old_lsn, true, true, &old_bytes);
13322        assert_eq!(tree.count_entries(), 2);
13323        // Replay with newer LSN and 4 entries.
13324        let new_bytes = make_bin_bytes(1, 4);
13325        let new_lsn = Lsn::new(1, 100);
13326        let result = tree.recover_in_redo(new_lsn, true, true, &new_bytes);
13327        assert_eq!(result, InRedoResult::Replaced);
13328        assert_eq!(tree.count_entries(), 4);
13329    }
13330
13331    /// Verify that an older logged BIN does NOT replace a newer in-memory root.
13332    ///
13333    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) >= 0` skip path.
13334    #[test]
13335    fn test_recover_in_redo_root_bin_skipped_when_tree_newer() {
13336        let tree = Tree::new(42, 128);
13337        // Install a newer root.
13338        let new_bytes = make_bin_bytes(1, 4);
13339        let new_lsn = Lsn::new(1, 200);
13340        tree.recover_in_redo(new_lsn, true, true, &new_bytes);
13341        // Attempt to replay an older version.
13342        let old_bytes = make_bin_bytes(1, 2);
13343        let old_lsn = Lsn::new(1, 100);
13344        let result = tree.recover_in_redo(old_lsn, true, true, &old_bytes);
13345        assert_eq!(result, InRedoResult::Skipped);
13346        // Tree still holds the newer 4-entry version.
13347        assert_eq!(tree.count_entries(), 4);
13348    }
13349
13350    /// deserialize_bin round-trips through serialize_full.
13351    #[test]
13352    fn test_deserialize_bin_round_trip() {
13353        let bytes = make_bin_bytes(99, 5);
13354        let bin = Tree::deserialize_bin(&bytes).expect("must deserialize");
13355        assert_eq!(bin.node_id, 99);
13356        assert_eq!(bin.entries.len(), 5);
13357        for i in 0..bin.entries.len() {
13358            assert_eq!(bin.get_full_key(i).unwrap(), vec![i as u8]);
13359        }
13360    }
13361
13362    /// deserialize_upper_in round-trips through write_to_bytes (Internal).
13363    #[test]
13364    fn test_deserialize_upper_in_round_trip() {
13365        // Build an InNodeStub and serialize via write_to_bytes.
13366        let node = TreeNode::Internal(InNodeStub {
13367            node_id: 77,
13368            level: 0x10002,
13369            entries: vec![
13370                InEntry { key: vec![1, 2, 3] },
13371                InEntry { key: vec![4, 5, 6] },
13372            ],
13373            targets: TargetRep::None,
13374            dirty: false,
13375            generation: 0,
13376            parent: None,
13377            lsn_rep: LsnRep::Empty,
13378        });
13379        let bytes = node.write_to_bytes();
13380        let restored =
13381            Tree::deserialize_upper_in(&bytes).expect("must deserialize");
13382        assert_eq!(restored.node_id, 77);
13383        assert_eq!(restored.level, 0x10002);
13384        assert_eq!(restored.entries.len(), 2);
13385        assert_eq!(restored.entries[0].key, vec![1, 2, 3]);
13386        assert_eq!(restored.entries[1].key, vec![4, 5, 6]);
13387    }
13388}
13389
13390// --- Part 2 acceptance tests: key_prefixing flag (DRIFT-3) ---
13391//
13392// JE `IN.computeKeyPrefix` returns null when `databaseImpl.getKeyPrefixing()`
13393// is false, so no prefix compression is ever applied to those BINs. Noxu was
13394// always applying prefix compression. This checks that the flag is honoured.
13395//
13396// Ref: `IN.java computeKeyPrefix` ~line 2456,
13397//      `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
13398#[cfg(test)]
13399mod key_prefixing_tests {
13400    use super::*;
13401
13402    /// Helper: find the first (leftmost) BIN in the tree.
13403    fn find_first_bin(node: &Arc<RwLock<TreeNode>>) -> Arc<RwLock<TreeNode>> {
13404        let child_opt = {
13405            let g = node.read();
13406            match &*g {
13407                TreeNode::Bottom(_) => None,
13408                TreeNode::Internal(n) => {
13409                    Some(Arc::clone(n.child_ref(0).expect("child")))
13410                }
13411            }
13412        };
13413        match child_opt {
13414            None => Arc::clone(node),
13415            Some(child) => find_first_bin(&child),
13416        }
13417    }
13418
13419    /// With `key_prefixing = false` (the default), keys must be stored without
13420    /// any prefix: the BIN's `key_prefix` must remain empty after inserts.
13421    #[test]
13422    fn test_key_prefixing_false_stores_full_keys() {
13423        // Default is key_prefixing = false.
13424        let tree = Tree::new(1, 16);
13425        assert!(!tree.key_prefixing, "default must be false");
13426
13427        let lsn = noxu_util::Lsn::new(1, 10);
13428        // Insert keys with a long common prefix.
13429        for i in 0u8..8 {
13430            let key = vec![b'r', b'e', b'c', b'o', b'r', b'd', b':', i];
13431            tree.insert(key, vec![i], lsn).expect("insert");
13432        }
13433
13434        let root = tree.get_root().expect("root");
13435        let bin_arc = find_first_bin(&root);
13436        let guard = bin_arc.read();
13437        let TreeNode::Bottom(ref bin) = *guard else {
13438            panic!("must be a BIN");
13439        };
13440        assert!(
13441            bin.key_prefix.is_empty(),
13442            "key_prefix must be empty when key_prefixing=false, got {:?}",
13443            bin.key_prefix
13444        );
13445        assert_eq!(bin.entries.len(), 8);
13446        // Keys must be stored as full keys.
13447        assert_eq!(
13448            bin.get_full_key(0).unwrap(),
13449            vec![b'r', b'e', b'c', b'o', b'r', b'd', b':', 0]
13450        );
13451    }
13452
13453    /// With `key_prefixing = true`, keys with a common prefix are compressed:
13454    /// the BIN's `key_prefix` must be non-empty.
13455    #[test]
13456    fn test_key_prefixing_true_compresses_keys() {
13457        let mut tree = Tree::new(1, 16);
13458        tree.set_key_prefixing(true);
13459
13460        let lsn = noxu_util::Lsn::new(1, 10);
13461        for i in 0u8..8 {
13462            let key = vec![b'r', b'e', b'c', b'o', b'r', b'd', b':', i];
13463            tree.insert(key, vec![i], lsn).expect("insert");
13464        }
13465
13466        let root = tree.get_root().expect("root");
13467        let bin_arc = find_first_bin(&root);
13468        let guard = bin_arc.read();
13469        let TreeNode::Bottom(ref bin) = *guard else {
13470            panic!("must be a BIN");
13471        };
13472        // Prefix compression must kick in: all keys share "record:".
13473        assert!(
13474            !bin.key_prefix.is_empty(),
13475            "key_prefix must be non-empty when key_prefixing=true"
13476        );
13477        assert_eq!(
13478            bin.key_prefix,
13479            b"record:".to_vec(),
13480            "prefix must be the common prefix of all inserted keys"
13481        );
13482    }
13483
13484    /// Custom-comparator databases (sorted-dup) always bypass prefix
13485    /// regardless of key_prefixing: `insert_cmp` does not touch key_prefix.
13486    #[test]
13487    fn test_key_prefixing_custom_comparator_no_prefix() {
13488        let cmp: KeyComparatorFn = Arc::new(|a: &[u8], b: &[u8]| a.cmp(b));
13489        let mut tree = Tree::new_with_comparator(1, 16, cmp);
13490        // Enable key_prefixing — should have no effect via insert_cmp path.
13491        tree.set_key_prefixing(true);
13492
13493        let lsn = noxu_util::Lsn::new(1, 10);
13494        for i in 0u8..8 {
13495            let key = vec![b'r', b'e', b'c', b'o', b'r', b'd', b':', i];
13496            tree.insert(key, vec![i], lsn).expect("insert");
13497        }
13498
13499        let root = tree.get_root().expect("root");
13500        let bin_arc = find_first_bin(&root);
13501        let guard = bin_arc.read();
13502        let TreeNode::Bottom(ref bin) = *guard else {
13503            panic!("must be a BIN");
13504        };
13505        // Custom-comparator path (insert_cmp) does not set key_prefix.
13506        assert!(
13507            bin.key_prefix.is_empty(),
13508            "custom-comparator path must not set key_prefix"
13509        );
13510    }
13511}
13512
13513// --- Part 1 acceptance tests: splitSpecial heuristic (DRIFT-1) ---
13514//
13515// JE `IN.splitSpecial` / `Tree.forceSplit`: when all routing decisions during
13516// descent are leftmost (`AllLeft`) or rightmost (`AllRight`), the split index
13517// is forced to 1 or `n-1` respectively instead of `n/2`. This halves the
13518// number of splits for monotonically increasing / decreasing key workloads
13519// (sequential append / prepend) because each split leaves the BIN near-full.
13520//
13521// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
13522#[cfg(test)]
13523mod split_special_tests {
13524    use super::*;
13525
13526    /// Test helper: descend the tree to the BIN that holds (or would hold)
13527    /// `key`, returning its arc.  Mirrors the read-path descent used by
13528    /// `Tree::search`; sufficient for unit tests that need to mutate a slot.
13529    fn find_bin_arc_for_key(
13530        node_arc: &Arc<RwLock<TreeNode>>,
13531        key: &[u8],
13532    ) -> Option<Arc<RwLock<TreeNode>>> {
13533        let mut current = node_arc.clone();
13534        loop {
13535            let next = {
13536                let g = current.read();
13537                match &*g {
13538                    TreeNode::Bottom(_) => return Some(current.clone()),
13539                    TreeNode::Internal(n) => {
13540                        if n.entries.is_empty() {
13541                            return None;
13542                        }
13543                        let mut idx = 0usize;
13544                        for (i, e) in n.entries.iter().enumerate() {
13545                            if i == 0 || e.key.as_slice() <= key {
13546                                idx = i;
13547                            } else {
13548                                break;
13549                            }
13550                        }
13551                        n.get_child(idx)?
13552                    }
13553                }
13554            };
13555            current = next;
13556        }
13557    }
13558
13559    /// Count total leaf (BIN) nodes in the tree by DFS.
13560    fn count_bins(node: &Arc<RwLock<TreeNode>>) -> usize {
13561        let g = node.read();
13562        match &*g {
13563            TreeNode::Bottom(_) => 1,
13564            TreeNode::Internal(n) => {
13565                n.resident_children().iter().map(count_bins).sum()
13566            }
13567        }
13568    }
13569
13570    /// Return total key count across all BINs.
13571    fn count_keys(node: &Arc<RwLock<TreeNode>>) -> usize {
13572        let g = node.read();
13573        match &*g {
13574            TreeNode::Bottom(b) => b.entries.len(),
13575            TreeNode::Internal(n) => {
13576                n.resident_children().iter().map(count_keys).sum()
13577            }
13578        }
13579    }
13580
13581    /// Returns the number of entries in the leftmost BIN.
13582    fn leftmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
13583        let g = node.read();
13584        match &*g {
13585            TreeNode::Bottom(b) => b.entries.len(),
13586            TreeNode::Internal(n) => {
13587                let first_child = n.child_ref(0).expect("child");
13588                leftmost_bin_size(first_child)
13589            }
13590        }
13591    }
13592
13593    /// Returns the number of entries in the rightmost BIN.
13594    fn rightmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
13595        let g = node.read();
13596        match &*g {
13597            TreeNode::Bottom(b) => b.entries.len(),
13598            TreeNode::Internal(n) => {
13599                let last_child = n
13600                    .child_ref(n.entries.len().saturating_sub(1))
13601                    .expect("child");
13602                rightmost_bin_size(last_child)
13603            }
13604        }
13605    }
13606
13607    /// `splitSpecial` ascending: each right-side split leaves the left BIN
13608    /// near-full (all but one entry stays). Compared to midpoint split
13609    /// the number of BINs created should be significantly fewer relative to
13610    /// keys inserted (more keys per BIN on average).
13611    ///
13612    /// JE criterion: `allRightSideDescent` → `splitIndex = nEntries - 1`.
13613    /// The penultimate entry stays in the left BIN; only one entry goes to
13614    /// the new right sibling, which then absorbs the next insert and fills
13615    /// normally.
13616    #[test]
13617    fn test_split_special_ascending_fewer_bins_than_midpoint() {
13618        let max_entries = 8usize;
13619        let n_keys = 200usize;
13620
13621        // Build tree with splitSpecial (ascending keys trigger AllRight).
13622        let tree_special = Tree::new(1, max_entries);
13623        let lsn = noxu_util::Lsn::new(1, 100);
13624        for i in 0u32..n_keys as u32 {
13625            let key = i.to_be_bytes().to_vec();
13626            tree_special.insert(key, vec![0u8], lsn).expect("insert");
13627        }
13628
13629        let root_special = tree_special.get_root().expect("root must exist");
13630        let bins_special = count_bins(&root_special);
13631        let keys_special = count_keys(&root_special);
13632
13633        // All keys must be present.
13634        assert_eq!(keys_special, n_keys, "all keys must be stored");
13635
13636        // With splitSpecial, each right-side split keeps n-1 entries in the
13637        // left BIN. Ideal: ceil(n_keys / (max_entries - 1)) BINs.
13638        // Without splitSpecial (midpoint): ceil(n_keys / (max_entries / 2)).
13639        // We assert the actual count is below the midpoint-split upper bound.
13640        let midpoint_upper_bound = n_keys.div_ceil(max_entries / 2);
13641        assert!(
13642            bins_special < midpoint_upper_bound,
13643            "splitSpecial should produce fewer BINs than midpoint split: \
13644             got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
13645        );
13646
13647        // The rightmost BIN must have fewer entries than max_entries
13648        // (the last insert only half-fills it at most), which is expected.
13649        // The IMPORTANT property: rightmost BIN started with exactly 1 entry
13650        // (its first entry was the split-off singleton) then filled up.
13651        // We just verify overall key density > midpoint baseline.
13652        let avg_fill = keys_special as f64 / bins_special as f64;
13653        let midpoint_fill = (max_entries / 2) as f64;
13654        assert!(
13655            avg_fill > midpoint_fill,
13656            "average fill per BIN with splitSpecial ({avg_fill:.1}) should \
13657             exceed midpoint baseline ({midpoint_fill})"
13658        );
13659    }
13660
13661    /// `splitSpecial` descending: all routing decisions are at slot 0
13662    /// (`AllLeft`). Split forces `split_index = 1` so the right sibling
13663    /// gets almost all entries and the left node keeps just one.
13664    ///
13665    /// JE criterion: `allLeftSideDescent` → `splitIndex = 1`.
13666    #[test]
13667    fn test_split_special_descending_fewer_bins_than_midpoint() {
13668        let max_entries = 8usize;
13669        let n_keys = 200usize;
13670
13671        let tree_special = Tree::new(1, max_entries);
13672        let lsn = noxu_util::Lsn::new(1, 100);
13673        for i in (0u32..n_keys as u32).rev() {
13674            let key = i.to_be_bytes().to_vec();
13675            tree_special.insert(key, vec![0u8], lsn).expect("insert");
13676        }
13677
13678        let root_special = tree_special.get_root().expect("root must exist");
13679        let bins_special = count_bins(&root_special);
13680        let keys_special = count_keys(&root_special);
13681
13682        assert_eq!(keys_special, n_keys, "all keys must be stored");
13683
13684        let midpoint_upper_bound = n_keys.div_ceil(max_entries / 2);
13685        assert!(
13686            bins_special < midpoint_upper_bound,
13687            "splitSpecial descending should produce fewer BINs: \
13688             got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
13689        );
13690    }
13691
13692    /// Random-key inserts must NOT be affected by splitSpecial: with random
13693    /// keys descent will rarely be all-left or all-right, so the split index
13694    /// defaults to midpoint and tree balance is maintained.
13695    #[test]
13696    fn test_split_special_random_inserts_stay_balanced() {
13697        use std::collections::BTreeSet;
13698
13699        let max_entries = 8usize;
13700        // Use a fixed permutation so the test is deterministic.
13701        let mut keys: Vec<u32> = (0u32..200).collect();
13702        // Knuth shuffle with a fixed seed.
13703        let mut rng: u64 = 0xdeadbeef_cafebabe;
13704        for i in (1..keys.len()).rev() {
13705            rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1);
13706            let j = (rng >> 33) as usize % (i + 1);
13707            keys.swap(i, j);
13708        }
13709
13710        let tree = Tree::new(1, max_entries);
13711        let lsn = noxu_util::Lsn::new(1, 100);
13712        let mut inserted = BTreeSet::new();
13713        for k in &keys {
13714            let key = k.to_be_bytes().to_vec();
13715            tree.insert(key, vec![0u8], lsn).expect("insert");
13716            inserted.insert(*k);
13717        }
13718
13719        let root = tree.get_root().expect("root");
13720        let total_keys = count_keys(&root);
13721        assert_eq!(
13722            total_keys,
13723            inserted.len(),
13724            "all random keys must be stored"
13725        );
13726
13727        // Verify every key is findable.
13728        for k in &inserted {
13729            let key = k.to_be_bytes().to_vec();
13730            let found = tree.search(&key);
13731            assert!(
13732                found.map(|r| r.is_exact_match()).unwrap_or(false),
13733                "random key {k} must be findable after insert"
13734            );
13735        }
13736    }
13737
13738    /// TREE-F1: a `known_deleted` BIN slot must read as ABSENT on an exact
13739    /// lookup and must be SKIPPED by scans, matching JE.
13740    ///
13741    /// JE contract:
13742    /// * `IN.findEntry` (IN.java:3197): an exact match that lands on a
13743    ///   known-deleted slot returns -1 (ABSENT).
13744    /// * `CursorImpl.lockAndGetCurrent` (CursorImpl.java:2062-2064): a
13745    ///   step that lands on `isEntryKnownDeleted(index)` returns null, so
13746    ///   the `getNext` loop advances past it (the slot is skipped).
13747    ///
13748    /// KD slots legitimately exist in live BINs during BIN-delta
13749    /// reconstitution (`mutate_to_full_bin` applies delta KD slots) until
13750    /// the compressor reclaims them.  We reach that state directly here by
13751    /// marking a slot known_deleted in the BIN arc, then assert the
13752    /// user-facing read/scan paths do not surface it.
13753    #[test]
13754    fn test_tree_f1_known_deleted_slot_is_absent_and_skipped() {
13755        let tree = Tree::new(1, 8);
13756        // Insert enough keys to populate a BIN with several live slots.
13757        for i in 0..6u32 {
13758            let key = format!("kd{i:04}").into_bytes();
13759            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
13760        }
13761
13762        // Pick a middle key and mark its slot known_deleted directly in the
13763        // BIN, modelling a delta-applied tombstone the compressor has not yet
13764        // reclaimed.
13765        let kd_key = b"kd0003".to_vec();
13766        {
13767            let root = tree.get_root().expect("root");
13768            let bin_arc = find_bin_arc_for_key(&root, &kd_key).expect("bin");
13769            let mut g = bin_arc.write();
13770            if let TreeNode::Bottom(b) = &mut *g {
13771                let idx = (0..b.entries.len())
13772                    .find(|&i| {
13773                        b.get_full_key(i).as_deref() == Some(kd_key.as_slice())
13774                    })
13775                    .expect("kd key slot");
13776                b.entries[idx].known_deleted = true;
13777            } else {
13778                panic!("expected BIN");
13779            }
13780        }
13781
13782        // (a) exact lookup via Tree::search must report NOT found.
13783        let sr = tree.search(&kd_key);
13784        assert!(
13785            !sr.map(|r| r.is_exact_match()).unwrap_or(false),
13786            "TREE-F1: Tree::search must report a known_deleted slot as absent \
13787             (IN.findEntry IN.java:3197)"
13788        );
13789
13790        // (a) exact lookup via Tree::search_with_data must report NOT found.
13791        let sf = tree.search_with_data(&kd_key).expect("slot fetch");
13792        assert!(
13793            !sf.found,
13794            "TREE-F1: Tree::search_with_data must report a known_deleted slot \
13795             as absent (IN.findEntry IN.java:3197)"
13796        );
13797
13798        // Live neighbours must still be found.
13799        for live in [b"kd0002".to_vec(), b"kd0004".to_vec()] {
13800            assert!(
13801                tree.search(&live).map(|r| r.is_exact_match()).unwrap_or(false),
13802                "live neighbour must remain findable"
13803            );
13804        }
13805
13806        // (b) a scan-facing BIN dump (descend_to_edge_bin / get_next_bin /
13807        // get_prev_bin) returns slots verbatim WITH the known_deleted flag
13808        // set, so the cursor can skip them (CursorImpl.java:2062-2064).  The
13809        // contract here is: the KD slot is never reported as a LIVE entry.
13810        let root = tree.get_root().expect("root");
13811        let edge = Tree::descend_to_edge_bin(&root, true).expect("edge bin");
13812        assert!(
13813            !edge.iter().any(|(e, _, k)| k == &kd_key && !e.known_deleted),
13814            "TREE-F1: scan must not surface a known_deleted slot as live \
13815             (CursorImpl.java:2062-2064)"
13816        );
13817        for anchor in [b"kd0000".to_vec(), b"kd0005".to_vec()] {
13818            for entries in
13819                [tree.get_next_bin(&anchor), tree.get_prev_bin(&anchor)]
13820                    .into_iter()
13821                    .flatten()
13822            {
13823                assert!(
13824                    !entries
13825                        .iter()
13826                        .any(|(e, _, k)| k == &kd_key && !e.known_deleted),
13827                    "TREE-F1: get_next_bin/get_prev_bin must not surface a \
13828                     known_deleted slot as live"
13829                );
13830            }
13831        }
13832
13833        // first_entry_at_or_after must skip a KD slot at the boundary.
13834        if let Some((k, _, _)) = tree.first_entry_at_or_after(&kd_key) {
13835            assert_ne!(
13836                k, kd_key,
13837                "TREE-F1: first_entry_at_or_after must skip a known_deleted \
13838                 slot (CursorImpl.java:2062-2064)"
13839            );
13840        }
13841
13842        // The compressor KD-iteration path must STILL see the slot — the fix
13843        // only changes the user-facing read predicate, not the maintenance
13844        // iteration that exists to reclaim KD slots.
13845        let kd_bins = tree.collect_bins_with_known_deleted();
13846        assert!(
13847            !kd_bins.is_empty(),
13848            "TREE-F1: collect_bins_with_known_deleted must still observe the \
13849             KD slot so the compressor can reclaim it"
13850        );
13851    }
13852}
noxu_tree/tree.rs

noxu_tree/
tree.rs