noxu_tree/tree.rs
1//! B+tree implementation.
2//!
3//!
4//! Tree implements the B+tree. It provides search, insert, and delete
5//! operations on the tree structure. The tree uses latch-coupling for
6//! concurrent access: when traversing down the tree, the parent latch
7//! is released after the child latch is acquired.
8//!
9//! # Architecture
10//!
11//! The tree has a hierarchical structure:
12//! - Internal Nodes (IN) at levels 2 and above
13//! - Bottom Internal Nodes (BIN) at level 1
14//! - Leaf Nodes (LN) containing actual data
15//!
16//! # Locking Strategy
17//!
18//! - Root latch protects the root pointer itself
19//! - Each node has its own latch for concurrent access
20//! - Search uses latch-coupling: acquire child, release parent
21//! - Modifications may require exclusive latches
22
23use crate::error::TreeError;
24use crate::key::{create_key_prefix, get_key_prefix_length};
25use crate::search_result::SearchResult;
26use noxu_latch::{LatchContext, SharedLatch};
27use noxu_util::{Lsn, NULL_LSN};
28use parking_lot::RwLock;
29use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
30use std::sync::{Arc, Weak};
31
/// Callback interface through which the tree reports residency changes of
/// its IN/BIN nodes — the counterpart of JE's `INList` feeding the evictor's
/// `LRUList`s.
///
/// Eviction policy does not live in the tree. The tree only tells a
/// registered listener when a node becomes resident, is touched, or leaves
/// the cache. The `Evictor` (crate `noxu-evictor`) is the implementor; since
/// that crate depends on `noxu-tree` and not the other way round, the tree
/// holds the listener purely as a trait object, which keeps the crate graph
/// free of a cycle.
///
/// JE reference: `IN.fetchTarget` / split / `rebuildINList` call
/// `Evictor.addBack`; node access calls `Evictor.moveBack`; node removal
/// calls `Evictor.remove`.
pub trait InListListener: Send + Sync {
    /// Invoked right after the node `node_id` became resident in the cache
    /// (JE `Evictor.addBack`).
    fn note_ins_added(&self, node_id: u64);

    /// Invoked when the resident node `node_id` was accessed — the LRU
    /// "touch" (JE `Evictor.moveBack`).
    fn note_ins_accessed(&self, node_id: u64);

    /// Invoked when the node `node_id` was removed from the cache
    /// (JE `Evictor.remove`).
    fn note_ins_removed(&self, node_id: u64);
}
52
// Level and flag constants, re-exported here so tree-internal code can use
// them without reaching into another module.

/// Bit 17 (`0x20000`).
pub const DBMAP_LEVEL: i32 = 1 << 17;
/// Bit 16 (`0x10000`).
pub const MAIN_LEVEL: i32 = 1 << 16;
/// Mask selecting the low 16 bits of a level value.
pub const LEVEL_MASK: i32 = 0xffff;
/// Sentinel level value (`-1`).
pub const MIN_LEVEL: i32 = -1;
/// Level of a BIN: `MAIN_LEVEL` combined with level number 1.
pub const BIN_LEVEL: i32 = MAIN_LEVEL | 1;
/// Flag bit 16 (`0x10000`); numerically equal to `MAIN_LEVEL`.
pub const EXACT_MATCH: i32 = 0x1_0000;
/// Flag bit 17 (`0x20000`); numerically equal to `DBMAP_LEVEL`.
pub const INSERT_SUCCESS: i32 = 0x2_0000;
61
62/// Per-slot fixed memory overhead for a BIN entry, in bytes (DBI-23).
63///
64/// This is the heap footprint of one `BinEntry` *struct* as it lives inside
65/// the BIN's `Vec<BinEntry>` buffer — NOT counting the variable-length key and
66/// data bytes, which are separate heap allocations counted on top of this.
67///
68/// Faithful to JE `IN.getEntryInMemorySize` + the per-slot `entryStates` /
69/// LSN-array overhead folded into `IN.computeMemorySize` (IN.java ~4632):
70/// JE measures the slot's fixed cost with `Sizeof` on the JVM; Rust has a
71/// fixed struct layout so `size_of::<BinEntry>()` is exact.
72///
73/// T-2/T-3: the per-slot `key` (`Vec<u8>` header) and `lsn` (`u64`) were
74/// hoisted out of `BinEntry` into the node-level `KeyRep`/`LsnRep`. The
75/// `size_of::<BinEntry>()` therefore shrank; we add back the packed per-slot
76/// LSN-rep cost (`LsnRep::BYTES_PER_LSN_ENTRY`, 4 bytes) so the incremental
77/// live counter still approximates the walked heap (the key bytes are charged
78/// separately as `key.len()` at the call site, matching the compact key rep).
79///
80/// Derived (not hard-coded) so a layout change to `BinEntry` is tracked
81/// automatically — see `bin_stub_conformance` for the drift guard.
82pub const BIN_ENTRY_OVERHEAD: usize =
83 std::mem::size_of::<BinEntry>() + LsnRep::BYTES_PER_LSN_ENTRY;
84
85/// Per-slot fixed memory overhead for an IN entry, in bytes (DBI-23).
86///
87/// Heap footprint of one `InEntry` struct inside the IN's `Vec<InEntry>`
88/// buffer (key bytes counted separately). JE `IN.getEntryInMemorySize` for
89/// an upper IN plus the per-slot state/LSN/target overhead from
90/// `IN.computeMemorySize`.
91pub const IN_ENTRY_OVERHEAD: usize = std::mem::size_of::<InEntry>();
92
/// Shared, thread-safe key comparator used by sorted-duplicate databases.
///
/// The callable receives two full (uncompressed) keys and returns how the
/// first orders relative to the second. In a sorted-dup database this is
/// `DupKeyData::compare`, which splits each key into its primary and data
/// parts and compares each part with its own comparator. Normal databases
/// carry no comparator (`None`) and fall back to lexicographic byte order.
///
/// JE counterpart: `DatabaseImpl.btreeComparator` /
/// `DatabaseImpl.dupComparator`.
pub type KeyComparatorFn = Arc<dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering + Send + Sync>;
104
105/// Combined search result carrying slot data and the BIN arc, returned by
106/// [`Tree::search_with_data`].
107///
108/// Avoids the double-descent pattern where `Tree::search` checked key
109/// existence and a second call re-descended to fetch the actual slot bytes.
110/// One descent now serves both purposes (Wave-11-I optimisation).
111pub struct SlotFetch {
112 /// `true` if an exact key match was found and is not expired.
113 pub found: bool,
114 /// Data bytes for the slot (`None` when `found` is `false`).
115 pub data: Option<Vec<u8>>,
116 /// Raw slot LSN as `u64`; zero when `found` is `false`.
117 pub lsn: u64,
118 /// Slot index within the BIN. Set to the actual BIN slot index when
119 /// `found` is `true`; `0` otherwise.
120 ///
121 /// Used by `CursorImpl` to set `current_index` correctly so that
122 /// `retrieve_next` advances to the right slot after a search.
123 pub slot_index: usize,
124 /// Arc to the BIN that the descent reached. Always `Some` when the
125 /// tree has at least one node, regardless of whether `found` is `true`.
126 pub bin_arc: Arc<RwLock<TreeNode>>,
127}
128
129/// The B+tree.
130///
131///
132///
133/// This is the main tree structure that manages the B+tree nodes and
134/// provides operations for search, insert, delete, and tree maintenance.
135pub struct Tree {
136 /// Database ID this tree belongs to.
137 database_id: u64,
138
139 /// Maximum entries per node (from config).
140 max_entries_per_node: usize,
141
142 /// Root of the tree. None if tree is empty.
143 ///
144 /// Wrapped in `RwLock` so that `insert`, `delete`, and other mutating
145 /// operations can take `&self` (interior mutability), enabling concurrent
146 /// access to different BIN nodes without requiring a global `&mut Tree`
147 /// borrow. The root pointer itself is only written during root splits
148 /// and initial creation; all other access is read-only.
149 ///
150 /// `Tree.root` protected by the root latch.
151 root: RwLock<Option<Arc<RwLock<TreeNode>>>>,
152
153 /// Latch protecting the root reference itself.
154 /// Must be held when changing the root pointer.
155 root_latch: SharedLatch,
156
157 /// LSN at which the current root IN/BIN was last logged.
158 ///
159 /// Used by the IN-redo currency check (`recover_root_bin` /
160 /// `recover_root_upper_in`) to decide whether a logged root replaces the
161 /// in-memory one. Updated whenever a new root is installed via
162 /// `set_root_with_lsn` or the IN-redo recover-root path.
163 ///
164 /// JE `RootUpdater.originalLsn` / `ChildReference.getLsn()` for the root.
165 root_log_lsn: RwLock<noxu_util::Lsn>,
166
167 /// Statistics: number of times the root has been split.
168 root_splits: AtomicU64,
169
170 /// Statistics: number of latch upgrades from shared to exclusive.
171 relatches_required: AtomicU64,
172
173 /// Optional custom key comparator for sorted-duplicate databases.
174 ///
175 /// When `Some`, all key comparisons in tree traversal (upper IN routing
176 /// and BIN entry search/insert/delete) use this comparator instead of
177 /// lexicographic byte comparison.
178 ///
179 /// / `dupComparator` stored on the
180 /// database and consulted at every `IN.findEntry()` call.
181 pub key_comparator: Option<KeyComparatorFn>,
182
183 /// Shared memory counter for the evictor / MemoryBudget.
184 ///
185 /// Updated on every BIN entry insert (+key+data+overhead) and delete
186 /// (-key+overhead) so the evictor sees real cache pressure.
187 ///
188 /// `env.getMemoryBudget().updateTreeMemoryUsage(delta)` call
189 /// in the equivalent `IN.updateMemorySize()`. In Noxu the counter is an
190 /// `Arc<AtomicI64>` shared with the `Arbiter` (and later `MemoryBudget`)
191 /// to avoid a circular crate dependency (`noxu-tree` → `noxu-dbi`).
192 pub memory_counter: Option<Arc<AtomicI64>>,
193
194 /// Optional listener fed on node add/access/remove, mirroring JE's
195 /// `INList` feeding the evictor's `LRUList`s.
196 ///
197 /// When `None` (the default — used by unit tests with no environment),
198 /// the notifications are no-ops. `EnvironmentImpl` installs the
199 /// `Evictor` here so production inserts/accesses populate the LRU lists
200 /// the evictor drains.
201 ///
202 /// JE reference: `IN.fetchTarget`/split/`rebuildINList` → `addBack`,
203 /// access → `moveBack`, removal → `remove`.
204 pub in_list_listener: Option<Arc<dyn InListListener>>,
205
206 /// Optional log manager so an evicted root IN can be re-materialized from
207 /// its persisted `root_log_lsn` on the next access (EV-14, piece B).
208 ///
209 /// JE's `Tree` reaches the log via `database.getEnv().getLogManager()`;
210 /// `Tree.getRootINRootAlreadyLatched` calls `root.fetchTarget(...)` which
211 /// reads the root IN back from its `ChildReference` LSN when the in-memory
212 /// target is null (Tree.java:477-516, ChildReference.fetchTarget). Noxu
213 /// has no env back-reference here, so the log manager is installed
214 /// directly (the same one-way wiring as `in_list_listener`). When `None`
215 /// (unit tests with no environment), an evicted root cannot be re-fetched
216 /// — but `evict_root` refuses to evict without a log manager, so the root
217 /// is never made non-resident in that configuration.
218 pub log_manager: Option<Arc<noxu_log::LogManager>>,
219
220 /// Capacity hint for the recovery redo path.
221 ///
222 /// When non-zero, the first BIN created by `redo_insert` (the first-key
223 /// path) pre-allocates its `entries` Vec with this capacity so that
224 /// redo insertions proceed without Vec-resize doublings. The value is
225 /// clamped to `max_entries_per_node` at use.
226 ///
227 /// Set by `hint_redo_capacity` before the redo loop.
228 /// Wave 11-K optimisation (Fix 3).
229 redo_capacity_hint: usize,
230
231 /// Whether key-prefix compression is enabled for this tree's BINs.
232 ///
233 /// JE `DatabaseImpl.getKeyPrefixing()` / `DatabaseConfig.setKeyPrefixing()`.
234 /// When `false`, `IN.computeKeyPrefix` returns `null` in JE — no prefix
235 /// is ever set. Noxu mirrors this: `insert_with_prefix` is skipped in
236 /// favour of `insert_raw`, and `recompute_key_prefix` is not called on
237 /// BIN halves after a split.
238 ///
239 /// Default: `false` (matches JE's `DatabaseConfig.KEY_PREFIXING_DEFAULT`).
240 ///
241 /// Ref: `IN.java computeKeyPrefix` ~line 2456.
242 pub key_prefixing: bool,
243 /// T-5: maximum post-prefix key length (bytes) for the compact key rep
244 /// (`INKeyRep.MaxKeySize`). A node packs all its keys into one fixed-width
245 /// byte array when every post-prefix key is `<=` this length; a longer key
246 /// inflates the node to the `Default` rep. `<= 0` disables the compact
247 /// rep entirely.
248 ///
249 /// Default 16 (`TREE_COMPACT_MAX_KEY_LENGTH` /
250 /// `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH`). Wired from
251 /// `EnvironmentConfig` via `Tree::set_compact_max_key_length`
252 /// (`IN.getCompactMaxKeyLength`, IN.java:4929).
253 pub compact_max_key_length: i32,
254}
255
256/// A node in the tree.
257///
258/// TreeNode wraps an upper IN or a BIN. Each variant carries a lightweight
259/// stub whose fields mirror the persistent IN/BIN structure. The stubs will
260/// be replaced with full InNode/Bin types as the implementation matures; the
261/// API surface here is intentionally minimal.
262#[derive(Debug)]
263pub enum TreeNode {
264 /// Internal Node (IN) - non-leaf node in the tree.
265 Internal(InNodeStub),
266
267 /// Bottom Internal Node (BIN) - leaf-level internal node.
268 Bottom(BinStub),
269}
270
271/// Type alias for a resident child pointer.
272pub type ChildArc = Arc<RwLock<TreeNode>>;
273
274/// T-4: per-node representation of the resident-child-pointer array.
275///
276/// Faithful to JE `INTargetRep` (`INTargetRep.java`), the abstract array of
277/// target pointers to an IN's cached children. These arrays are usually
278/// sparse — most upper INs have NO resident children — so JE never stores a
279/// full per-slot `Node[]` until many children are actually cached:
280///
281/// * `None` — `INTargetRep.None`: a shared singleton, 0 child-pointer
282/// bytes, used when no children are cached (the common case for upper
283/// INs). `get` returns null for every slot.
284/// * `Sparse` — `INTargetRep.Sparse`: a small parallel `(index, target)[]`
285/// for 1..=`MAX_ENTRIES` cached children (JE caps at 4). `get(j)` is a
286/// linear scan of the index array.
287/// * `Default`— `INTargetRep.Default`: the full `Vec<Option<Arc>>`, one
288/// slot per entry, used once more than `MAX_ENTRIES` children are
289/// resident.
290///
291/// A node starts `None` and grows `None → Sparse → Default`. JE does not
292/// shrink back when entries are nulled (it only compacts on IN-stripping) to
293/// avoid transitionary rep churn; we follow the same policy — `set_child` only
294/// inflates, and `compact()` (called on eviction/stripping) collapses an
295/// empty/small `Default`/`Sparse` back toward `None`.
296#[derive(Debug)]
297pub enum TargetRep {
298 /// `INTargetRep.None` — no children cached (shared-singleton semantics).
299 None,
300 /// `INTargetRep.Sparse` — a few cached children, `(slot_index, child)`.
301 /// Invariant: `len() <= SPARSE_MAX_ENTRIES`.
302 Sparse(Vec<(u16, ChildArc)>),
303 /// `INTargetRep.Default` — full parallel array, one slot per entry.
304 Default(Vec<Option<ChildArc>>),
305}
306
307impl TargetRep {
308 /// `INTargetRep.Sparse.MAX_ENTRIES` (INTargetRep.java) — the maximum
309 /// number of cached children the `Sparse` rep holds before inflating to
310 /// `Default`.
311 pub const SPARSE_MAX_ENTRIES: usize = 4;
312
313 /// `INTargetRep.get(idx)` — the cached child for slot `idx`, or `None`.
314 #[inline]
315 pub fn get(&self, idx: usize) -> Option<&ChildArc> {
316 match self {
317 TargetRep::None => None,
318 TargetRep::Sparse(v) => {
319 v.iter().find(|(i, _)| *i as usize == idx).map(|(_, c)| c)
320 }
321 TargetRep::Default(v) => v.get(idx).and_then(|o| o.as_ref()),
322 }
323 }
324
325 /// `INTargetRep.set(idx, node, parent)` — set (or clear, when `node` is
326 /// `None`) the cached child for slot `idx`, mutating the representation
327 /// upward (`None → Sparse → Default`) as needed.
328 pub fn set(&mut self, idx: usize, node: Option<ChildArc>) {
329 match self {
330 TargetRep::None => {
331 // INTargetRep.None.set: clearing stays None; setting mutates
332 // to a Sparse rep and sets there.
333 if let Some(child) = node {
334 *self = TargetRep::Sparse(vec![(idx as u16, child)]);
335 }
336 }
337 TargetRep::Sparse(v) => {
338 // Update existing slot in place.
339 if let Some(pos) =
340 v.iter().position(|(i, _)| *i as usize == idx)
341 {
342 match node {
343 Some(child) => v[pos].1 = child,
344 None => {
345 v.swap_remove(pos);
346 }
347 }
348 return;
349 }
350 // New child: clearing a non-present slot is a no-op.
351 let Some(child) = node else { return };
352 if v.len() < Self::SPARSE_MAX_ENTRIES {
353 v.push((idx as u16, child));
354 return;
355 }
356 // Full — INTargetRep.Sparse.set mutates to Default.
357 let cap = v.iter().map(|(i, _)| *i as usize).max().unwrap_or(0);
358 let cap = cap.max(idx) + 1;
359 let mut def: Vec<Option<ChildArc>> = vec![None; cap];
360 for (i, c) in v.drain(..) {
361 def[i as usize] = Some(c);
362 }
363 def[idx] = Some(child);
364 *self = TargetRep::Default(def);
365 }
366 TargetRep::Default(v) => {
367 if idx >= v.len() {
368 if node.is_none() {
369 return;
370 }
371 v.resize_with(idx + 1, || None);
372 }
373 v[idx] = node;
374 }
375 }
376 }
377
378 /// `INTargetRep.None`-aware take: remove and return the cached child for
379 /// slot `idx`, leaving the slot empty (JE `IN.setTarget(idx, null)` plus
380 /// returning the old target).
381 pub fn take(&mut self, idx: usize) -> Option<ChildArc> {
382 match self {
383 TargetRep::None => None,
384 TargetRep::Sparse(v) => v
385 .iter()
386 .position(|(i, _)| *i as usize == idx)
387 .map(|pos| v.swap_remove(pos).1),
388 TargetRep::Default(v) => v.get_mut(idx).and_then(|o| o.take()),
389 }
390 }
391
392 /// JE `INArrayRep.copy(from, to, n, parent)` adapted to slice ops: shift
393 /// the child mapping when an entry is INSERTED at `idx` (all children at
394 /// slots `>= idx` move up by one). Mirrors how `Vec::insert` shifts the
395 /// parallel `entries` array.
396 pub fn insert_shift(&mut self, idx: usize) {
397 match self {
398 TargetRep::None => {}
399 TargetRep::Sparse(v) => {
400 for (i, _) in v.iter_mut() {
401 if (*i as usize) >= idx {
402 *i += 1;
403 }
404 }
405 }
406 TargetRep::Default(v) => {
407 if idx <= v.len() {
408 v.insert(idx, None);
409 }
410 }
411 }
412 }
413
414 /// JE `INArrayRep.copy` adapted: shift the child mapping when the entry at
415 /// `idx` is REMOVED (all children at slots `> idx` move down by one; the
416 /// child at `idx` itself is dropped). Mirrors `Vec::remove`.
417 pub fn remove_shift(&mut self, idx: usize) {
418 match self {
419 TargetRep::None => {}
420 TargetRep::Sparse(v) => {
421 v.retain(|(i, _)| *i as usize != idx);
422 for (i, _) in v.iter_mut() {
423 if (*i as usize) > idx {
424 *i -= 1;
425 }
426 }
427 }
428 TargetRep::Default(v) => {
429 if idx < v.len() {
430 v.remove(idx);
431 }
432 }
433 }
434 }
435
436 /// `INTargetRep.compact(parent)` — collapse toward the most compact rep:
437 /// an empty rep becomes `None`; a `Default` with `<= MAX_ENTRIES` children
438 /// becomes `Sparse` (or `None`). Called when an IN is stripped/evicted.
439 pub fn compact(&mut self) {
440 let count = self.resident_count();
441 if count == 0 {
442 *self = TargetRep::None;
443 return;
444 }
445 if count <= Self::SPARSE_MAX_ENTRIES
446 && let TargetRep::Default(v) = self
447 {
448 let sparse: Vec<(u16, ChildArc)> = v
449 .iter()
450 .enumerate()
451 .filter_map(|(i, o)| o.as_ref().map(|c| (i as u16, c.clone())))
452 .collect();
453 *self = TargetRep::Sparse(sparse);
454 }
455 }
456
457 /// Number of resident (non-null) children.
458 pub fn resident_count(&self) -> usize {
459 match self {
460 TargetRep::None => 0,
461 TargetRep::Sparse(v) => v.len(),
462 TargetRep::Default(v) => v.iter().filter(|o| o.is_some()).count(),
463 }
464 }
465
466 /// True if no children are cached (`INTargetRep.None` or empty).
467 pub fn is_empty(&self) -> bool {
468 self.resident_count() == 0
469 }
470
471 /// Iterate every resident child (in unspecified order).
472 pub fn iter_children(&self) -> Box<dyn Iterator<Item = ChildArc> + '_> {
473 match self {
474 TargetRep::None => Box::new(std::iter::empty()),
475 TargetRep::Sparse(v) => Box::new(v.iter().map(|(_, c)| c.clone())),
476 TargetRep::Default(v) => {
477 Box::new(v.iter().filter_map(|o| o.clone()))
478 }
479 }
480 }
481
482 /// `INTargetRep.calculateMemorySize()` — heap bytes of the rep itself
483 /// (excluding the children it points at). `None` is 0 (shared singleton),
484 /// matching `INTargetRep.None.calculateMemorySize() == 0`.
485 pub fn memory_size(&self) -> usize {
486 use std::mem::size_of;
487 match self {
488 TargetRep::None => 0,
489 TargetRep::Sparse(v) => v.capacity() * size_of::<(u16, ChildArc)>(),
490 TargetRep::Default(v) => {
491 v.capacity() * size_of::<Option<ChildArc>>()
492 }
493 }
494 }
495}
496
497/// T-3: node-level packed LSN array — `IN.entryLsnByteArray` /
498/// `IN.entryLsnLongArray` (IN.java:251-289, getLsn/setLsnInternal
499/// IN.java:1752-1935).
500///
501/// JE stores one LSN per slot. A naive `Lsn` (u64) costs 8 bytes/slot even
502/// though most LSNs in a node share a file number and have a file offset that
503/// fits in 3 bytes. JE's compact rep is a single `byte[]` with
504/// `BYTES_PER_LSN_ENTRY == 4` bytes per slot:
505///
506/// * `base_file_number` is the lowest file number of any non-NULL LSN in the
507/// node;
508/// * byte 0 of each slot = `file_number - base_file_number` (0..=127,
509/// `Byte.MAX_VALUE`);
510/// * bytes 1..4 = the 3-byte little-endian file offset (max
511/// `MAX_FILE_OFFSET == 0xff_fffe`).
512///
513/// The NULL_LSN blocker (Noxu `NULL_LSN == u64::MAX`) is solved EXACTLY as JE
514/// does it: NULL is NOT stored as the raw u64; the slot's 3 file-offset bytes
515/// are set to `0xff_ffff` (`THREE_BYTE_NEGATIVE_ONE`), a value `MAX_FILE_OFFSET`
516/// can never reach, and `get_lsn` maps it back to `NULL_LSN`.
517///
518/// If a file-number difference exceeds 127 or a file offset exceeds
519/// `MAX_FILE_OFFSET`, the rep mutates to `Long` (one `u64` per slot), matching
520/// JE's `mutateToLongArray` (IN.java:1924). An all-NULL node uses `Empty`
521/// (0 bytes), matching the EMPTY_REP/initial-capacity-free state.
522#[derive(Debug)]
523pub enum LsnRep {
524 /// All slots NULL — 0 heap bytes (the `byteArray == null` initial state).
525 Empty,
526 /// `IN.entryLsnByteArray` — 4 bytes/slot, `base_file_number`-relative.
527 Compact { base_file_number: u32, bytes: Vec<u8> },
528 /// `IN.entryLsnLongArray` — 8 bytes/slot fallback after `mutateToLongArray`.
529 Long(Vec<Lsn>),
530}
531
532impl LsnRep {
533 /// `IN.BYTES_PER_LSN_ENTRY` (IN.java:151).
534 pub const BYTES_PER_LSN_ENTRY: usize = 4;
535 /// `IN.MAX_FILE_OFFSET` (IN.java:152) — max file offset the 3-byte form holds.
536 const MAX_FILE_OFFSET: u32 = 0x00ff_fffe;
537 /// `IN.THREE_BYTE_NEGATIVE_ONE` (IN.java:153) — the NULL sentinel in the
538 /// 3 file-offset bytes.
539 const THREE_BYTE_NEGATIVE_ONE: u32 = 0x00ff_ffff;
540 /// `Byte.MAX_VALUE` — max file-number difference the 1-byte offset holds.
541 const MAX_FILE_NUMBER_OFFSET: u32 = 127;
542
543 /// A rep sized for `n` slots, all NULL. Returns `Empty` (0 bytes); the
544 /// Compact byte array is lazily allocated by the first non-NULL `set_lsn`
545 /// — `base_file_number` is unknown until then (IN.java:1820, the
546 /// `baseFileNumber == -1` first-entry case).
547 #[inline]
548 pub fn new(_n: usize) -> Self {
549 LsnRep::Empty
550 }
551
552 /// Build a rep from a per-slot `Lsn` slice (used by node construction and
553 /// split, where slots arrive together). Equivalent to `new(lsns.len())`
554 /// followed by `set(i, lsns[i])` for each slot.
555 pub fn from_lsns(lsns: &[Lsn]) -> Self {
556 let mut rep = LsnRep::Empty;
557 let n = lsns.len();
558 for (i, &lsn) in lsns.iter().enumerate() {
559 rep.set(i, lsn, n);
560 }
561 rep
562 }
563
564 /// `IN.getLsn(idx)` (IN.java:1752).
565 pub fn get(&self, idx: usize) -> Lsn {
566 match self {
567 LsnRep::Empty => NULL_LSN,
568 LsnRep::Long(v) => v.get(idx).copied().unwrap_or(NULL_LSN),
569 LsnRep::Compact { base_file_number, bytes } => {
570 let off = idx * Self::BYTES_PER_LSN_ENTRY;
571 if off + Self::BYTES_PER_LSN_ENTRY > bytes.len() {
572 return NULL_LSN;
573 }
574 let file_offset = Self::get_3byte(bytes, off + 1);
575 if file_offset == Self::THREE_BYTE_NEGATIVE_ONE {
576 NULL_LSN
577 } else {
578 let file_number = base_file_number + bytes[off] as u32;
579 Lsn::new(file_number, file_offset)
580 }
581 }
582 }
583 }
584
585 /// `IN.setLsnInternal(idx, value)` (IN.java:1801) — set the LSN of slot
586 /// `idx`, mutating Empty→Compact→Long as necessary. `n` is the node's
587 /// slot count (sizes a freshly-allocated Compact array).
588 pub fn set(&mut self, idx: usize, lsn: Lsn, n: usize) {
589 // Empty: first non-NULL value allocates the Compact array; a NULL set
590 // on an Empty rep is a no-op (all slots already read NULL).
591 if let LsnRep::Empty = self {
592 if lsn.is_null() {
593 return;
594 }
595 let cap = n.max(idx + 1);
596 *self = LsnRep::Compact {
597 base_file_number: lsn.file_number(),
598 bytes: vec![0u8; cap * Self::BYTES_PER_LSN_ENTRY],
599 };
600 // Mark every other slot NULL (3-byte offset = 0xffffff).
601 if let LsnRep::Compact { bytes, .. } = self {
602 for s in 0..cap {
603 if s != idx {
604 Self::put_3byte(
605 bytes,
606 s * Self::BYTES_PER_LSN_ENTRY + 1,
607 Self::THREE_BYTE_NEGATIVE_ONE,
608 );
609 }
610 }
611 }
612 self.set(idx, lsn, n);
613 return;
614 }
615
616 if let LsnRep::Long(v) = self {
617 if idx >= v.len() {
618 v.resize(idx + 1, NULL_LSN);
619 }
620 v[idx] = lsn;
621 return;
622 }
623
624 // Compact path.
625 let LsnRep::Compact { base_file_number, bytes } = self else {
626 unreachable!()
627 };
628 let need = (idx + 1) * Self::BYTES_PER_LSN_ENTRY;
629 if need > bytes.len() {
630 let old = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
631 bytes.resize(need, 0);
632 for s in old..(idx + 1) {
633 Self::put_3byte(
634 bytes,
635 s * Self::BYTES_PER_LSN_ENTRY + 1,
636 Self::THREE_BYTE_NEGATIVE_ONE,
637 );
638 }
639 }
640 let off = idx * Self::BYTES_PER_LSN_ENTRY;
641
642 if lsn.is_null() {
643 // IN.java:1812 — file-number offset 0, file offset -1 (0xffffff).
644 bytes[off] = 0;
645 Self::put_3byte(bytes, off + 1, Self::THREE_BYTE_NEGATIVE_ONE);
646 return;
647 }
648
649 let this_file_number = lsn.file_number();
650 let this_file_offset = lsn.file_offset();
651
652 // Whether to fall back to the Long rep.
653 let mutate = this_file_offset > Self::MAX_FILE_OFFSET || {
654 if this_file_number < *base_file_number {
655 // IN.java:1827 — try to re-base downward; bail if any existing
656 // slot would then exceed the 1-byte file-number offset.
657 !Self::adjust_file_numbers(
658 bytes,
659 *base_file_number,
660 this_file_number,
661 )
662 } else {
663 this_file_number - *base_file_number
664 > Self::MAX_FILE_NUMBER_OFFSET
665 }
666 };
667
668 if mutate {
669 // IN.java:1924 mutateToLongArray.
670 let nelts = bytes.len() / Self::BYTES_PER_LSN_ENTRY;
671 let mut longs = vec![NULL_LSN; nelts.max(idx + 1)];
672 for (s, slot) in longs.iter_mut().enumerate().take(nelts) {
673 *slot = self_get_compact(*base_file_number, bytes, s);
674 }
675 longs[idx] = lsn;
676 *self = LsnRep::Long(longs);
677 return;
678 }
679
680 if this_file_number < *base_file_number {
681 *base_file_number = this_file_number;
682 }
683 bytes[off] = (this_file_number - *base_file_number) as u8;
684 Self::put_3byte(bytes, off + 1, this_file_offset);
685 }
686
687 /// `IN.adjustFileNumbers` (IN.java:1855) — re-base to a lower file number,
688 /// rewriting every existing slot's 1-byte offset. Returns false (and
689 /// leaves `bytes` unchanged) if any slot would overflow the 1-byte offset.
690 fn adjust_file_numbers(
691 bytes: &mut [u8],
692 old_base: u32,
693 new_base: u32,
694 ) -> bool {
695 let stride = Self::BYTES_PER_LSN_ENTRY;
696 // First pass: verify none overflow.
697 let mut i = 0;
698 while i < bytes.len() {
699 if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
700 let cur_fn = old_base + bytes[i] as u32;
701 if cur_fn - new_base > Self::MAX_FILE_NUMBER_OFFSET {
702 return false;
703 }
704 }
705 i += stride;
706 }
707 // Second pass: apply.
708 let mut i = 0;
709 while i < bytes.len() {
710 if Self::get_3byte(bytes, i + 1) != Self::THREE_BYTE_NEGATIVE_ONE {
711 let cur_fn = old_base + bytes[i] as u32;
712 bytes[i] = (cur_fn - new_base) as u8;
713 }
714 i += stride;
715 }
716 true
717 }
718
719 /// `INArrayRep.copy` analogue: shift LSNs when an entry is inserted at
720 /// `idx` (slots `>= idx` move up one). Mirrors `targets.insert_shift`.
721 pub fn insert_shift(&mut self, idx: usize, n: usize) {
722 match self {
723 LsnRep::Empty => {}
724 LsnRep::Long(v) => {
725 if idx <= v.len() {
726 v.insert(idx, NULL_LSN);
727 }
728 }
729 LsnRep::Compact { bytes, .. } => {
730 let stride = Self::BYTES_PER_LSN_ENTRY;
731 let cap = (n.max((bytes.len() / stride) + 1)) * stride;
732 bytes.resize(cap, 0);
733 let at = idx * stride;
734 // Shift the tail up by one slot.
735 bytes.copy_within(at..cap - stride, at + stride);
736 // The new slot reads NULL.
737 Self::put_3byte(bytes, at + 1, Self::THREE_BYTE_NEGATIVE_ONE);
738 }
739 }
740 }
741
742 /// `INArrayRep.copy` analogue: shift LSNs when entry `idx` is removed
743 /// (slots `> idx` move down one). Mirrors `targets.remove_shift`.
744 pub fn remove_shift(&mut self, idx: usize) {
745 match self {
746 LsnRep::Empty => {}
747 LsnRep::Long(v) => {
748 if idx < v.len() {
749 v.remove(idx);
750 }
751 }
752 LsnRep::Compact { bytes, .. } => {
753 let stride = Self::BYTES_PER_LSN_ENTRY;
754 let at = idx * stride;
755 if at + stride <= bytes.len() {
756 bytes.copy_within(at + stride.., at);
757 let newlen = bytes.len() - stride;
758 bytes.truncate(newlen);
759 }
760 }
761 }
762 }
763
764 /// `IN.computeLsnOverhead` analogue: heap bytes of the rep itself.
765 pub fn memory_size(&self) -> usize {
766 use std::mem::size_of;
767 match self {
768 LsnRep::Empty => 0,
769 LsnRep::Compact { bytes, .. } => bytes.capacity(),
770 LsnRep::Long(v) => v.capacity() * size_of::<Lsn>(),
771 }
772 }
773
774 fn put_3byte(bytes: &mut [u8], offset: usize, value: u32) {
775 bytes[offset] = (value & 0xFF) as u8;
776 bytes[offset + 1] = ((value >> 8) & 0xFF) as u8;
777 bytes[offset + 2] = ((value >> 16) & 0xFF) as u8;
778 }
779
780 fn get_3byte(bytes: &[u8], offset: usize) -> u32 {
781 (bytes[offset] as u32)
782 | ((bytes[offset + 1] as u32) << 8)
783 | ((bytes[offset + 2] as u32) << 16)
784 }
785}
786
787/// Helper used by `LsnRep::set` during `mutateToLongArray` to read an existing
788/// Compact slot without borrowing `self` (which is mid-mutation).
789fn self_get_compact(base_file_number: u32, bytes: &[u8], idx: usize) -> Lsn {
790 let off = idx * LsnRep::BYTES_PER_LSN_ENTRY;
791 let file_offset = LsnRep::get_3byte(bytes, off + 1);
792 if file_offset == LsnRep::THREE_BYTE_NEGATIVE_ONE {
793 NULL_LSN
794 } else {
795 Lsn::new(base_file_number + bytes[off] as u32, file_offset)
796 }
797}
798
/// Default maximum key length for the compact key rep:
/// `INKeyRep.MaxKeySize.DEFAULT_MAX_KEY_LENGTH` (INKeyRep.java), which is
/// also the default of the `TREE_COMPACT_MAX_KEY_LENGTH` config setting.
// The mixed-case name deliberately echoes the JE class name, hence the lint
// exemption.
#[allow(non_upper_case_globals)]
pub const INKeyRep_DEFAULT_MAX_KEY_LENGTH: i32 = 0x10;
803
/// T-2: the node-level key array — `INKeyRep.{Default,MaxKeySize}`
/// (INKeyRep.java).
///
/// Keys used to live per slot in `BinEntry`/`InEntry` as a `Vec<u8>` (a
/// 24-byte header plus one heap allocation per key). They are hoisted into
/// this node-level rep instead. As long as every (post-prefix) key of the
/// node is `<=` `TREE_COMPACT_MAX_KEY_LENGTH` (default 16), the keys are
/// packed into ONE fixed-width byte buffer (`MaxKeySize`): `slot_width` bytes
/// per slot, with a parallel `lengths` vector recording each key's real
/// length. A key longer than the threshold inflates the whole node to the
/// `Default` rep (one `Vec<u8>` per slot), as JE's `Default.compact` /
/// `MaxKeySize.expandToDefaultRep` do.
///
/// Like JE, the rep stores the UNPREFIXED suffix (key prefixing strips the
/// common prefix first), so the compact rep holds the smaller post-prefix
/// bytes.
#[derive(Debug, Clone)]
pub enum KeyRep {
    /// `INKeyRep.Default` — one owned key per slot, of any length.
    Default(Vec<Vec<u8>>),
    /// `INKeyRep.MaxKeySize` — every key packed into one fixed-width buffer.
    /// `buf.len() == slot_width * lengths.len()`, and slot `i` occupies
    /// `buf[i*slot_width .. i*slot_width + lengths[i]]`.
    Compact {
        buf: Vec<u8>,
        slot_width: usize,
        lengths: Vec<u16>,
    },
}
827
828impl KeyRep {
829 /// An empty `Default` rep.
830 #[inline]
831 pub fn new() -> Self {
832 KeyRep::Default(Vec::new())
833 }
834
835 /// Build a `Default` rep from owned keys (callers may later `compact`).
836 #[inline]
837 pub fn from_keys(keys: Vec<Vec<u8>>) -> Self {
838 KeyRep::Default(keys)
839 }
840
841 /// Number of slots.
842 #[inline]
843 pub fn len(&self) -> usize {
844 match self {
845 KeyRep::Default(v) => v.len(),
846 KeyRep::Compact { lengths, .. } => lengths.len(),
847 }
848 }
849
850 #[inline]
851 pub fn is_empty(&self) -> bool {
852 self.len() == 0
853 }
854
855 /// `INKeyRep.get(idx)` / `getKey` — borrow the (post-prefix) key at slot
856 /// `idx` without allocating.
857 #[inline]
858 pub fn get(&self, idx: usize) -> &[u8] {
859 match self {
860 KeyRep::Default(v) => v[idx].as_slice(),
861 KeyRep::Compact { buf, slot_width, lengths } => {
862 let off = idx * slot_width;
863 &buf[off..off + lengths[idx] as usize]
864 }
865 }
866 }
867
868 /// Set the key at slot `idx`. A key longer than a Compact rep's
869 /// `slot_width` inflates the rep to `Default` first
870 /// (`MaxKeySize.expandToDefaultRep`).
871 pub fn set(&mut self, idx: usize, key: Vec<u8>) {
872 match self {
873 KeyRep::Default(v) => v[idx] = key,
874 KeyRep::Compact { slot_width, .. } if key.len() > *slot_width => {
875 self.inflate_to_default();
876 self.set(idx, key);
877 }
878 KeyRep::Compact { buf, slot_width, lengths } => {
879 let off = idx * *slot_width;
880 buf[off..off + key.len()].copy_from_slice(&key);
881 lengths[idx] = key.len() as u16;
882 }
883 }
884 }
885
886 /// Insert a key at slot `idx`, shifting later slots up (mirrors
887 /// `Vec::insert` + `INArrayRep.copy`).
888 pub fn insert(&mut self, idx: usize, key: Vec<u8>) {
889 match self {
890 KeyRep::Default(v) => v.insert(idx, key),
891 KeyRep::Compact { slot_width, .. } if key.len() > *slot_width => {
892 self.inflate_to_default();
893 self.insert(idx, key);
894 }
895 KeyRep::Compact { buf, slot_width, lengths } => {
896 let sw = *slot_width;
897 let at = idx * sw;
898 buf.splice(at..at, std::iter::repeat_n(0u8, sw));
899 buf[at..at + key.len()].copy_from_slice(&key);
900 lengths.insert(idx, key.len() as u16);
901 }
902 }
903 }
904
905 /// Remove the key at slot `idx`, shifting later slots down.
906 pub fn remove(&mut self, idx: usize) -> Vec<u8> {
907 match self {
908 KeyRep::Default(v) => v.remove(idx),
909 KeyRep::Compact { buf, slot_width, lengths } => {
910 let sw = *slot_width;
911 let len = lengths[idx] as usize;
912 let at = idx * sw;
913 let out = buf[at..at + len].to_vec();
914 buf.drain(at..at + sw);
915 lengths.remove(idx);
916 out
917 }
918 }
919 }
920
921 /// `INKeyRep.MaxKeySize.expandToDefaultRep` — mutate a Compact rep to a
922 /// Default rep (one owned `Vec<u8>` per slot).
923 fn inflate_to_default(&mut self) {
924 if let KeyRep::Compact { .. } = self {
925 let keys: Vec<Vec<u8>> =
926 (0..self.len()).map(|i| self.get(i).to_vec()).collect();
927 *self = KeyRep::Default(keys);
928 }
929 }
930
931 /// `INKeyRep.Default.compact(parent)` (INKeyRep.java) — if every key in a
932 /// `Default` rep fits `compact_max_key_length`, pack them into a
933 /// `MaxKeySize` (`Compact`) rep. `compact_max_key_length <= 0` disables
934 /// compaction. No-op when already Compact.
935 pub fn compact(&mut self, compact_max_key_length: i32) {
936 if compact_max_key_length <= 0 {
937 return;
938 }
939 let KeyRep::Default(keys) = self else {
940 return; // already Compact
941 };
942 if keys.is_empty() {
943 return;
944 }
945 let max_len = keys.iter().map(|k| k.len()).max().unwrap_or(0);
946 if max_len > compact_max_key_length as usize {
947 return; // a key exceeds the threshold — stay Default
948 }
949 let slot_width = max_len.max(1);
950 let mut buf = vec![0u8; slot_width * keys.len()];
951 let mut lengths = Vec::with_capacity(keys.len());
952 for (i, k) in keys.iter().enumerate() {
953 let off = i * slot_width;
954 buf[off..off + k.len()].copy_from_slice(k);
955 lengths.push(k.len() as u16);
956 }
957 *self = KeyRep::Compact { buf, slot_width, lengths };
958 }
959
960 /// True when key-byte memory is accounted for inside this rep (Compact),
961 /// vs per-slot `Vec` allocations (Default).
962 /// `INKeyRep.accountsForKeyByteMemUsage`.
963 #[inline]
964 pub fn is_compact(&self) -> bool {
965 matches!(self, KeyRep::Compact { .. })
966 }
967
968 /// Heap bytes of the rep itself (`INKeyRep.calculateMemorySize` +
969 /// key-byte accounting). For Default this is the `Vec<Vec<u8>>` header
970 /// plus each key's heap allocation; for Compact it is the single buffer
971 /// plus the lengths vector.
972 pub fn memory_size(&self) -> usize {
973 use std::mem::size_of;
974 match self {
975 KeyRep::Default(v) => {
976 v.capacity() * size_of::<Vec<u8>>()
977 + v.iter().map(|k| k.capacity()).sum::<usize>()
978 }
979 KeyRep::Compact { buf, lengths, .. } => {
980 buf.capacity() + lengths.capacity() * size_of::<u16>()
981 }
982 }
983 }
984}
985
986impl Default for KeyRep {
987 fn default() -> Self {
988 KeyRep::new()
989 }
990}
991
992/// Lightweight upper-IN representation used by the tree traversal layer.
993///
994/// `IN`: carries the dirty flag (IN_DIRTY_BIT), the LRU
995/// generation counter, and a weak back-pointer to the parent so that
996/// dirty state can be propagated upward.
997#[derive(Debug)]
998pub struct InNodeStub {
999 /// Node ID.
1000 pub node_id: u64,
1001 /// Level in tree.
1002 pub level: i32,
1003 /// Child entries (key, lsn).
1004 pub entries: Vec<InEntry>,
1005 /// T-4: per-node resident-child-pointer representation.
1006 ///
1007 /// `IN.entryTargets` (`INTargetRep`). The cached child pointer is no
1008 /// longer a per-`InEntry` `Option<Arc>` (which cost a pointer-sized slot
1009 /// even when no child was resident); it lives here as a compact
1010 /// node-level rep that starts `None` (0 child-pointer bytes — most upper
1011 /// INs have no resident children), grows to `Sparse` for a few cached
1012 /// children, and inflates to `Default` (the full parallel array) once
1013 /// many children are resident. See `INTargetRep.{None,Sparse,Default}`.
1014 pub targets: TargetRep,
1015 /// Dirty flag — set whenever this node is modified.
1016 /// `IN.dirty` (IN_DIRTY_BIT).
1017 pub dirty: bool,
1018 /// LRU generation counter for the evictor.
1019 /// `IN.generation`.
1020 pub generation: u64,
1021 /// Weak back-pointer to parent IN.
1022 /// Enables dirty-propagation and latch-coupling validation.
1023 /// `IN.parent` reference used during splits and logging.
1024 pub parent: Option<Weak<RwLock<TreeNode>>>,
1025 /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`). The per-slot
1026 /// `lsn` (8 bytes) that used to live in `InEntry` is hoisted here as a
1027 /// `base_file_number`-relative 4-byte-per-slot rep, falling back to a
1028 /// `u64`-per-slot `Long` rep only when a node's LSN range exceeds the
1029 /// compact form. Access via `get_lsn(slot)` / `set_lsn(slot, lsn)`.
1030 pub lsn_rep: LsnRep,
1031}
1032
/// One slot of an IN node; only the key is stored per slot.
///
/// T-4: the resident-child pointer (`Option<Arc>`) formerly kept here now
/// lives in the node-level `InNodeStub.targets` (`INTargetRep`); reach the
/// child of slot `i` through `InNodeStub::get_child(i)` / `set_child` / etc.
///
/// T-3: the 8-byte per-slot `lsn` formerly kept here now lives in the
/// node-level `InNodeStub.lsn_rep` (`IN.entryLsnByteArray`); reach the LSN of
/// slot `i` through `InNodeStub::get_lsn(i)` / `set_lsn(i, lsn)`.
#[derive(Debug, Clone)]
pub struct InEntry {
    /// Key of this slot.
    pub key: Vec<u8>,
}
1047
1048/// Lightweight BIN representation used by the tree traversal layer.
1049///
1050/// `BIN` (which extends `IN`): carries the dirty flag, LRU
1051/// generation counter, and a weak back-pointer to the parent IN.
1052///
1053/// # Key Prefix Compression
1054///
1055/// BINs support key prefix compression. When
1056/// `key_prefix` is non-empty the `key` field of every `BinEntry` stores only
1057/// the *suffix* — the bytes after stripping the common leading bytes. The
1058/// full key is reconstructed by prepending `key_prefix` to the stored suffix.
1059///
1060/// This is transparent to callers through the `get_full_key` / `find_entry`
1061/// helpers on `BinStub`. The prefix is recomputed after every insert and
1062/// after a split via `recompute_key_prefix`.
1063#[derive(Debug)]
1064pub struct BinStub {
1065 /// Node ID.
1066 pub node_id: u64,
1067 /// Level (always BIN_LEVEL).
1068 pub level: i32,
1069 /// Entries. When `key_prefix` is non-empty the `key` field in each entry
1070 /// is the *suffix* of the full key (leading `key_prefix` bytes stripped).
1071 /// `IN.entryKeys` (suffix-only storage when prefixing is on).
1072 pub entries: Vec<BinEntry>,
1073 /// Common prefix shared by every key in this BIN.
1074 /// Empty slice means no prefix compression is active.
1075 /// `IN.keyPrefix`.
1076 pub key_prefix: Vec<u8>,
1077 /// Dirty flag — set whenever this BIN is modified.
1078 /// `IN.dirty` (IN_DIRTY_BIT).
1079 pub dirty: bool,
1080 /// BIN-delta flag — true when this BIN contains only dirty (delta) slots
1081 /// rather than a complete set of entries.
1082 /// `IN.IN_DELTA_BIT` (the IN_DELTA_BIT flag inside `flags`).
1083 pub is_delta: bool,
1084 /// LSN at which this BIN was last logged as a full (non-delta) BIN.
1085 ///
1086 /// Used by the checkpoint path to construct `BINDeltaLogEntry.prev_full_lsn`
1087 /// and to compare against `prev_delta_lsn` when deciding whether to write
1088 /// a delta or a full BIN.
1089 ///
1090 /// `BIN.lastFullLsn`.
1091 pub last_full_lsn: Lsn,
1092 /// LSN at which this BIN was last logged as a BIN-delta.
1093 ///
1094 /// Written as `prev_delta_lsn` into the next `BINDeltaLogEntry` so the
1095 /// cleaner's utilization tracker can mark the superseded delta obsolete.
1096 /// Reset to `NULL_LSN` whenever a full BIN is written.
1097 ///
1098 /// `BIN.lastDeltaVersion` / `BIN.getLastDeltaLsn()`.
1099 pub last_delta_lsn: Lsn,
1100 /// LRU generation counter for the evictor.
1101 /// `IN.generation`.
1102 pub generation: u64,
1103 /// Weak back-pointer to parent IN.
1104 /// Enables dirty-propagation and latch-coupling validation.
1105 pub parent: Option<Weak<RwLock<TreeNode>>>,
1106 /// If true, `BinEntry.expiration_time` values in this BIN are packed hours
1107 /// since epoch; if false, they are packed seconds since epoch.
1108 ///
1109 /// Default: `true` (hours, matching TTL resolution).
1110 ///
1111 /// `BIN.expirationInHours`.
1112 pub expiration_in_hours: bool,
1113 /// Number of cursors currently positioned on this BIN.
1114 ///
1115 /// The evictor skips BINs with a non-zero cursor count to avoid evicting
1116 /// a node that a cursor is actively traversing. CursorImpl increments
1117 /// this when positioning on a BIN and decrements it on reposition/close.
1118 ///
1119 /// `IN.cursorSet.size()` used by `Evictor.selectIN()`.
1120 pub cursor_count: i32,
1121 /// When true, the NEXT log of this BIN must be a full BIN, not a delta.
1122 ///
1123 /// Set after a dirty slot is removed (a delta would silently lose that
1124 /// removal) and cleared after a full BIN is written. This is the
1125 /// delta-chain bound: it forces a periodic full BIN so a delta never
1126 /// references stale state.
1127 ///
1128 /// `IN.prohibitNextDelta` / `IN.setProhibitNextDelta` (IN.java:5013) /
1129 /// `IN.getProhibitNextDelta`.
1130 pub prohibit_next_delta: bool,
1131 /// T-3: per-node packed LSN array (`IN.entryLsnByteArray`). The per-slot
1132 /// `lsn` (8 bytes) that used to live in `BinEntry` is hoisted here as a
1133 /// `base_file_number`-relative 4-byte-per-slot rep. Access via
1134 /// `get_lsn(slot)` / `set_lsn(slot, lsn)`.
1135 pub lsn_rep: LsnRep,
1136 /// T-2: per-node key array (`INKeyRep.{Default,MaxKeySize}`). The per-slot
1137 /// `key` (`Vec<u8>`, 24-byte header + heap alloc) that used to live in
1138 /// `BinEntry` is hoisted here. Stores the post-prefix SUFFIX (key
1139 /// prefixing strips the common prefix first). Packs into one fixed-width
1140 /// buffer (`Compact`) when every suffix is `<= compact_max_key_length`,
1141 /// else one `Vec<u8>` per slot (`Default`). `keys.len()` is kept in lock
1142 /// step with `entries.len()`. Access via `get_key(slot)` /
1143 /// `get_full_key(slot)`.
1144 pub keys: KeyRep,
1145 /// T-5: the node's compact-key threshold (`IN.getCompactMaxKeyLength`),
1146 /// copied from the owning `Tree` at construction so `apply_new_prefix` can
1147 /// decide whether the suffixes now fit `MaxKeySize`. Default 16.
1148 pub compact_max_key_length: i32,
1149}
1150
/// Per-slot state of a BIN node (the slot key and LSN are stored elsewhere).
///
/// T-3: the 8-byte per-slot `lsn` formerly kept here now lives in the
/// node-level `BinStub.lsn_rep` (`IN.entryLsnByteArray`); reach the LSN of
/// slot `i` through `BinStub::get_lsn(i)` / `set_lsn(i, lsn)`.
#[derive(Debug, Clone)]
pub struct BinEntry {
    /// Optional embedded data (for small records) or cached LN.
    pub data: Option<Vec<u8>>,
    /// Set once the slot has been marked known-deleted (analogous to the
    /// KNOWN_DELETED_BIT in `IN.entryStates`). Such a slot is eligible for
    /// removal by `compress_bin()`.
    pub known_deleted: bool,
    /// Set when the slot has been modified since the last full BIN log write.
    ///
    /// `IN.entryStates[i] & IN_DIRTY_BIT`. The checkpoint path uses it to
    /// decide between writing a BIN-delta (few dirty slots) and a full BIN
    /// (many dirty slots).
    pub dirty: bool,
    /// Packed expiration time (0 = no expiration).
    ///
    /// Interpreted as hours since the Unix epoch when the owning
    /// `BinStub.expiration_in_hours` is true, otherwise as seconds since the
    /// Unix epoch.
    ///
    /// `IN.entryExpiration`.
    pub expiration_time: u32,
}
1178
1179impl InNodeStub {
1180 /// `IN.getTarget(idx)` — the resident child cached for slot `idx`, cloned
1181 /// (a strong `Arc`), or `None` if the child is not cached. Routes through
1182 /// the node-level `INTargetRep` (T-4).
1183 #[inline]
1184 pub fn get_child(&self, idx: usize) -> Option<ChildArc> {
1185 self.targets.get(idx).cloned()
1186 }
1187
1188 /// Borrow the resident child for slot `idx` without cloning.
1189 #[inline]
1190 pub fn child_ref(&self, idx: usize) -> Option<&ChildArc> {
1191 self.targets.get(idx)
1192 }
1193
1194 /// True if slot `idx` has no resident (cached) child.
1195 /// `IN.getTarget(idx) == null`.
1196 #[inline]
1197 pub fn child_is_none(&self, idx: usize) -> bool {
1198 self.targets.get(idx).is_none()
1199 }
1200
1201 /// `IN.setTarget(idx, node)` — set (or clear) the cached child for slot
1202 /// `idx`, mutating the `INTargetRep` upward as needed.
1203 #[inline]
1204 pub fn set_child(&mut self, idx: usize, node: Option<ChildArc>) {
1205 self.targets.set(idx, node);
1206 }
1207
1208 /// `IN.detachNode` helper — remove and return the cached child for slot
1209 /// `idx`, leaving the slot's key/LSN intact for re-fetch.
1210 #[inline]
1211 pub fn take_child(&mut self, idx: usize) -> Option<ChildArc> {
1212 self.targets.take(idx)
1213 }
1214
1215 /// `IN.getLsn(idx)` (IN.java:1752) — the LSN of slot `idx` via the
1216 /// node-level packed `LsnRep` (T-3).
1217 #[inline]
1218 pub fn get_lsn(&self, idx: usize) -> Lsn {
1219 self.lsn_rep.get(idx)
1220 }
1221
1222 /// `IN.setLsn(idx, lsn)` (IN.java:1773) — set the LSN of slot `idx` via
1223 /// the node-level packed `LsnRep` (T-3).
1224 #[inline]
1225 pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
1226 let n = self.entries.len();
1227 self.lsn_rep.set(idx, lsn, n);
1228 }
1229
1230 /// Insert an entry at `idx`, shifting the child mapping to stay aligned
1231 /// (`INArrayRep.copy`), then set the new slot's cached child. Mirrors the
1232 /// old `entries.insert(idx, InEntry{ child: ..})` in one call.
1233 pub fn insert_entry(
1234 &mut self,
1235 idx: usize,
1236 key: Vec<u8>,
1237 lsn: Lsn,
1238 child: Option<ChildArc>,
1239 ) {
1240 self.entries.insert(idx, InEntry { key });
1241 let n = self.entries.len();
1242 self.lsn_rep.insert_shift(idx, n);
1243 self.lsn_rep.set(idx, lsn, n);
1244 self.targets.insert_shift(idx);
1245 if child.is_some() {
1246 self.targets.set(idx, child);
1247 }
1248 }
1249
1250 /// Remove the entry at `idx`, shifting the child mapping to stay aligned
1251 /// (`INArrayRep.copy`). Returns the removed `InEntry` (key).
1252 pub fn remove_entry(&mut self, idx: usize) -> InEntry {
1253 let e = self.entries.remove(idx);
1254 self.lsn_rep.remove_shift(idx);
1255 self.targets.remove_shift(idx);
1256 e
1257 }
1258
1259 /// All resident children (cloned `Arc`s), in unspecified order.
1260 /// Replaces `entries.iter().filter_map(|e| e.child.clone())`.
1261 pub fn resident_children(&self) -> Vec<ChildArc> {
1262 self.targets.iter_children().collect()
1263 }
1264
1265 /// `(slot_index, child)` of the first resident child, if any.
1266 pub fn first_resident_child(&self) -> Option<(usize, ChildArc)> {
1267 (0..self.entries.len())
1268 .find_map(|i| self.targets.get(i).map(|c| (i, c.clone())))
1269 }
1270}
1271
1272impl BinStub {
1273 /// `IN.getLsn(idx)` (IN.java:1752) — the LSN of slot `idx` via the
1274 /// node-level packed `LsnRep` (T-3).
1275 #[inline]
1276 pub fn get_lsn(&self, idx: usize) -> Lsn {
1277 self.lsn_rep.get(idx)
1278 }
1279
1280 /// `IN.setLsn(idx, lsn)` (IN.java:1773) — set the LSN of slot `idx` via
1281 /// the node-level packed `LsnRep` (T-3).
1282 #[inline]
1283 pub fn set_lsn(&mut self, idx: usize, lsn: Lsn) {
1284 let n = self.entries.len();
1285 self.lsn_rep.set(idx, lsn, n);
1286 }
1287
1288 /// TREE-F1: the single user-facing liveness predicate for a BIN slot.
1289 ///
1290 /// A slot is LIVE for reads/scans iff it is neither `known_deleted` nor
1291 /// TTL-expired. This mirrors the two ways JE makes a slot read as ABSENT:
1292 /// * `IN.findEntry` (IN.java:3197) returns -1 for a `known_deleted`
1293 /// exact match;
1294 /// * `CursorImpl.isProbablyExpired` / `lockAndGetCurrent`
1295 /// (CursorImpl.java:2062-2064) skip `isEntryKnownDeleted` (and
1296 /// expired) slots while stepping.
1297 ///
1298 /// KD slots legitimately exist in live BINs during BIN-delta
1299 /// reconstitution until the compressor reclaims them; the maintenance
1300 /// paths (compressor / recovery undo) iterate them on purpose and do NOT
1301 /// use this predicate.
1302 #[inline]
1303 pub fn slot_is_live(&self, idx: usize) -> bool {
1304 match self.entries.get(idx) {
1305 Some(e) => {
1306 !(e.known_deleted
1307 || (e.expiration_time != 0
1308 && noxu_util::ttl::is_expired(
1309 e.expiration_time,
1310 self.expiration_in_hours,
1311 )))
1312 }
1313 None => false,
1314 }
1315 }
1316
1317 // ========================================================================
1318 // Key prefix compression helpers
1319 // IN.computeKeyPrefix / IN.recalcSuffixes / IN.getKey
1320 // ========================================================================
1321
1322 /// Strips embedded LN data from non-dirty slots, freeing the heap
1323 /// allocations of the per-slot value bytes while keeping the slot keys
1324 /// and LSNs addressable. Used by the evictor's PartialEvict path: a
1325 /// hot BIN is kept in cache so its descent path stays warm, but the LN
1326 /// data is dropped to make room for hotter content. Subsequent reads
1327 /// re-fetch the data from the log via the slot LSN.
1328 ///
1329 /// Skips slots that are still dirty (their data has not been written
1330 /// to the log yet, so dropping the in-memory copy would lose the
1331 /// update). Returns the number of bytes freed (sum of the lengths
1332 /// of the dropped `Vec<u8>` data fields).
1333 ///
1334 /// Returns 0 if the BIN has any open cursors (the cursor may be
1335 /// reading the data right now).
1336 pub fn strip_lns(&mut self) -> usize {
1337 if self.cursor_count > 0 {
1338 return 0;
1339 }
1340 let mut freed = 0usize;
1341 for idx in 0..self.entries.len() {
1342 // JE BIN.evictLNs / LN.isEvictable (LN.java:263 returns true): an
1343 // LN's in-memory value can be stripped whenever it is recoverable
1344 // from the log — i.e. the slot has a valid (logged) LSN — REGARDLESS
1345 // of the dirty bit. The dirty bit governs whether the BIN's
1346 // *structure* needs re-logging at the next checkpoint (BIN-delta vs
1347 // full BIN), NOT whether the LN *value* is durable: a transactional
1348 // commit logs the LN, so the slot's LSN points at the durable copy
1349 // even while the slot is still dirty. Gating the strip on `!dirty`
1350 // (the previous behaviour) meant a freshly-written, not-yet-
1351 // checkpointed record — the common case under a write/recently-read
1352 // workload — could never be stripped, so eviction reclaimed almost
1353 // nothing under pressure (EVICTOR-RECLAIM-1). A slot with a NULL/
1354 // transient LSN (a deferred-write LN never logged) is NOT
1355 // strippable — its only copy is the in-memory value.
1356 if self.get_lsn(idx) == NULL_LSN {
1357 continue;
1358 }
1359 if let Some(data) = self.entries[idx].data.take() {
1360 freed = freed.saturating_add(data.len());
1361 }
1362 }
1363 freed
1364 }
1365
1366 /// Reconstruct the full key for slot `idx` by prepending the BIN's
1367 /// current prefix to the stored suffix.
1368 ///
1369 /// `IN.getKey(int idx)`.
1370 pub fn get_full_key(&self, idx: usize) -> Option<Vec<u8>> {
1371 if idx >= self.keys.len() {
1372 return None;
1373 }
1374 let suffix = self.keys.get(idx); // T-2
1375 if self.key_prefix.is_empty() {
1376 Some(suffix.to_vec())
1377 } else {
1378 let mut full =
1379 Vec::with_capacity(self.key_prefix.len() + suffix.len());
1380 full.extend_from_slice(&self.key_prefix);
1381 full.extend_from_slice(suffix);
1382 Some(full)
1383 }
1384 }
1385
1386 /// Borrow the stored (post-prefix) suffix at slot `idx` (`INKeyRep.get`).
1387 #[inline]
1388 pub fn get_key(&self, idx: usize) -> &[u8] {
1389 self.keys.get(idx)
1390 }
1391
1392 /// T-2: insert a new slot at `idx` keeping the parallel `entries`, `keys`,
1393 /// and `lsn_rep` arrays in lock step. `suffix` is the post-prefix key.
1394 fn insert_slot(
1395 &mut self,
1396 idx: usize,
1397 suffix: Vec<u8>,
1398 lsn: Lsn,
1399 data: Option<Vec<u8>>,
1400 ) {
1401 self.entries.insert(
1402 idx,
1403 BinEntry {
1404 data,
1405 known_deleted: false,
1406 dirty: true,
1407 expiration_time: 0,
1408 },
1409 );
1410 self.keys.insert(idx, suffix); // T-2
1411 let n = self.entries.len();
1412 self.lsn_rep.insert_shift(idx, n); // T-3
1413 self.lsn_rep.set(idx, lsn, n);
1414 }
1415
1416 /// Decompress a stored suffix back to a full key.
1417 ///
1418 /// `IN.getKey` used from outside: prepend `key_prefix` to
1419 /// `suffix`. If `key_prefix` is empty the suffix *is* the full key.
1420 pub fn decompress_key(&self, suffix: &[u8]) -> Vec<u8> {
1421 if self.key_prefix.is_empty() {
1422 suffix.to_vec()
1423 } else {
1424 let mut full =
1425 Vec::with_capacity(self.key_prefix.len() + suffix.len());
1426 full.extend_from_slice(&self.key_prefix);
1427 full.extend_from_slice(suffix);
1428 full
1429 }
1430 }
1431
1432 /// Strip the current prefix from a full key to obtain the stored suffix.
1433 ///
1434 /// `IN.computeKeySuffix(byte[] prefix, byte[] key)`.
1435 ///
1436 /// # Panics
1437 /// Panics (debug only) if `full_key` does not start with `key_prefix`.
1438 pub fn compress_key(&self, full_key: &[u8]) -> Vec<u8> {
1439 let plen = self.key_prefix.len();
1440 if plen == 0 {
1441 full_key.to_vec()
1442 } else {
1443 debug_assert!(
1444 full_key.starts_with(&self.key_prefix),
1445 "compress_key: key does not start with current prefix"
1446 );
1447 full_key[plen..].to_vec()
1448 }
1449 }
1450
1451 /// Compute the longest common prefix of all full keys currently in this
1452 /// BIN, optionally excluding the entry at `exclude_idx` (used during
1453 /// insertions to ignore the slot that is about to be replaced).
1454 ///
1455 /// Returns an empty `Vec` if the BIN has fewer than 2 entries or if the
1456 /// keys share no common leading bytes.
1457 ///
1458 /// `IN.computeKeyPrefix(int excludeIdx)`.
1459 pub fn compute_key_prefix(&self, exclude_idx: Option<usize>) -> Vec<u8> {
1460 // Need at least 2 entries to find a common prefix.
1461 let n = self.keys.len();
1462 if n < 2 {
1463 return Vec::new();
1464 }
1465
1466 // Pick the first non-excluded index as the seed.
1467 let first_idx = match exclude_idx {
1468 Some(0) => 1,
1469 _ => 0,
1470 };
1471
1472 // The current prefix_len is taken from the seed full key.
1473 let seed_full = match self.get_full_key(first_idx) {
1474 Some(k) => k,
1475 None => return Vec::new(),
1476 };
1477 let mut prefix_len = seed_full.len();
1478
1479 // Compare every other non-excluded entry against the running prefix.
1480 // Iterate all entries (byteOrdered disabled in too).
1481 for i in (first_idx + 1)..n {
1482 if let Some(ex) = exclude_idx
1483 && i == ex
1484 {
1485 continue;
1486 }
1487 let full_key = match self.get_full_key(i) {
1488 Some(k) => k,
1489 None => continue,
1490 };
1491 let new_len =
1492 get_key_prefix_length(&seed_full[..prefix_len], &full_key);
1493 if new_len < prefix_len {
1494 prefix_len = new_len;
1495 }
1496 if prefix_len == 0 {
1497 return Vec::new();
1498 }
1499 }
1500
1501 seed_full[..prefix_len].to_vec()
1502 }
1503
1504 /// Recompute the key prefix from scratch and re-encode every stored suffix.
1505 ///
1506 /// Call this after bulk inserts, splits, or merges.
1507 ///
1508 /// `IN.recalcKeyPrefix()` → `IN.recalcSuffixes(newPrefix, …)`.
1509 pub fn recompute_key_prefix(&mut self) {
1510 let new_prefix = self.compute_key_prefix(None);
1511 self.apply_new_prefix(new_prefix);
1512 }
1513
1514 /// Apply `new_prefix` as the BIN's key prefix, re-encoding all stored
1515 /// suffixes from the old prefix into the new one.
1516 ///
1517 /// This is the Rust.
1518 fn apply_new_prefix(&mut self, new_prefix: Vec<u8>) {
1519 // Reconstruct all full keys (using old prefix), then re-encode with
1520 // the new prefix.
1521 let full_keys: Vec<Vec<u8>> = (0..self.keys.len())
1522 .map(|i| self.get_full_key(i).unwrap_or_default())
1523 .collect();
1524
1525 self.key_prefix = new_prefix;
1526
1527 // T-2: re-encode every suffix into the key rep, then re-attempt
1528 // compaction (a smaller prefix may make all suffixes fit MaxKeySize).
1529 for (i, full_key) in full_keys.into_iter().enumerate() {
1530 let suffix = self.compress_key(&full_key);
1531 self.keys.set(i, suffix);
1532 }
1533 self.keys.compact(self.compact_max_key_length);
1534 }
1535
1536 /// Binary-search this BIN for `full_key` (a full, uncompressed key).
1537 ///
1538 /// The stored suffixes are compared after stripping the current prefix
1539 /// from `full_key`, so the search is done entirely in suffix-space — no
1540 /// heap allocation needed in the happy path.
1541 ///
1542 /// Returns `(idx, exact)` where:
1543 /// - `idx` is the slot index (or insertion point when `exact == false`).
1544 /// - `exact` is `true` when an exact match was found.
1545 ///
1546 /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
1547 pub fn find_entry_compressed(&self, full_key: &[u8]) -> (usize, bool) {
1548 let plen = self.key_prefix.len();
1549 // Check that the key shares the current prefix; if not it cannot be
1550 // present and we return the appropriate insertion point.
1551 if plen > 0
1552 && (full_key.len() < plen
1553 || &full_key[..plen] != self.key_prefix.as_slice())
1554 {
1555 // The key does not share the current prefix.
1556 // Determine insertion point using full-key comparison.
1557 let pos = self.key_partition_point(|s| {
1558 self.decompress_key(s).as_slice() < full_key
1559 });
1560 return (pos, false);
1561 }
1562 let suffix = &full_key[plen..];
1563 // T-2: binary search over the node-level key rep (suffix space).
1564 match self.key_binary_search(suffix) {
1565 Ok(idx) => (idx, true),
1566 Err(idx) => (idx, false),
1567 }
1568 }
1569
1570 /// Binary search the key rep for `suffix` (suffix space, unsigned bytes).
1571 /// Mirrors `Vec::binary_search_by(|e| e.key.cmp(suffix))` over the
1572 /// node-level `KeyRep` (T-2).
1573 #[inline]
1574 fn key_binary_search(&self, suffix: &[u8]) -> Result<usize, usize> {
1575 let mut lo = 0usize;
1576 let mut hi = self.keys.len();
1577 while lo < hi {
1578 let mid = lo + (hi - lo) / 2;
1579 match self.keys.get(mid).cmp(suffix) {
1580 std::cmp::Ordering::Less => lo = mid + 1,
1581 std::cmp::Ordering::Greater => hi = mid,
1582 std::cmp::Ordering::Equal => return Ok(mid),
1583 }
1584 }
1585 Err(lo)
1586 }
1587
1588 /// `slice::partition_point` over the node-level key rep suffixes (T-2):
1589 /// the index of the first slot for which `pred(suffix)` is false.
1590 #[inline]
1591 fn key_partition_point(
1592 &self,
1593 mut pred: impl FnMut(&[u8]) -> bool,
1594 ) -> usize {
1595 let mut lo = 0usize;
1596 let mut hi = self.keys.len();
1597 while lo < hi {
1598 let mid = lo + (hi - lo) / 2;
1599 if pred(self.keys.get(mid)) {
1600 lo = mid + 1;
1601 } else {
1602 hi = mid;
1603 }
1604 }
1605 lo
1606 }
1607
1608 /// Insert or update a full (uncompressed) key in this BIN.
1609 ///
1610 /// After insertion the key prefix is recomputed; if the prefix changes all
1611 /// stored suffixes are re-encoded.
1612 ///
1613 /// Returns `(slot_index, is_new_insert)`.
1614 ///
1615 /// `IN.setKey` / BIN insert path.
1616 pub fn insert_with_prefix(
1617 &mut self,
1618 full_key: Vec<u8>,
1619 lsn: Lsn,
1620 data: Option<Vec<u8>>,
1621 ) -> (usize, bool) {
1622 // Is the current prefix still compatible with this key?
1623 let plen = self.key_prefix.len();
1624 let new_len = if plen > 0 {
1625 get_key_prefix_length(&self.key_prefix, &full_key)
1626 } else {
1627 0
1628 };
1629
1630 // If the new key shrinks the prefix we must re-encode everything first.
1631 if plen > 0 && new_len < plen {
1632 // Compute new prefix considering the incoming key and
1633 // all existing full keys. We pass `None` for exclude_idx because
1634 // the slot for this key does not yet exist.
1635 let mut candidate = self.compute_key_prefix(None);
1636 // Also constrain by the new key itself.
1637 if !candidate.is_empty() {
1638 let cl = get_key_prefix_length(&candidate, &full_key);
1639 candidate.truncate(cl);
1640 } else {
1641 // No existing prefix; try to build one from the new key
1642 // against the existing full keys.
1643 if !self.entries.is_empty()
1644 && let Some(first_full) = self.get_full_key(0)
1645 {
1646 candidate = create_key_prefix(&first_full, &full_key)
1647 .unwrap_or_default();
1648 for i in 1..self.entries.len() {
1649 if candidate.is_empty() {
1650 break;
1651 }
1652 if let Some(fk) = self.get_full_key(i) {
1653 let l = get_key_prefix_length(&candidate, &fk);
1654 candidate.truncate(l);
1655 }
1656 }
1657 }
1658 }
1659 self.apply_new_prefix(candidate);
1660 }
1661
1662 // Compress the new key under the (possibly updated) prefix.
1663 let suffix = self.compress_key(&full_key);
1664
1665 match self.key_binary_search(&suffix) {
1666 Ok(idx) => {
1667 // Key exists — update in place.
1668 self.set_lsn(idx, lsn); // T-3
1669 self.entries[idx].data = data;
1670 // Mark slot dirty: this slot changed since the last full BIN log.
1671 // `IN.setDirtyEntry(idx)`.
1672 self.entries[idx].dirty = true;
1673 (idx, false)
1674 }
1675 Err(idx) => {
1676 // New key — insert in sorted position.
1677 // New slots start dirty: they have never been logged in any BIN.
1678 // `IN.setDirtyEntry(idx)` called after `insertEntry`.
1679 self.insert_slot(idx, suffix, lsn, data);
1680 // After insertion, if there is no prefix yet, try to establish one.
1681 if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1682 self.recompute_key_prefix();
1683 }
1684 (idx, true)
1685 }
1686 }
1687 }
1688
1689 /// Slice-based variant of [`BinStub::insert_with_prefix`] for the recovery redo path.
1690 ///
1691 /// Accepts `key` and `data` as `&[u8]` slices instead of owned `Vec<u8>`,
1692 /// eliminating the intermediate `Vec<u8>` that `redo_ln` would otherwise
1693 /// allocate before crossing the BIN boundary. The compressed suffix and
1694 /// the data bytes are each copied into the `BinEntry` exactly once.
1695 ///
1696 /// Semantics are identical to `insert_with_prefix`:
1697 /// - Updates the slot in place when the key already exists.
1698 /// - Inserts a new sorted entry when absent, recomputing the key prefix.
1699 ///
1700 /// Wave 11-K optimisation (Fix 1).
1701 pub fn insert_with_prefix_slice(
1702 &mut self,
1703 full_key: &[u8],
1704 lsn: Lsn,
1705 data: Option<&[u8]>,
1706 ) -> (usize, bool) {
1707 let plen = self.key_prefix.len();
1708 let new_len = if plen > 0 {
1709 get_key_prefix_length(&self.key_prefix, full_key)
1710 } else {
1711 0
1712 };
1713
1714 if plen > 0 && new_len < plen {
1715 let mut candidate = self.compute_key_prefix(None);
1716 if !candidate.is_empty() {
1717 let cl = get_key_prefix_length(&candidate, full_key);
1718 candidate.truncate(cl);
1719 } else {
1720 if !self.entries.is_empty()
1721 && let Some(first_full) = self.get_full_key(0)
1722 {
1723 candidate = create_key_prefix(&first_full, full_key)
1724 .unwrap_or_default();
1725 for i in 1..self.entries.len() {
1726 if candidate.is_empty() {
1727 break;
1728 }
1729 if let Some(fk) = self.get_full_key(i) {
1730 let l = get_key_prefix_length(&candidate, &fk);
1731 candidate.truncate(l);
1732 }
1733 }
1734 }
1735 }
1736 self.apply_new_prefix(candidate);
1737 }
1738
1739 let suffix = self.compress_key(full_key);
1740
1741 match self.key_binary_search(&suffix) {
1742 Ok(idx) => {
1743 self.set_lsn(idx, lsn); // T-3
1744 self.entries[idx].data = data.map(|d| d.to_vec());
1745 self.entries[idx].dirty = true;
1746 (idx, false)
1747 }
1748 Err(idx) => {
1749 self.insert_slot(idx, suffix, lsn, data.map(|d| d.to_vec()));
1750 if self.key_prefix.is_empty() && self.entries.len() >= 2 {
1751 self.recompute_key_prefix();
1752 }
1753 (idx, true)
1754 }
1755 }
1756 }
1757
1758 /// Returns the number of slots that are marked dirty.
1759 ///
1760 /// `BIN.getNumDirtyEntries()`.
1761 pub fn dirty_count(&self) -> usize {
1762 self.entries.iter().filter(|e| e.dirty).count()
1763 }
1764
1765 /// Decide whether to log this BIN as a delta (true) or a full BIN (false).
1766 ///
1767 /// Faithful port of JE `BIN.shouldLogDelta()` (BIN.java:1892). The
1768 /// decision is COUNT-based (number of would-be delta slots vs a percent of
1769 /// `nEntries`), NOT a dirty-fraction-vs-hardcoded-0.25 heuristic:
1770 ///
1771 /// ```text
1772 /// if (isBINDelta()) { return true; } // already a delta
1773 /// if (isDeltaProhibited()) return false; // prohibit / no prior full
1774 /// numDeltas = getNDeltas();
1775 /// if (numDeltas <= 0) return false; // empty delta is invalid
1776 /// deltaLimit = (getNEntries() * binDeltaPercent) / 100; // INTEGER math
1777 /// return numDeltas <= deltaLimit;
1778 /// ```
1779 ///
1780 /// `numDeltas` (JE `getNDeltas`) is the count of slots that would appear in
1781 /// the delta — i.e. the dirty slots since the last full BIN — which here is
1782 /// `dirty_count()`. `binDeltaPercent` is the CONFIGURABLE `TREE_BIN_DELTA`
1783 /// param (JE `DatabaseImpl.getBinDeltaPercent()`, default 25), threaded in
1784 /// by the checkpointer — NOT a hardcoded constant.
1785 ///
1786 /// `isDeltaProhibited()` (BIN.java:1867) is
1787 /// `getProhibitNextDelta() || isDeferredWriteMode() || lastFullLsn == NULL`.
1788 /// Deferred-write mode is not modelled in the runtime stub; the other two
1789 /// terms are.
1790 ///
1791 /// JE ref: `BIN.shouldLogDelta` (BIN.java:1892), `BIN.isDeltaProhibited`
1792 /// (BIN.java:1867).
1793 pub fn should_log_delta(&self, bin_delta_percent: i32) -> bool {
1794 // Already a delta: re-log as a delta. JE asserts !prohibitNextDelta
1795 // and lastFullLsn != NULL here.
1796 if self.is_delta {
1797 return self.last_full_lsn != NULL_LSN && !self.prohibit_next_delta;
1798 }
1799
1800 // isDeltaProhibited(): cheapest checks first.
1801 if self.prohibit_next_delta || self.last_full_lsn == NULL_LSN {
1802 return false;
1803 }
1804
1805 // numDeltas = getNDeltas(): the dirty slots that would be in the delta.
1806 let num_deltas = self.dirty_count() as i32;
1807
1808 // A delta with zero items is not valid.
1809 if num_deltas <= 0 {
1810 return false;
1811 }
1812
1813 // Configured BinDeltaPercent limit — INTEGER math, exactly as JE.
1814 let delta_limit = (self.entries.len() as i32 * bin_delta_percent) / 100;
1815 num_deltas <= delta_limit
1816 }
1817
1818 /// Comparator-aware binary search: finds `full_key` using `cmp`.
1819 ///
1820 /// Unlike `find_entry_compressed` (which uses suffix-based lexicographic
1821 /// comparison), this decompresses each entry's key to its full form and
1822 /// applies the provided comparator — required for sorted-dup databases
1823 /// where lexicographic suffix comparison would give wrong results when
1824 /// different-length primary keys are in the same BIN.
1825 ///
1826 /// Returns `(idx, exact)`. Does NOT do prefix compression.
1827 ///
1828 /// `IN.findEntry` with btreeComparator active.
1829 pub fn find_entry_cmp(
1830 &self,
1831 full_key: &[u8],
1832 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1833 ) -> (usize, bool) {
1834 // Hot path: avoid per-comparison Vec<u8> allocation.
1835 // When key_prefix is empty the stored suffix IS the full key, so we
1836 // pass the suffix slice directly. When prefix is non-empty we build a
1837 // temporary concatenation only once per comparison using a small
1838 // stack-local Vec that is dropped immediately after the call — this
1839 // still allocates but is limited to O(key_len) bytes per call and
1840 // avoids retaining any heap state between comparisons.
1841 if self.key_prefix.is_empty() {
1842 match self.key_binary_search_by(|s| cmp(s, full_key)) {
1843 Ok(idx) => (idx, true),
1844 Err(idx) => (idx, false),
1845 }
1846 } else {
1847 let prefix = self.key_prefix.as_slice();
1848 match self.key_binary_search_by(|s| {
1849 let mut fk = Vec::with_capacity(prefix.len() + s.len());
1850 fk.extend_from_slice(prefix);
1851 fk.extend_from_slice(s);
1852 cmp(&fk, full_key)
1853 }) {
1854 Ok(idx) => (idx, true),
1855 Err(idx) => (idx, false),
1856 }
1857 }
1858 }
1859
1860 /// Comparator-driven binary search over the node-level key rep (T-2).
1861 /// `cmp(stored_suffix)` returns how the stored slot compares to the
1862 /// search key.
1863 #[inline]
1864 fn key_binary_search_by(
1865 &self,
1866 mut cmp: impl FnMut(&[u8]) -> std::cmp::Ordering,
1867 ) -> Result<usize, usize> {
1868 let mut lo = 0usize;
1869 let mut hi = self.keys.len();
1870 while lo < hi {
1871 let mid = lo + (hi - lo) / 2;
1872 match cmp(self.keys.get(mid)) {
1873 std::cmp::Ordering::Less => lo = mid + 1,
1874 std::cmp::Ordering::Greater => hi = mid,
1875 std::cmp::Ordering::Equal => return Ok(mid),
1876 }
1877 }
1878 Err(lo)
1879 }
1880
1881 /// Returns the LSN of the slot matching `full_key`, if one exists.
1882 ///
1883 /// Used by the recovery LN-redo apply to enforce JE's currency check
1884 /// (`RecoveryManager.redo()` line ~2512): a logged LN is applied only
1885 /// when `logrecLsn > treeLsn`. Returns `None` when the key is absent
1886 /// (always apply). Uses the same lookup variant the matching insert
1887 /// path uses so the comparison is over the right slot.
1888 pub fn redo_slot_lsn(
1889 &self,
1890 full_key: &[u8],
1891 cmp: Option<&dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering>,
1892 key_prefixing: bool,
1893 ) -> Option<Lsn> {
1894 let (idx, found) = match cmp {
1895 Some(c) => self.find_entry_cmp(full_key, c),
1896 None if key_prefixing => self.find_entry_compressed(full_key),
1897 None => {
1898 // insert_raw path: full keys stored verbatim.
1899 match self.key_binary_search(full_key) {
1900 Ok(idx) => (idx, true),
1901 Err(idx) => (idx, false),
1902 }
1903 }
1904 };
1905 if found { Some(self.get_lsn(idx)) } else { None }
1906 }
1907
1908 /// Raw insert (no prefix compression) for databases with
1909 /// `key_prefixing = false`.
1910 ///
1911 /// JE `IN.computeKeyPrefix` returns `null` when
1912 /// `databaseImpl.getKeyPrefixing()` is `false`, so no prefix is ever
1913 /// set on those BINs. Noxu was previously ignoring the flag and always
1914 /// calling `insert_with_prefix`; this method provides the faithful path.
1915 ///
1916 /// The key is stored verbatim (no suffix stripping). An existing
1917 /// `key_prefix` on the BIN is left untouched; callers must ensure it is
1918 /// empty (split_child already guarantees this for new BINs when
1919 /// `key_prefixing = false`).
1920 ///
1921 /// Returns `(slot_index, is_new_insert)`.
1922 ///
1923 /// Ref: `IN.java computeKeyPrefix` ~line 2456,
1924 /// `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
1925 pub fn insert_raw(
1926 &mut self,
1927 full_key: Vec<u8>,
1928 lsn: Lsn,
1929 data: Option<Vec<u8>>,
1930 ) -> (usize, bool) {
1931 // Binary search on the stored (full) keys.
1932 // When key_prefix is empty entries store full keys directly; for
1933 // key_prefixing=false DBs the prefix is always empty.
1934 match self.key_binary_search(full_key.as_slice()) {
1935 Ok(idx) => {
1936 self.set_lsn(idx, lsn); // T-3
1937 self.entries[idx].data = data;
1938 self.entries[idx].dirty = true;
1939 (idx, false)
1940 }
1941 Err(idx) => {
1942 self.insert_slot(idx, full_key, lsn, data);
1943 (idx, true)
1944 }
1945 }
1946 }
1947
1948 /// Comparator-aware insert: inserts `full_key` into the BIN using `cmp`.
1949 ///
1950 /// Prefix compression is DISABLED: the key is stored as-is. This is
1951 /// intentional for sorted-dup databases where the custom comparator
1952 /// requires full-key access at every comparison.
1953 ///
1954 /// Returns `(slot_index, is_new_insert)`.
1955 ///
1956 pub fn insert_cmp(
1957 &mut self,
1958 full_key: Vec<u8>,
1959 lsn: Lsn,
1960 data: Option<Vec<u8>>,
1961 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1962 ) -> (usize, bool) {
1963 if self.key_prefix.is_empty() {
1964 match self.key_binary_search_by(|s| cmp(s, &full_key)) {
1965 Ok(idx) => {
1966 self.set_lsn(idx, lsn); // T-3
1967 self.entries[idx].data = data;
1968 self.entries[idx].dirty = true;
1969 (idx, false)
1970 }
1971 Err(idx) => {
1972 self.insert_slot(idx, full_key, lsn, data);
1973 (idx, true)
1974 }
1975 }
1976 } else {
1977 let prefix = self.key_prefix.clone();
1978 match self.key_binary_search_by(|s| {
1979 let mut fk = Vec::with_capacity(prefix.len() + s.len());
1980 fk.extend_from_slice(&prefix);
1981 fk.extend_from_slice(s);
1982 cmp(&fk, &full_key)
1983 }) {
1984 Ok(idx) => {
1985 // Key exists — update in place.
1986 self.set_lsn(idx, lsn); // T-3
1987 self.entries[idx].data = data;
1988 self.entries[idx].dirty = true;
1989 (idx, false)
1990 }
1991 Err(idx) => {
1992 // New key — insert at sorted position (no prefix compression).
1993 self.insert_slot(idx, full_key, lsn, data);
1994 (idx, true)
1995 }
1996 }
1997 }
1998 }
1999
2000 /// Comparator-aware delete: removes `full_key` from the BIN using `cmp`.
2001 ///
2002 /// Returns `true` if the entry was found and removed.
2003 pub fn delete_cmp(
2004 &mut self,
2005 full_key: &[u8],
2006 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
2007 ) -> bool {
2008 let result = if self.key_prefix.is_empty() {
2009 self.key_binary_search_by(|s| cmp(s, full_key))
2010 } else {
2011 let prefix = self.key_prefix.clone();
2012 self.key_binary_search_by(|s| {
2013 let mut fk = Vec::with_capacity(prefix.len() + s.len());
2014 fk.extend_from_slice(&prefix);
2015 fk.extend_from_slice(s);
2016 cmp(&fk, full_key)
2017 })
2018 };
2019 match result {
2020 Ok(idx) => {
2021 self.entries.remove(idx);
2022 self.keys.remove(idx); // T-2
2023 self.lsn_rep.remove_shift(idx); // T-3
2024 self.dirty = true;
2025 true
2026 }
2027 Err(_) => false,
2028 }
2029 }
2030
2031 /// Serialise ALL entries (full BIN write).
2032 ///
2033 /// Format (per slot): key_len(u32BE) | key | lsn(u64BE) |
2034 /// has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2035 ///
2036 /// Prepended by: node_id(u64BE) | num_entries(u32BE).
2037 ///
2038 /// `BIN.writeToLog()` (non-delta path).
2039 pub fn serialize_full(&self) -> Vec<u8> {
2040 let mut buf = Vec::new();
2041 buf.extend_from_slice(&self.node_id.to_be_bytes());
2042 buf.extend_from_slice(&(self.entries.len() as u32).to_be_bytes());
2043 for i in 0..self.entries.len() {
2044 let full_key = self.get_full_key(i).unwrap_or_default();
2045 buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2046 buf.extend_from_slice(&full_key);
2047 let lsn = self.get_lsn(i); // T-3
2048 let e = &self.entries[i];
2049 buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2050 if let Some(d) = &e.data {
2051 buf.push(1u8);
2052 buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2053 buf.extend_from_slice(d);
2054 } else {
2055 buf.push(0u8);
2056 }
2057 buf.push(e.known_deleted as u8);
2058 }
2059 buf
2060 }
2061
2062 /// Serialise only dirty slots (BIN-delta write).
2063 ///
2064 /// Format (per dirty slot): slot_idx(u32BE) | key_len(u32BE) | key |
2065 /// lsn(u64BE) | has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
2066 ///
2067 /// Prepended by: node_id(u64BE) | num_dirty(u32BE).
2068 ///
2069 /// `BIN.writeToLog()` (delta path).
2070 pub fn serialize_delta(&self) -> Vec<u8> {
2071 let dirty: Vec<usize> = (0..self.entries.len())
2072 .filter(|&i| self.entries[i].dirty)
2073 .collect();
2074 let mut buf = Vec::new();
2075 buf.extend_from_slice(&self.node_id.to_be_bytes());
2076 buf.extend_from_slice(&(dirty.len() as u32).to_be_bytes());
2077 for idx in dirty {
2078 buf.extend_from_slice(&(idx as u32).to_be_bytes());
2079 let full_key = self.get_full_key(idx).unwrap_or_default();
2080 buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
2081 buf.extend_from_slice(&full_key);
2082 let lsn = self.get_lsn(idx); // T-3
2083 let e = &self.entries[idx];
2084 buf.extend_from_slice(&lsn.as_u64().to_be_bytes());
2085 if let Some(d) = &e.data {
2086 buf.push(1u8);
2087 buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
2088 buf.extend_from_slice(d);
2089 } else {
2090 buf.push(0u8);
2091 }
2092 buf.push(e.known_deleted as u8);
2093 }
2094 buf
2095 }
2096
2097 /// Deserialise a full BIN from the bytes produced by `serialize_full()`.
2098 ///
2099 /// Returns a `BinStub` with all entries populated and all slots marked
2100 /// clean (they are already on disk at `last_full_lsn`). Returns `None`
2101 /// if the byte slice is malformed.
2102 ///
2103 /// `INLogEntry.readEntry()` / `IN.readFromLog()` (non-delta).
2104 pub fn deserialize_full(bytes: &[u8]) -> Option<BinStub> {
2105 if bytes.len() < 12 {
2106 return None;
2107 }
2108 let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
2109 let num_entries =
2110 u32::from_be_bytes(bytes[8..12].try_into().ok()?) as usize;
2111 let mut pos = 12usize;
2112 let mut entries = Vec::with_capacity(num_entries);
2113 let mut lsns: Vec<Lsn> = Vec::with_capacity(num_entries);
2114 let mut keys: Vec<Vec<u8>> = Vec::with_capacity(num_entries); // T-2
2115 for _ in 0..num_entries {
2116 // key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len(u32BE) | data] | known_deleted(u8)
2117 if pos + 4 > bytes.len() {
2118 return None;
2119 }
2120 let key_len =
2121 u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2122 as usize;
2123 pos += 4;
2124 if pos + key_len > bytes.len() {
2125 return None;
2126 }
2127 let key = bytes[pos..pos + key_len].to_vec();
2128 pos += key_len;
2129 if pos + 8 > bytes.len() {
2130 return None;
2131 }
2132 let lsn = Lsn::from_u64(u64::from_be_bytes(
2133 bytes[pos..pos + 8].try_into().ok()?,
2134 ));
2135 pos += 8;
2136 if pos + 1 > bytes.len() {
2137 return None;
2138 }
2139 let has_data = bytes[pos] != 0;
2140 pos += 1;
2141 let data = if has_data {
2142 if pos + 4 > bytes.len() {
2143 return None;
2144 }
2145 let data_len =
2146 u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
2147 as usize;
2148 pos += 4;
2149 if pos + data_len > bytes.len() {
2150 return None;
2151 }
2152 let d = bytes[pos..pos + data_len].to_vec();
2153 pos += data_len;
2154 Some(d)
2155 } else {
2156 None
2157 };
2158 if pos + 1 > bytes.len() {
2159 return None;
2160 }
2161 let known_deleted = bytes[pos] != 0;
2162 pos += 1;
2163 entries.push(BinEntry {
2164 data,
2165 known_deleted,
2166 dirty: false, // freshly loaded from log — clean
2167 expiration_time: 0,
2168 });
2169 keys.push(key); // T-2 (full keys; recompute_key_prefix compresses)
2170 lsns.push(lsn); // T-3
2171 }
2172 // Keys stored in the serialized format are full (uncompressed) keys.
2173 // Re-establish the key prefix after loading so that memory use and
2174 // search performance match an in-memory BIN.
2175 // `IN.readFromLog()` → key prefix is part of the wire
2176 // format in the; in Noxu we store full keys and recompute on load.
2177 let mut bin = BinStub {
2178 node_id,
2179 level: BIN_LEVEL,
2180 entries,
2181 key_prefix: Vec::new(),
2182 dirty: false,
2183 is_delta: false,
2184 last_full_lsn: NULL_LSN, // caller sets this to the logged LSN
2185 last_delta_lsn: NULL_LSN,
2186 generation: 0,
2187 parent: None,
2188 expiration_in_hours: true,
2189 cursor_count: 0,
2190 prohibit_next_delta: false,
2191 lsn_rep: LsnRep::from_lsns(&lsns), // T-3
2192 keys: KeyRep::from_keys(keys), // T-2 (full keys, no prefix yet)
2193 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
2194 };
2195 // Recompute key prefix from the full keys just loaded.
2196 // `IN.recalcKeyPrefix()` called after materializing from log.
2197 if bin.entries.len() >= 2 {
2198 bin.recompute_key_prefix();
2199 } else {
2200 // Even a single-slot BIN should attempt compaction.
2201 bin.keys.compact(bin.compact_max_key_length);
2202 }
2203 Some(bin)
2204 }
2205
2206 /// Deserialise a BIN delta from the bytes produced by `serialize_delta()`.
2207 ///
2208 /// **DO NOT USE for BIN reconstruction.** This helper writes full
2209 /// (uncompressed) keys directly into slots without recomputing the BIN
2210 /// key prefix, so on a prefix-compressed BIN it corrupts the slot keys and
2211 /// breaks the sorted-suffix invariant. It is NOT wired into any live path.
2212 /// The correct delta-reconstruction path is
2213 /// `mutate_to_full_bin` → `apply_delta_to_bin` → `insert_with_prefix`,
2214 /// which recomputes the prefix. This function is retained only for the
2215 /// raw byte-format round-trip and must not be used to reconstitute a BIN.
2216 /// Tracked for removal — see the v3.x review synthesis (storage C-2).
2217 ///
2218 /// Returns `None` if `delta_bytes` is malformed.
2219 pub fn apply_delta(base: &mut BinStub, delta_bytes: &[u8]) -> Option<()> {
2220 if delta_bytes.len() < 12 {
2221 return None;
2222 }
2223 // node_id(u64BE) — must match base
2224 let _node_id = u64::from_be_bytes(delta_bytes[0..8].try_into().ok()?);
2225 let num_dirty =
2226 u32::from_be_bytes(delta_bytes[8..12].try_into().ok()?) as usize;
2227 let mut pos = 12usize;
2228 for _ in 0..num_dirty {
2229 // slot_idx(u32BE) | key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len | data] | known_deleted(u8)
2230 if pos + 4 > delta_bytes.len() {
2231 return None;
2232 }
2233 let slot_idx =
2234 u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2235 as usize;
2236 pos += 4;
2237 if pos + 4 > delta_bytes.len() {
2238 return None;
2239 }
2240 let key_len =
2241 u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
2242 as usize;
2243 pos += 4;
2244 if pos + key_len > delta_bytes.len() {
2245 return None;
2246 }
2247 let key = delta_bytes[pos..pos + key_len].to_vec();
2248 pos += key_len;
2249 if pos + 8 > delta_bytes.len() {
2250 return None;
2251 }
2252 let lsn = Lsn::from_u64(u64::from_be_bytes(
2253 delta_bytes[pos..pos + 8].try_into().ok()?,
2254 ));
2255 pos += 8;
2256 if pos + 1 > delta_bytes.len() {
2257 return None;
2258 }
2259 let has_data = delta_bytes[pos] != 0;
2260 pos += 1;
2261 let data = if has_data {
2262 if pos + 4 > delta_bytes.len() {
2263 return None;
2264 }
2265 let data_len = u32::from_be_bytes(
2266 delta_bytes[pos..pos + 4].try_into().ok()?,
2267 ) as usize;
2268 pos += 4;
2269 if pos + data_len > delta_bytes.len() {
2270 return None;
2271 }
2272 let d = delta_bytes[pos..pos + data_len].to_vec();
2273 pos += data_len;
2274 Some(d)
2275 } else {
2276 None
2277 };
2278 if pos + 1 > delta_bytes.len() {
2279 return None;
2280 }
2281 let known_deleted = delta_bytes[pos] != 0;
2282 pos += 1;
2283
2284 // Apply to base: update existing slot or insert new one.
2285 if slot_idx < base.entries.len() {
2286 base.keys.set(slot_idx, key); // T-2
2287 base.set_lsn(slot_idx, lsn); // T-3
2288 base.entries[slot_idx].data = data;
2289 base.entries[slot_idx].known_deleted = known_deleted;
2290 base.entries[slot_idx].dirty = false;
2291 } else {
2292 // Slot index beyond current length — append.
2293 base.entries.push(BinEntry {
2294 data,
2295 known_deleted,
2296 dirty: false,
2297 expiration_time: 0,
2298 });
2299 let n = base.entries.len();
2300 base.keys.insert(n - 1, key); // T-2
2301 base.lsn_rep.set(n - 1, lsn, n); // T-3
2302 }
2303 }
2304 Some(())
2305 }
2306
2307 /// Clear per-slot dirty flags and record `logged_at` as the LSN at which
2308 /// this BIN was last fully logged.
2309 ///
2310 /// Called by the checkpoint path after a successful full-BIN log write.
2311 /// `BIN.afterLog()` / `BIN.setLastFullLsn()`.
2312 pub fn clear_dirty_after_full_log(&mut self, logged_at: Lsn) {
2313 for e in &mut self.entries {
2314 e.dirty = false;
2315 }
2316 self.last_full_lsn = logged_at;
2317 self.dirty = false;
2318 // A full BIN captures all current state, so the delta-chain bound is
2319 // cleared: the next log may once again be a delta.
2320 // JE `IN.afterLog` clears the prohibit flag after a full log
2321 // (IN.java:5557 `bin.setProhibitNextDelta(false)`).
2322 self.prohibit_next_delta = false;
2323 }
2324
2325 /// Clear per-slot dirty flags after a successful delta log write.
2326 ///
2327 /// `last_full_lsn` is NOT updated — the full LSN only changes after a
2328 /// full BIN write.
2329 /// `BIN.afterLog()` (delta path).
2330 pub fn clear_dirty_after_delta_log(&mut self) {
2331 for e in &mut self.entries {
2332 e.dirty = false;
2333 }
2334 self.dirty = false;
2335 }
2336}
2337
2338impl TreeNode {
2339 /// Returns true if this is a BIN (bottom internal node).
2340 pub fn is_bin(&self) -> bool {
2341 matches!(self, TreeNode::Bottom(_))
2342 }
2343
2344 /// Returns the level of this node.
2345 pub fn level(&self) -> i32 {
2346 match self {
2347 TreeNode::Internal(n) => n.level,
2348 TreeNode::Bottom(b) => b.level,
2349 }
2350 }
2351
2352 /// Returns the node id of this node.
2353 pub fn node_id(&self) -> u64 {
2354 match self {
2355 TreeNode::Internal(n) => n.node_id,
2356 TreeNode::Bottom(b) => b.node_id,
2357 }
2358 }
2359
2360 /// Faithful in-memory heap footprint of this node, in bytes.
2361 ///
2362 /// JE `IN.getBudgetedMemorySize()` (IN.java) returns the running
2363 /// `inMemorySize` that `MemoryBudget` tracks for the node: the fixed
2364 /// IN/BIN struct overhead plus, per slot, the fixed entry overhead and the
2365 /// variable key (and embedded-LN data for BINs) bytes. This is the single
2366 /// source of truth for both the live tree accounting and the evictor's
2367 /// detach credit (EV-13) — keeping it on `TreeNode` avoids the formula
2368 /// drifting between `noxu-tree` and `noxu-evictor`.
2369 ///
2370 /// Rust has a fixed struct layout (unlike JE's `Sizeof`-measured JVM
2371 /// constants) so `size_of` is exact for the fixed overheads; the variable
2372 /// part mirrors JE's per-slot `entryKeys`/embedded-data accounting.
2373 pub fn budgeted_memory_size(&self) -> u64 {
2374 use std::mem::size_of;
2375 match self {
2376 TreeNode::Bottom(b) => {
2377 (size_of::<BinStub>()
2378 + b.entries.len() * size_of::<BinEntry>()
2379 + b.key_prefix.len()
2380 + b.keys.memory_size() // T-2: node-level key rep bytes
2381 + b.lsn_rep.memory_size() // T-3: node-level LSN rep bytes
2382 + b.entries
2383 .iter()
2384 .map(|e| {
2385 e.data.as_ref().map(|d| d.len()).unwrap_or(0)
2386 })
2387 .sum::<usize>()) as u64
2388 }
2389 TreeNode::Internal(n) => {
2390 (size_of::<InNodeStub>()
2391 + n.entries.len() * size_of::<InEntry>()
2392 + n.targets.memory_size()
2393 + n.entries.iter().map(|e| e.key.len()).sum::<usize>())
2394 as u64
2395 }
2396 }
2397 }
2398
2399 /// Binary search for a key in this node.
2400 ///
2401 /// For BIN nodes the search is prefix-aware: if the BIN has a key prefix,
2402 /// `key` (a full, uncompressed key) is compared against stored suffixes
2403 /// after stripping the prefix.
2404 /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
2405 ///
2406 /// Returns index with EXACT_MATCH flag set if exact match found.
2407 /// If exact is false, returns insertion point.
2408 pub fn find_entry(&self, key: &[u8], _indicator: bool, exact: bool) -> i32 {
2409 match self {
2410 TreeNode::Internal(n) => {
2411 let result = n
2412 .entries
2413 .binary_search_by(|entry| entry.key.as_slice().cmp(key));
2414 match result {
2415 Ok(idx) => (idx as i32) | EXACT_MATCH,
2416 Err(idx) => {
2417 if exact {
2418 -1
2419 } else {
2420 // Floor (not insertion point): the child slot to
2421 // descend into is the largest entry ≤ key. Slot 0
2422 // is the leftmost child, so a key below every
2423 // separator floors to 0. (St-H5: previously
2424 // returned the insertion point `idx`, which routes
2425 // one child too far right.)
2426 (idx as i32 - 1).max(0)
2427 }
2428 }
2429 }
2430 }
2431 TreeNode::Bottom(b) => {
2432 // Use prefix-aware search: the stored key is a suffix when
2433 // key_prefix is non-empty.
2434 let (idx, found) = b.find_entry_compressed(key);
2435 if found {
2436 (idx as i32) | EXACT_MATCH
2437 } else if exact {
2438 -1
2439 } else {
2440 idx as i32
2441 }
2442 }
2443 }
2444 }
2445
2446 /// Gets the number of entries in this node.
2447 pub fn get_n_entries(&self) -> usize {
2448 match self {
2449 TreeNode::Internal(n) => n.entries.len(),
2450 TreeNode::Bottom(b) => b.entries.len(),
2451 }
2452 }
2453
2454 // ========================================================================
2455 // Dirty flag
2456 // ========================================================================
2457
2458 /// Returns true if this node has been modified since last checkpoint.
2459 ///
2460 /// `IN.getDirty()`.
2461 pub fn is_dirty(&self) -> bool {
2462 match self {
2463 TreeNode::Internal(n) => n.dirty,
2464 TreeNode::Bottom(b) => b.dirty,
2465 }
2466 }
2467
2468 /// Sets or clears the dirty flag on this node.
2469 ///
2470 /// `IN.setDirty(boolean dirty)`.
2471 pub fn set_dirty(&mut self, dirty: bool) {
2472 match self {
2473 TreeNode::Internal(n) => n.dirty = dirty,
2474 TreeNode::Bottom(b) => b.dirty = dirty,
2475 }
2476 }
2477
2478 // ========================================================================
2479 // LRU generation
2480 // ========================================================================
2481
2482 /// Returns the LRU generation counter.
2483 ///
2484 /// `IN.getGeneration()`.
2485 pub fn get_generation(&self) -> u64 {
2486 match self {
2487 TreeNode::Internal(n) => n.generation,
2488 TreeNode::Bottom(b) => b.generation,
2489 }
2490 }
2491
2492 /// Sets the LRU generation counter.
2493 ///
2494 /// `IN.setGeneration(long gen)`.
2495 pub fn set_generation(&mut self, r#gen: u64) {
2496 match self {
2497 TreeNode::Internal(n) => n.generation = r#gen,
2498 TreeNode::Bottom(b) => b.generation = r#gen,
2499 }
2500 }
2501
2502 // ========================================================================
2503 // Parent pointer
2504 // ========================================================================
2505
2506 /// Returns a clone of the weak parent pointer, if any.
2507 pub fn get_parent(&self) -> Option<Weak<RwLock<TreeNode>>> {
2508 match self {
2509 TreeNode::Internal(n) => n.parent.clone(),
2510 TreeNode::Bottom(b) => b.parent.clone(),
2511 }
2512 }
2513
2514 /// Sets the weak parent pointer on this node.
2515 pub fn set_parent(&mut self, parent: Option<Weak<RwLock<TreeNode>>>) {
2516 match self {
2517 TreeNode::Internal(n) => n.parent = parent,
2518 TreeNode::Bottom(b) => b.parent = parent,
2519 }
2520 }
2521
2522 // ========================================================================
2523 // Log serialization
2524 // ========================================================================
2525
2526 /// Estimates the serialized byte size of this node for log/checkpoint use.
2527 ///
2528 /// `IN.getLogSize()` — Noxu-native serialization format.
2529 ///
2530 /// Format (big-endian):
2531 /// - node_id : 8 bytes
2532 /// - level : 4 bytes
2533 /// - n_entries : 4 bytes
2534 /// - dirty : 1 byte
2535 /// - For each entry:
2536 /// - key_len : 2 bytes
2537 /// - key : key_len bytes
2538 /// - lsn : 8 bytes
2539 pub fn log_size(&self) -> usize {
2540 // Fixed header: node_id(8) + level(4) + n_entries(4) + dirty(1)
2541 let mut size: usize = 8 + 4 + 4 + 1;
2542 match self {
2543 TreeNode::Internal(n) => {
2544 for entry in &n.entries {
2545 size += 2 + entry.key.len() + 8; // key_len + key + lsn
2546 }
2547 }
2548 TreeNode::Bottom(b) => {
2549 for i in 0..b.entries.len() {
2550 size += 2 + b.get_key(i).len() + 8; // key_len + key + lsn
2551 }
2552 }
2553 }
2554 size
2555 }
2556
2557 /// Serializes this node to bytes for log writing.
2558 ///
2559 /// `IN.writeToLog(ByteBuffer logBuffer)` — Noxu-native
2560 /// format matching `log_size()`.
2561 pub fn write_to_bytes(&self) -> Vec<u8> {
2562 let mut buf = Vec::with_capacity(self.log_size());
2563 match self {
2564 TreeNode::Internal(n) => {
2565 buf.extend_from_slice(&n.node_id.to_be_bytes());
2566 buf.extend_from_slice(&n.level.to_be_bytes());
2567 buf.extend_from_slice(&(n.entries.len() as u32).to_be_bytes());
2568 buf.push(n.dirty as u8);
2569 for (i, entry) in n.entries.iter().enumerate() {
2570 buf.extend_from_slice(
2571 &(entry.key.len() as u16).to_be_bytes(),
2572 );
2573 buf.extend_from_slice(&entry.key);
2574 buf.extend_from_slice(&n.get_lsn(i).as_u64().to_be_bytes());
2575 }
2576 }
2577 TreeNode::Bottom(b) => {
2578 buf.extend_from_slice(&b.node_id.to_be_bytes());
2579 buf.extend_from_slice(&b.level.to_be_bytes());
2580 buf.extend_from_slice(&(b.entries.len() as u32).to_be_bytes());
2581 buf.push(b.dirty as u8);
2582 for i in 0..b.entries.len() {
2583 let key = b.get_key(i);
2584 buf.extend_from_slice(&(key.len() as u16).to_be_bytes());
2585 buf.extend_from_slice(key);
2586 buf.extend_from_slice(&b.get_lsn(i).as_u64().to_be_bytes());
2587 }
2588 }
2589 }
2590 buf
2591 }
2592}
2593
/// Internal helper used during splits to carry the slots of either node kind.
///
/// `BinStub` and `InNodeStub` store different entry types, so a common
/// wrapper lets split code pass slices of slots around without duplicating
/// itself per node kind. In both variants the vectors are parallel: element
/// `i` of each vector describes the same slot.
enum SplitEntries {
    /// Upper-IN slots, as three parallel vectors:
    /// 1. the `InEntry` values;
    /// 2. the resident-child pointers (`None` when the child is not cached);
    /// 3. the per-slot LSNs (T-3: LSNs travel with their slots on a split,
    ///    just like JE `IN.split` copies
    ///    `entryLsnByteArray`/`entryLsnLongArray`).
    Internal(Vec<InEntry>, Vec<Option<ChildArc>>, Vec<Lsn>),
    /// BIN slots, as three parallel vectors:
    /// 1. the `BinEntry` values (metadata only);
    /// 2. the per-slot LSNs;
    /// 3. the FULL keys (T-2: keys live in the node-level `KeyRep`, not in
    ///    `BinEntry`, so they travel as a separate `Vec<Vec<u8>>` of full
    ///    keys through the split — the new BINs recompute their prefix from
    ///    these).
    Bottom(Vec<BinEntry>, Vec<Lsn>, Vec<Vec<u8>>),
}
2610
2611impl SplitEntries {
2612 /// Returns the number of entries.
2613 fn len(&self) -> usize {
2614 match self {
2615 SplitEntries::Internal(v, _, _) => v.len(),
2616 SplitEntries::Bottom(v, _, _) => v.len(),
2617 }
2618 }
2619
2620 /// Returns the key at `index` as a slice.
2621 fn get_key(&self, index: usize) -> &[u8] {
2622 match self {
2623 SplitEntries::Internal(v, _, _) => v[index].key.as_slice(),
2624 SplitEntries::Bottom(_, _, k) => k[index].as_slice(),
2625 }
2626 }
2627
2628 /// Returns a sub-range `[lo, hi)` as a new `SplitEntries`.
2629 fn slice(&self, lo: usize, hi: usize) -> Self {
2630 match self {
2631 SplitEntries::Internal(v, c, l) => SplitEntries::Internal(
2632 v[lo..hi].to_vec(),
2633 c[lo..hi].to_vec(),
2634 l[lo..hi].to_vec(),
2635 ),
2636 SplitEntries::Bottom(v, l, k) => SplitEntries::Bottom(
2637 v[lo..hi].to_vec(),
2638 l[lo..hi].to_vec(),
2639 k[lo..hi].to_vec(),
2640 ),
2641 }
2642 }
2643}
2644
/// Tri-state outcome from one attempt at
/// `Tree::get_adjacent_bin_attempt`.
///
/// Distinguishes "the tree genuinely has no BIN in the requested
/// direction" (→ propagate as end-of-iteration) from "the path we
/// captured was invalidated by a concurrent split" (→ caller
/// retries from root). The distinction is necessary because the cursor
/// translates a `None` from `get_adjacent_bin` into
/// `OperationStatus::NotFound`, which is indistinguishable from a
/// real end-of-tree.
#[derive(Debug)]
enum AdjacentBinOutcome {
    /// A BIN was found in the requested direction. T-3: each slot carries its
    /// `Lsn` alongside the `BinEntry` (the LSN lives in the node's packed
    /// `LsnRep`, not in `BinEntry`, so the scan snapshot pairs them).
    /// The trailing `Vec<u8>` is presumably the slot's full key —
    /// TODO confirm against `get_adjacent_bin_attempt`.
    Found(Vec<(BinEntry, Lsn, Vec<u8>)>),
    /// The tree genuinely has no BIN in the requested direction.
    NoAdjacent,
    /// A concurrent split invalidated our captured path; the
    /// caller should retry from root.
    SplitRaceRetry,
}
2667
/// Split hint for the `splitSpecial` heuristic.
///
/// JE `Tree.forceSplit` tracks `allLeftSideDescent` / `allRightSideDescent`
/// (true if **every** routing decision during the top-down descent followed
/// the leftmost / rightmost child). At split time, when one of those flags
/// is set, `IN.splitSpecial` forces the split index to 1 (left side) or
/// `nEntries - 1` (right side) instead of `nEntries / 2`.
///
/// Effect: for sequential-append workloads the left BIN stays near-full
/// after every split (only one entry migrates to the new sibling), cutting
/// the split count roughly in half and reducing write amplification.
///
/// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SplitHint {
    /// Normal midpoint split (`n_entries / 2`).
    Normal,
    /// The key routed through position 0 on every level of the descent.
    /// → `split_index = 1`, as in JE `IN.splitSpecial` for an all-left
    /// descent.
    AllLeft,
    /// The key routed through the rightmost position on every level of the
    /// descent.
    /// → `split_index = n_entries - 1` so the left node keeps almost
    /// everything.
    AllRight,
}
2692
2693impl Tree {
2694 /// Creates a new empty tree.
2695 ///
2696 /// Constructor.
2697 pub fn new(database_id: u64, max_entries_per_node: usize) -> Self {
2698 Tree {
2699 database_id,
2700 max_entries_per_node,
2701 root: RwLock::new(None),
2702 root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2703 root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2704 root_splits: AtomicU64::new(0),
2705 relatches_required: AtomicU64::new(0),
2706 key_comparator: None,
2707 memory_counter: None,
2708 in_list_listener: None,
2709 log_manager: None,
2710 redo_capacity_hint: 0,
2711 key_prefixing: false, // JE default: KEY_PREFIXING_DEFAULT = false
2712 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2713 }
2714 }
2715
2716 /// Installs a shared memory counter for evictor / MemoryBudget feedback.
2717 ///
2718 /// → `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
2719 ///. The counter is updated on every BIN entry insert/delete.
2720 pub fn set_memory_counter(&mut self, counter: Arc<AtomicI64>) {
2721 self.memory_counter = Some(counter);
2722 }
2723
2724 /// Installs the [`InListListener`] (the evictor) so node add/access/remove
2725 /// feed the LRU lists. JE: `INList` registration that feeds
2726 /// `Evictor.addBack`/`moveBack`/`remove`.
2727 pub fn set_in_list_listener(&mut self, listener: Arc<dyn InListListener>) {
2728 self.in_list_listener = Some(listener);
2729 }
2730
2731 /// Installs the [`noxu_log::LogManager`] so an evicted root IN can be
2732 /// re-materialized from its persisted LSN on the next access (EV-14).
2733 ///
2734 /// JE: the tree reaches the log through `database.getEnv().getLogManager()`
2735 /// for `ChildReference.fetchTarget`. Noxu installs it directly.
2736 pub fn set_log_manager(&mut self, lm: Arc<noxu_log::LogManager>) {
2737 self.log_manager = Some(lm);
2738 }
2739
2740 /// Drops this tree's `Arc<LogManager>` reference (EV-14 teardown).
2741 ///
2742 /// The env's `Drop` calls this on every tree it owns so the
2743 /// `Tree -> Arc<LogManager> -> Arc<FileManager>` chain cannot keep the
2744 /// FileManager (and its on-disk exclusive lock) alive past environment
2745 /// close. After this the tree can no longer re-fetch an evicted root
2746 /// from the log — which is correct, because the environment is shutting
2747 /// down and the tree is about to be dropped.
2748 pub fn clear_log_manager(&mut self) {
2749 self.log_manager = None;
2750 }
2751
2752 /// T-5: set the compact-key threshold (`TREE_COMPACT_MAX_KEY_LENGTH` /
2753 /// `IN.getCompactMaxKeyLength`). New BINs created by this tree inherit it;
2754 /// `<= 0` disables the compact key rep. Default 16.
2755 pub fn set_compact_max_key_length(&mut self, len: i32) {
2756 self.compact_max_key_length = len;
2757 }
2758
2759 /// Notify the listener that a node became resident (JE `Evictor.addBack`).
2760 #[inline]
2761 fn note_added(&self, node_id: u64) {
2762 if let Some(l) = &self.in_list_listener {
2763 l.note_ins_added(node_id);
2764 }
2765 }
2766
2767 /// Notify the listener that a resident node was accessed
2768 /// (JE `Evictor.moveBack` — LRU touch).
2769 #[inline]
2770 fn note_accessed(&self, node_id: u64) {
2771 if let Some(l) = &self.in_list_listener {
2772 l.note_ins_accessed(node_id);
2773 }
2774 }
2775
2776 /// Notify the listener that a node was removed (JE `Evictor.remove`).
2777 #[inline]
2778 fn note_removed(&self, node_id: u64) {
2779 if let Some(l) = &self.in_list_listener {
2780 l.note_ins_removed(node_id);
2781 }
2782 }
2783
2784 /// Creates a new empty tree with a custom key comparator.
2785 ///
2786 /// Used for sorted-duplicate databases where keys are two-part
2787 /// composite keys that require a custom ordering function.
2788 ///
2789 /// Constructor with `btreeComparator` parameter.
2790 pub fn new_with_comparator(
2791 database_id: u64,
2792 max_entries_per_node: usize,
2793 comparator: KeyComparatorFn,
2794 ) -> Self {
2795 Tree {
2796 database_id,
2797 max_entries_per_node,
2798 root: RwLock::new(None),
2799 root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
2800 root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
2801 root_splits: AtomicU64::new(0),
2802 relatches_required: AtomicU64::new(0),
2803 key_comparator: Some(comparator),
2804 memory_counter: None,
2805 in_list_listener: None,
2806 log_manager: None,
2807 redo_capacity_hint: 0,
2808 key_prefixing: false,
2809 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH, // T-5
2810 }
2811 }
2812
2813 /// Sets the key-prefixing flag.
2814 ///
2815 /// When `true`, BIN key-prefix compression is enabled: shared leading
2816 /// bytes are factored out of each slot's key. When `false` (the
2817 /// default), keys are stored verbatim — matching JE
2818 /// `DatabaseConfig.setKeyPrefixing(false)` / `IN.computeKeyPrefix`
2819 /// returning `null`.
2820 ///
2821 /// Ref: `IN.java computeKeyPrefix` ~line 2456.
2822 pub fn set_key_prefixing(&mut self, enabled: bool) {
2823 self.key_prefixing = enabled;
2824 }
2825
2826 /// Sets the key comparator, replacing any existing one.
2827 pub fn set_comparator(&mut self, comparator: KeyComparatorFn) {
2828 self.key_comparator = Some(comparator);
2829 }
2830
2831 /// Store a capacity hint used by `redo_insert` when it creates the first
2832 /// BIN for this tree (the first-key path).
2833 ///
2834 /// The first BIN's `entries` Vec is pre-allocated with
2835 /// `capacity.min(max_entries_per_node)` slots, eliminating the
2836 /// Vec-resize doubling cycle (1 → 2 → 4 → … → cap) that would
2837 /// otherwise occur during the redo loop.
2838 ///
2839 /// Call once before the redo loop. Has no effect on `insert` (the
2840 /// normal, non-recovery path).
2841 ///
2842 /// Wave 11-K optimisation (Fix 3).
2843 pub fn hint_redo_capacity(&mut self, capacity: usize) {
2844 self.redo_capacity_hint = capacity;
2845 }
2846
2847 /// Returns the current redo capacity hint (0 = no hint set).
2848 pub fn get_redo_capacity_hint(&self) -> usize {
2849 self.redo_capacity_hint
2850 }
2851
2852 /// Takes the key comparator out of this tree (leaving None).
2853 pub fn take_comparator(&mut self) -> Option<KeyComparatorFn> {
2854 self.key_comparator.take()
2855 }
2856
2857 /// Returns a reference to the key comparator, if configured.
2858 ///
2859 /// Used by `CursorImpl::find_bin_for_key` (R4 fix) so the cursor's own
2860 /// IN-level descent uses the same comparator-aware floor slot as the
2861 /// tree's own search paths. Mirrors JE `DatabaseImpl.getKeyComparator()`.
2862 pub fn get_comparator(&self) -> Option<&KeyComparatorFn> {
2863 self.key_comparator.as_ref()
2864 }
2865
2866 /// Returns the key comparator if set, or performs lexicographic comparison.
2867 #[inline]
2868 fn key_cmp(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering {
2869 match &self.key_comparator {
2870 Some(cmp) => cmp(a, b),
2871 None => a.cmp(b),
2872 }
2873 }
2874
2875 /// Floor child slot index for descending an internal node: the largest
2876 /// slot whose key is ≤ `key`. Slot 0 carries a virtual −∞ key (always
2877 /// qualifies); `entries[1..]` are sorted ascending, so this binary-searches
2878 /// the partition point instead of an O(n) linear walk (St-H4). Uses
2879 /// `key_cmp` so a configured custom comparator is honoured on every descent
2880 /// path. Returns 0 for an empty/single-slot node.
2881 fn upper_in_floor_index(&self, entries: &[InEntry], key: &[u8]) -> usize {
2882 if entries.len() <= 1 {
2883 return 0;
2884 }
2885 entries[1..].partition_point(|e| {
2886 self.key_cmp(e.key.as_slice(), key) != std::cmp::Ordering::Greater
2887 })
2888 }
2889
2890 /// Returns true if the tree has no root (is empty).
2891 pub fn is_empty(&self) -> bool {
2892 self.root.read().is_none()
2893 }
2894
2895 /// Sets the root of the tree.
2896 ///
2897 /// Must hold root_latch exclusively before calling.
2898 pub fn set_root(&self, node: TreeNode) {
2899 *self.root.write() = Some(Arc::new(RwLock::new(node)));
2900 }
2901
2902 /// Returns the root Arc, if any.
2903 ///
2904 /// Returns a cloned `Arc` rather than a reference so the caller does not
2905 /// hold the inner `RwLock` guard.
2906 ///
2907 /// EV-14: when the in-memory root has been evicted (`evict_root`) but a
2908 /// persisted version exists (`root_log_lsn` set), this re-materializes it
2909 /// from the log before returning — the faithful equivalent of JE
2910 /// `Tree.getRootIN` always calling `root.fetchTarget(...)`. Returns
2911 /// `None` only for a genuinely empty tree (no resident root and no
2912 /// persisted root LSN).
2913 pub fn get_root(&self) -> Option<Arc<RwLock<TreeNode>>> {
2914 if let Some(r) = self.root.read().clone() {
2915 return Some(r);
2916 }
2917 // Root not resident: re-fetch it from `root_log_lsn` if one exists
2918 // (a no-op returning None when the tree was never populated).
2919 self.fetch_root_from_log()
2920 }
2921
2922 /// Returns the database ID.
2923 pub fn get_database_id(&self) -> u64 {
2924 self.database_id
2925 }
2926
2927 /// Count the total number of live (non-deleted) entries across all BINs.
2928 ///
2929 /// Used by `DatabaseImpl::set_recovered_tree()` to initialise the
2930 /// per-database `entry_count` AtomicU64 after recovery replays the log.
2931 pub fn count_entries(&self) -> u64 {
2932 let mut total = 0u64;
2933 if let Some(root) = self.get_root() {
2934 Self::count_entries_recursive(&root, &mut total);
2935 }
2936 total
2937 }
2938
2939 /// DBI-14: collect every live `(full_key, data, lsn)` triple in physical
2940 /// (left-to-right) order. Used by `resort_under_comparator` to rebuild a
2941 /// tree whose slots were laid out in byte order (e.g. by recovery redo,
2942 /// which has no access to the application comparator) under the real
2943 /// configured comparator.
2944 fn collect_all_entries(&self) -> Vec<(Vec<u8>, Vec<u8>, Lsn)> {
2945 let mut out = Vec::new();
2946 if let Some(root) = self.get_root() {
2947 Self::collect_all_entries_recursive(&root, &mut out);
2948 }
2949 out
2950 }
2951
2952 fn collect_all_entries_recursive(
2953 node_arc: &Arc<RwLock<TreeNode>>,
2954 out: &mut Vec<(Vec<u8>, Vec<u8>, Lsn)>,
2955 ) {
2956 let guard = node_arc.read();
2957 match &*guard {
2958 TreeNode::Bottom(b) => {
2959 for i in 0..b.entries.len() {
2960 if b.entries[i].known_deleted {
2961 continue;
2962 }
2963 if let Some(fk) = b.get_full_key(i) {
2964 let data =
2965 b.entries[i].data.clone().unwrap_or_default();
2966 out.push((fk, data, b.get_lsn(i)));
2967 }
2968 }
2969 }
2970 TreeNode::Internal(n) => {
2971 let children: Vec<Arc<RwLock<TreeNode>>> =
2972 n.resident_children();
2973 drop(guard);
2974 for child in &children {
2975 Self::collect_all_entries_recursive(child, out);
2976 }
2977 }
2978 }
2979 }
2980
2981 /// DBI-14: rebuild this tree so that its on-disk byte-ordered slot layout
2982 /// is re-sorted under the currently-configured key comparator.
2983 ///
2984 /// Recovery redo (`redo_insert`) has no access to the application's
2985 /// comparator function — only the persisted identity — so it lays keys
2986 /// out in unsigned-byte order. After `set_recovered_tree` attaches the
2987 /// real comparator, the slots must be re-sorted, or comparator-driven
2988 /// searches would binary-search a tree ordered by the wrong relation.
2989 ///
2990 /// No-op when no comparator is configured (byte order already matches the
2991 /// recovered layout) or when the tree is empty. Mirrors the effect of
2992 /// JE reconstructing the comparator at open and the tree always having
2993 /// been built under it.
2994 pub fn resort_under_comparator(&self) {
2995 if self.key_comparator.is_none() {
2996 return;
2997 }
2998 let entries = self.collect_all_entries();
2999 if entries.is_empty() {
3000 return;
3001 }
3002 // Drop the current root; re-insert every entry through the normal
3003 // comparator-aware insert path so the new layout obeys the comparator.
3004 *self.root.write() = None;
3005 *self.root_log_lsn.write() = noxu_util::NULL_LSN;
3006 for (key, data, lsn) in entries {
3007 // Best-effort: a failed re-insert would be a tree-structure bug;
3008 // surface it loudly in debug builds.
3009 let r = self.insert(key, data, lsn);
3010 debug_assert!(
3011 r.is_ok(),
3012 "resort_under_comparator: re-insert failed: {r:?}"
3013 );
3014 }
3015 }
3016
3017 fn count_entries_recursive(
3018 node_arc: &Arc<RwLock<TreeNode>>,
3019 total: &mut u64,
3020 ) {
3021 let guard = node_arc.read();
3022 match &*guard {
3023 TreeNode::Bottom(b) => {
3024 // Count only live (non-known_deleted) entries.
3025 *total += b.entries.iter().filter(|e| !e.known_deleted).count()
3026 as u64;
3027 }
3028 TreeNode::Internal(n) => {
3029 let children: Vec<Arc<RwLock<TreeNode>>> =
3030 n.resident_children();
3031 drop(guard);
3032 for child in children {
3033 Self::count_entries_recursive(&child, total);
3034 }
3035 }
3036 }
3037 }
3038
3039 /// Sum the real in-memory heap footprint of every resident node in the
3040 /// tree (DBI-23 oracle / reconciliation), in bytes.
3041 ///
3042 /// Walks all resident IN/BIN nodes and adds each node's
3043 /// `budgeted_memory_size` (JE `IN.getBudgetedMemorySize`). This is the
3044 /// authoritative "real heap" figure the incrementally-maintained
3045 /// `memory_counter` is meant to approximate; an engine can call it to
3046 /// reconcile counter drift, and the DBI-23 test uses it as the oracle the
3047 /// live counter must stay within tolerance of.
3048 pub fn total_budgeted_memory(&self) -> u64 {
3049 let mut total = 0u64;
3050 if let Some(root) = self.get_root() {
3051 Self::total_budgeted_memory_recursive(&root, &mut total);
3052 }
3053 total
3054 }
3055
3056 fn total_budgeted_memory_recursive(
3057 node_arc: &Arc<RwLock<TreeNode>>,
3058 total: &mut u64,
3059 ) {
3060 let guard = node_arc.read();
3061 *total += guard.budgeted_memory_size();
3062 if let TreeNode::Internal(n) = &*guard {
3063 let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
3064 drop(guard);
3065 for child in children {
3066 Self::total_budgeted_memory_recursive(&child, total);
3067 }
3068 }
3069 }
3070
    /// Search for a BIN that should contain the given key.
    ///
    /// This is the core tree traversal operation. It walks from root to BIN
    /// using latch-coupling (acquire child latch, then release parent latch).
    ///
    /// Descends the tree until a BIN is reached, following the child pointer
    /// at the slot whose key is the largest key <= the search key (the "LTE"
    /// rule). Slot 0 in every upper IN carries a virtual key (-infinity) so
    /// any search key routes through it when all real keys are larger.
    ///
    /// Returns a `SearchResult` indicating where the key is or should be.
    /// Returns `None` if the tree is empty, if an internal node on the path
    /// has no entries, or if a non-resident child cannot be obtained from
    /// `child_at_or_fetch`.
    pub fn search(&self, key: &[u8]) -> Option<SearchResult> {
        let root = self.get_root()?;

        // Hand-over-hand latch coupling for the descent. At each level we
        // hold a `parking_lot::ArcRwLockReadGuard` on the current node;
        // before dropping it, we acquire the child's read guard via
        // `Arc::read_arc`. This keeps a continuous chain of read locks
        // along the descent path so that no concurrent `split_child(parent,
        // …)` can run on a node we are about to enter — `split_child` takes
        // `parent.write()` to install the new sibling, and that write
        // blocks while we hold `parent.read()`. Without this, the prior
        // pattern (capture child Arc, drop parent guard, then take child
        // read lock) left a window in which a split could relocate the
        // child entries: a search for a key that should have ended up in
        // the new sibling would instead reach the (now left-half) child
        // and return a false `NotFound`.
        //
        // `read_arc()` returns `ArcRwLockReadGuard<RawRwLock, TreeNode>`
        // — a guard that owns its own Arc reference, so it has no
        // borrow lifetime and can be held across loop iterations and
        // assignment.
        let mut guard: parking_lot::ArcRwLockReadGuard<
            parking_lot::RawRwLock,
            TreeNode,
        > = root.read_arc();

        loop {
            if guard.is_bin() {
                // JE: IN.fetchTarget / CursorImpl access moves the reached
                // BIN toward the hot end of the evictor's LRU list
                // (Evictor.moveBack). A freshly split BIN that has not yet
                // been registered is added here (moveBack is add-if-absent).
                if let TreeNode::Bottom(bin) = &*guard {
                    self.note_accessed(bin.node_id);
                }
                // Reached a BIN: final key lookup within the same guard.
                // Use indicate_if_duplicate=true so an exact match sets
                // EXACT_MATCH in the return value. Guard against -1 (not
                // found): -1i32 has all bits set, so the naive
                // `index & EXACT_MATCH != 0` check would incorrectly report
                // an exact match for a missing key.
                //
                // NOTE(review): on the non-comparator path a -1 result is
                // still masked with 0xFFFF, so the index reported alongside
                // `found == false` is 0xFFFF — confirm callers ignore the
                // index when the key was not found.
                let (found, raw_idx) = match &*guard {
                    TreeNode::Bottom(bin) => match &self.key_comparator {
                        Some(cmp) => {
                            let (idx, exact) =
                                bin.find_entry_cmp(key, cmp.as_ref());
                            (exact, idx as i32)
                        }
                        None => {
                            let index = guard.find_entry(key, true, true);
                            let exact =
                                index >= 0 && (index & EXACT_MATCH != 0);
                            (exact, index & 0xFFFF)
                        }
                    },
                    _ => {
                        let index = guard.find_entry(key, true, true);
                        let exact = index >= 0 && (index & EXACT_MATCH != 0);
                        (exact, index & 0xFFFF)
                    }
                };
                // CursorImpl.isProbablyExpired(): if an exact match
                // was found, check whether the entry's TTL has already elapsed.
                // If it has, treat the slot as not found so callers skip it.
                //
                // TREE-F1: also treat a known_deleted slot as ABSENT on an
                // exact lookup, mirroring the tail of IN.findEntry
                // (IN.java:3197): `if (ret >= 0 && exact &&
                // isEntryKnownDeleted(ret & 0xffff)) return -1;`. KD slots
                // legitimately exist in live BINs during BIN-delta
                // reconstitution until the compressor reclaims them.
                //
                // Both checks are delegated to `bin.slot_is_live`.
                let found = if found {
                    if let TreeNode::Bottom(bin) = &*guard {
                        let idx = (raw_idx & 0x7FFF) as usize;
                        bin.slot_is_live(idx)
                    } else {
                        found
                    }
                } else {
                    found
                };
                return Some(SearchResult::with_values(found, raw_idx, false));
            }

            // Upper IN: find the child slot with the largest key <= search
            // key, and capture the child Arc WHILE HOLDING the guard.
            // Slot 0 has a virtual key that compares as -infinity.
            //
            // The parent Arc is cloned up front because it is needed after
            // the guard is dropped when the child has to be re-fetched.
            let parent_arc =
                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
            // Despite its name, `next_arc` holds the slot INDEX of a
            // non-resident child; resident children `continue` the loop.
            let next_arc = match &*guard {
                TreeNode::Internal(n) => {
                    if n.entries.is_empty() {
                        return None;
                    }
                    // Floor slot: largest slot whose key <= search key
                    // (slot 0 always qualifies via its virtual -infinity
                    // key), located by `upper_in_floor_index`.
                    let idx = self.upper_in_floor_index(&n.entries, key);
                    match n.get_child(idx) {
                        // Resident child: keep the hand-over-hand fast path.
                        Some(c) => {
                            let next_guard = c.read_arc();
                            drop(guard);
                            guard = next_guard;
                            continue;
                        }
                        // EV-14/EV-13: child evicted — re-fetch it from its
                        // slot LSN (JE ChildReference.fetchTarget). Must
                        // drop the parent read guard to upgrade to a write
                        // latch inside child_at_or_fetch.
                        None => idx,
                    }
                }
                TreeNode::Bottom(_) => {
                    unreachable!("is_bin() returned false above")
                }
            };
            drop(guard);
            let child = self.child_at_or_fetch(&parent_arc, next_arc)?;
            guard = child.read_arc();
        }
    }
3206
    /// Combined search-and-fetch: descend once to the BIN and return the
    /// slot's data together with a reference to the BIN arc.
    ///
    /// Replaces the previous three-descent sequence on the `Database::get`
    /// hot path:
    /// 1. `Tree::search` — existence check only.
    /// 2. `CursorImpl::get_data_from_tree` — re-descended to fetch data.
    /// 3. `CursorImpl::find_bin_for_key` — re-descended for BIN pinning.
    ///
    /// One descent now does all three jobs. At the BIN level it uses the
    /// existing binary-search helper `find_entry_compressed` (or
    /// `find_entry_cmp` when a comparator is configured) instead of the
    /// O(n) `iter().find()` used by `get_data_from_tree`.
    ///
    /// Returns `None` when the tree is empty, when an internal node on the
    /// path has no entries, or when a non-resident child cannot be obtained
    /// from `child_at_or_fetch`. Otherwise returns `Some(SlotFetch)` —
    /// callers must inspect `SlotFetch::found` to determine whether the key
    /// was present; on a miss `data` is `None` and `lsn` / `slot_index` are
    /// 0. The BIN read-guard is released before this method returns so
    /// callers may safely call `lock_ln` (which may block) without holding
    /// any tree latch.
    ///
    /// NOTE(review): unlike `Tree::search`, this path does not call
    /// `note_accessed` on the reached BIN — confirm the LRU touch happens
    /// elsewhere on the get path.
    ///
    /// Wave-11-I — see the 2026 review.
    pub fn search_with_data(&self, key: &[u8]) -> Option<SlotFetch> {
        let root = self.get_root()?;
        let mut guard: parking_lot::ArcRwLockReadGuard<
            parking_lot::RawRwLock,
            TreeNode,
        > = root.read_arc();

        loop {
            if guard.is_bin() {
                // Capture the BIN Arc before inspecting entries.
                let bin_arc =
                    parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();

                let (found, data, lsn, slot_index) = match &*guard {
                    TreeNode::Bottom(bin) => {
                        let (idx, exact) = match &self.key_comparator {
                            Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
                            None => bin.find_entry_compressed(key),
                        };
                        if exact {
                            // TREE-F1: a slot is reported as found only when
                            // live (not known_deleted, not TTL-expired) — the
                            // same predicate used by Tree::search and the
                            // cursor scan. Mirrors IN.findEntry (IN.java:3197)
                            // and CursorImpl.isProbablyExpired.
                            if bin.slot_is_live(idx) {
                                let lsn = bin.get_lsn(idx); // T-3
                                let e = &bin.entries[idx];
                                (true, e.data.clone(), lsn.as_u64(), idx)
                            } else {
                                (false, None, 0u64, 0)
                            }
                        } else {
                            (false, None, 0u64, 0)
                        }
                    }
                    // Not expected: `is_bin()` was true just above.
                    _ => (false, None, 0u64, 0),
                };
                // Release the BIN read guard before returning so the caller
                // can call lock_ln (which may block) without holding a latch.
                drop(guard);
                return Some(SlotFetch {
                    found,
                    data,
                    lsn,
                    slot_index,
                    bin_arc,
                });
            }

            // Upper IN: same hand-over-hand descent as `Tree::search`.
            let parent_arc =
                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
            // `next_idx` is only produced for a non-resident child; resident
            // children `continue` the loop directly.
            let next_idx = match &*guard {
                TreeNode::Internal(n) => {
                    if n.entries.is_empty() {
                        return None;
                    }
                    // Slot 0 = virtual −∞; pick the floor slot (largest key
                    // ≤ search key) via `upper_in_floor_index`.
                    let idx = self.upper_in_floor_index(&n.entries, key);
                    match n.get_child(idx) {
                        Some(c) => {
                            let next_guard = c.read_arc();
                            drop(guard);
                            guard = next_guard;
                            continue;
                        }
                        // EV-14/EV-13: re-fetch an evicted child from its LSN.
                        None => idx,
                    }
                }
                TreeNode::Bottom(_) => {
                    unreachable!("is_bin() returned false above")
                }
            };
            // The parent read guard is released before `child_at_or_fetch`,
            // matching `Tree::search`.
            drop(guard);
            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
            guard = child.read_arc();
        }
    }
3307
3308 /// Sets the expiration time (in absolute hours since Unix epoch) for an
3309 /// existing key's BIN slot.
3310 ///
3311 /// Returns `true` if the key was found and updated, `false` otherwise.
3312 ///
3313 /// Used by `Database::put_with_options()` to apply per-record TTL.
3314 /// `IN.entryExpiration` / `BIN.expirationInHours` path.
3315 pub fn update_key_expiration(
3316 &self,
3317 key: &[u8],
3318 expiration_hours: u32,
3319 ) -> bool {
3320 let root = match self.get_root() {
3321 Some(r) => r,
3322 None => return false,
3323 };
3324 // Hand-over-hand latch coupling for the descent. At the BIN we
3325 // need a write lock; we drop our read lock first and take the
3326 // write lock under the protection of the *outer* parent's read
3327 // lock (held by the previous loop iteration's guard). For the
3328 // first iteration there is no outer parent, but no `split_child`
3329 // can run on the root itself in that single-level case because
3330 // root splits go through `split_root_if_needed` which holds
3331 // `self.root.write()`. So the worst case is that the root is
3332 // promoted from a single BIN to a level-2 IN between our read
3333 // detect and our write — handled by the `is_bin` re-check
3334 // inside the write lock.
3335 //
3336 // We retry the descent up to a small bound to absorb the rare
3337 // case where a concurrent split moved this key into the new
3338 // sibling between the read-chain release and the write-lock
3339 // acquisition. Without the retry, the sole caller
3340 // (`Database::put_with_options`) would silently lose the TTL
3341 // for the affected key. Three attempts is generous: each
3342 // retry only races a single split and splits are infrequent.
3343 for _ in 0..3 {
3344 let mut guard: parking_lot::ArcRwLockReadGuard<
3345 parking_lot::RawRwLock,
3346 TreeNode,
3347 > = root.read_arc();
3348 let bin_arc;
3349 loop {
3350 if guard.is_bin() {
3351 bin_arc =
3352 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3353 drop(guard);
3354 break;
3355 }
3356 let next_arc = match &*guard {
3357 TreeNode::Internal(n) => {
3358 if n.entries.is_empty() {
3359 return false;
3360 }
3361 let idx = self.upper_in_floor_index(&n.entries, key);
3362 match n.get_child(idx) {
3363 Some(c) => c,
3364 None => return false,
3365 }
3366 }
3367 TreeNode::Bottom(_) => unreachable!(),
3368 };
3369 let next_guard = next_arc.read_arc();
3370 drop(guard);
3371 guard = next_guard;
3372 }
3373
3374 // Now take the write lock on the BIN we descended to.
3375 let mut wguard = bin_arc.write();
3376 if let TreeNode::Bottom(bin) = &mut *wguard {
3377 let slot = if let Some(cmp) = &self.key_comparator {
3378 let (idx, exact) = bin.find_entry_cmp(key, cmp.as_ref());
3379 if exact { Some(idx) } else { None }
3380 } else {
3381 let (idx, exact) = bin.find_entry_compressed(key);
3382 if exact { Some(idx) } else { None }
3383 };
3384 if let Some(slot_idx) = slot
3385 && let Some(entry) = bin.entries.get_mut(slot_idx)
3386 {
3387 entry.expiration_time = expiration_hours;
3388 bin.expiration_in_hours = true;
3389 bin.dirty = true;
3390 return true;
3391 }
3392 }
3393 // Key not in this BIN — either it was never present or a
3394 // concurrent split moved it. Retry the descent; at most a
3395 // few iterations are needed to follow the key into its new
3396 // BIN.
3397 }
3398 false
3399 }
3400
    /// Returns the full key, data and LSN (as `u64`) of the first live BIN
    /// entry at or after `key`.
    ///
    /// Descends with the tree's key comparator (same path as `search()`), then
    /// within the BIN starts at the index returned by `find_entry_cmp` /
    /// `find_entry_compressed` and skips forward over non-live slots.
    /// Missing key or data is returned as an empty vector.
    ///
    /// Returns `None` when the tree is empty, when an internal node on the
    /// path has no entries, when a non-resident child cannot be obtained
    /// from `child_at_or_fetch`, or when the reached BIN has no live slot at
    /// or after the starting index.
    ///
    /// NOTE(review): only the BIN reached by the descent is examined. When
    /// that BIN has no live slot at/after the starting index this returns
    /// `None` without visiting the next BIN — confirm callers continue into
    /// the adjacent BIN themselves.
    ///
    /// Used by sorted-duplicate cursor `search(Set)` to position at the first
    /// (key, data) pair whose two-part key >= `lower_bound(primary_key)`.
    pub fn first_entry_at_or_after(
        &self,
        key: &[u8],
    ) -> Option<(Vec<u8>, Vec<u8>, u64)> {
        // Hand-over-hand latch coupling — see Tree::search for the
        // detailed rationale on why this closes a reader-vs-splitter
        // race window.
        let mut guard: parking_lot::ArcRwLockReadGuard<
            parking_lot::RawRwLock,
            TreeNode,
        > = self.get_root()?.read_arc();

        loop {
            if guard.is_bin() {
                let result = match &*guard {
                    TreeNode::Bottom(bin) => {
                        let (mut idx, _exact) = match &self.key_comparator {
                            Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
                            None => bin.find_entry_compressed(key),
                        };
                        // TREE-F1: skip non-live slots (known_deleted /
                        // TTL-expired) at/after the floor index, mirroring the
                        // cursor getNext skip (CursorImpl.java:2062-2064).
                        while idx < bin.entries.len() && !bin.slot_is_live(idx)
                        {
                            idx += 1;
                        }
                        if idx < bin.entries.len() {
                            let full_key =
                                bin.get_full_key(idx).unwrap_or_default();
                            let data = bin.entries[idx]
                                .data
                                .clone()
                                .unwrap_or_default();
                            let lsn = bin.get_lsn(idx).as_u64(); // T-3
                            Some((full_key, data, lsn))
                        } else {
                            // No live slot left in this BIN.
                            None
                        }
                    }
                    // Not expected: `is_bin()` was true just above.
                    _ => None,
                };
                return result;
            }

            // Upper IN: same descent as search().
            let parent_arc =
                parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
            // `next_idx` is only produced for a non-resident child; resident
            // children `continue` the loop directly.
            let next_idx = match &*guard {
                TreeNode::Internal(n) => {
                    if n.entries.is_empty() {
                        return None;
                    }
                    let idx = self.upper_in_floor_index(&n.entries, key);
                    match n.get_child(idx) {
                        Some(c) => {
                            let next_guard = c.read_arc();
                            drop(guard);
                            guard = next_guard;
                            continue;
                        }
                        None => idx, // EV-14/EV-13: re-fetch below.
                    }
                }
                TreeNode::Bottom(_) => unreachable!(),
            };
            // The parent read guard is released before `child_at_or_fetch`,
            // matching `Tree::search`.
            drop(guard);
            let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
            guard = child.read_arc();
        }
    }
3482
3483 /// Like [`Tree::first_entry_at_or_after`] but also returns the BIN node
3484 /// (so callers may pin it) and the entry's slot index inside that
3485 /// BIN.
3486 ///
3487 /// Wave 11-N (Bug 2): `CursorImpl::search_dup` previously stored
3488 /// `current_index = 0` after a sorted-dup `Search`, which broke the
3489 /// fast-path of `retrieve_next` (and the slow path's
3490 /// `next_index = current_index + 1` arithmetic) for any primary
3491 /// that was not the first slot of its BIN. This helper hands back
3492 /// the real index so the cursor can be positioned correctly.
3493 ///
3494 /// CC-2 fix: uses the same `read_arc()` hand-over-hand latch coupling
3495 /// as every other descent method (`search`, `first_entry_at_or_after`,
3496 /// `get_first_node`, `get_adjacent_bin_attempt`). The original
3497 /// implementation did `arc.read().is_bin()` (lock acquired and released)
3498 /// then a SECOND `arc.read()` on the next line — a gap in which a
3499 /// concurrent split can promote the node (BIN→upper IN) or move the
3500 /// sought key to a new sibling, yielding a false "not found" for an
3501 /// existing key. Mirrors JE `Tree.searchSubTree` / `Tree.search`
3502 /// which hold the latch across the `is_bin()` test and the subsequent
3503 /// entry lookup.
3504 pub fn first_entry_at_or_after_with_index(
3505 &self,
3506 key: &[u8],
3507 ) -> Option<(
3508 Vec<u8>,
3509 Vec<u8>,
3510 usize,
3511 u64,
3512 std::sync::Arc<crate::NodeRwLock<TreeNode>>,
3513 )> {
3514 // Hand-over-hand latch coupling — identical strategy to
3515 // first_entry_at_or_after; the guard is held continuously across
3516 // is_bin() and the subsequent entry lookup so no split can
3517 // restructure the path between the two observations.
3518 let mut guard: parking_lot::ArcRwLockReadGuard<
3519 parking_lot::RawRwLock,
3520 TreeNode,
3521 > = self.get_root()?.read_arc();
3522 loop {
3523 if guard.is_bin() {
3524 if let TreeNode::Bottom(bin) = &*guard {
3525 let (idx, _exact) = match &self.key_comparator {
3526 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
3527 None => bin.find_entry_compressed(key),
3528 };
3529 // TREE-F1: skip non-live slots (known_deleted /
3530 // TTL-expired) at/after the floor index
3531 // (CursorImpl.java:2062-2064).
3532 let mut idx = idx;
3533 while idx < bin.entries.len() && !bin.slot_is_live(idx) {
3534 idx += 1;
3535 }
3536 if idx < bin.entries.len() {
3537 let full_key =
3538 bin.get_full_key(idx).unwrap_or_default();
3539 let data =
3540 bin.entries[idx].data.clone().unwrap_or_default();
3541 let lsn = bin.get_lsn(idx).as_u64(); // T-3
3542 // Obtain the Arc for the BIN node the guard came from.
3543 // `ArcRwLockReadGuard::rwlock()` returns the backing Arc.
3544 let bin_arc =
3545 parking_lot::ArcRwLockReadGuard::rwlock(&guard)
3546 .clone();
3547 return Some((full_key, data, idx, lsn, bin_arc));
3548 } else {
3549 return None;
3550 }
3551 }
3552 return None;
3553 }
3554
3555 // Upper IN: descend as in first_entry_at_or_after / search.
3556 let parent_arc =
3557 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
3558 let next_idx = match &*guard {
3559 TreeNode::Internal(n) => {
3560 if n.entries.is_empty() {
3561 return None;
3562 }
3563 let idx = self.upper_in_floor_index(&n.entries, key);
3564 match n.get_child(idx) {
3565 Some(c) => {
3566 let next_guard = c.read_arc();
3567 drop(guard);
3568 guard = next_guard;
3569 continue;
3570 }
3571 None => idx, // EV-14/EV-13: re-fetch below.
3572 }
3573 }
3574 TreeNode::Bottom(_) => unreachable!(),
3575 };
3576 drop(guard);
3577 let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
3578 guard = child.read_arc();
3579 }
3580 }
3581
3582 /// Insert a key/data pair into the tree.
3583 ///
3584 /// . Handles the root-is-null case by
3585 /// creating a two-level tree (upper IN + BIN) per initialisation path,
3586 /// then delegates to `insert_recursive` which performs preemptive splitting
3587 /// as it descends.
3588 ///
3589 /// Returns Ok(true) if this was a new insert, Ok(false) if it was an update.
3590 pub fn insert(
3591 &self,
3592 key: Vec<u8>,
3593 data: Vec<u8>,
3594 lsn: Lsn,
3595 ) -> Result<bool, TreeError> {
3596 // Save sizes before potentially moving key/data — needed for memory tracking.
3597 let key_len = key.len();
3598 let data_len = data.len();
3599
3600 // First-key path. We MUST hold the write lock while testing
3601 // root.is_none() and replacing the root, otherwise N threads can all
3602 // observe an empty tree, each build a fresh single-entry root, and
3603 // the last writer's `*self.root.write() = Some(...)` silently
3604 // discards the others' inserts. (Reproducer:
3605 // xa_protocol_test::test_concurrent_independent_xids — 8 threads
3606 // each inserting their own key into an empty tree lost ~30% of
3607 // inserts before this lock change.)
3608 {
3609 let mut root_guard = self.root.write();
3610 if root_guard.is_none() {
3611 let bin_node_id = generate_node_id();
3612 let root_node_id = generate_node_id();
3613 let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3614 node_id: bin_node_id,
3615 level: BIN_LEVEL,
3616 entries: vec![BinEntry {
3617 data: Some(data),
3618 known_deleted: false,
3619 dirty: false,
3620 expiration_time: 0,
3621 }],
3622 key_prefix: Vec::new(), // single entry — no common prefix yet
3623 dirty: true,
3624 is_delta: false,
3625 last_full_lsn: NULL_LSN,
3626 last_delta_lsn: NULL_LSN,
3627 generation: 0,
3628 parent: None, // set below after root_in is created
3629 // St-H6: use true to match the engine-wide invariant that
3630 // every BIN which may hold TTL entries uses hours granularity
3631 // (JE BIN.java default; matches tree.rs:980 and read_from_log).
3632 expiration_in_hours: true,
3633 cursor_count: 0,
3634 prohibit_next_delta: false,
3635 lsn_rep: LsnRep::from_lsns(&[lsn]),
3636 keys: KeyRep::from_keys(vec![key]), // T-2
3637 compact_max_key_length: self.compact_max_key_length,
3638 })));
3639
3640 // Upper IN at level 2; slot 0 uses an empty key (virtual root key).
3641 let root_arc =
3642 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3643 node_id: root_node_id,
3644 level: MAIN_LEVEL | 2,
3645 entries: vec![InEntry {
3646 key: vec![], // virtual key for slot 0 in upper IN
3647 }],
3648 // T-4: the single resident child at slot 0.
3649 targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3650 dirty: true,
3651 generation: 0,
3652 parent: None,
3653 lsn_rep: LsnRep::from_lsns(&[lsn]),
3654 })));
3655
3656 // Wire the BIN's parent pointer back to the root IN.
3657 {
3658 let mut g = bin.write();
3659 g.set_parent(Some(Arc::downgrade(&root_arc)));
3660 }
3661
3662 *root_guard = Some(root_arc);
3663
3664 // JE: IN.fetchTarget / initial tree build registers the new
3665 // resident nodes with the evictor (Evictor.addBack).
3666 self.note_added(root_node_id);
3667 self.note_added(bin_node_id);
3668
3669 // Count the first entry.
3670 if let Some(counter) = &self.memory_counter {
3671 let delta =
3672 (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3673 counter.fetch_add(delta, Ordering::Relaxed);
3674 }
3675 return Ok(true);
3676 }
3677 // Another thread initialized the root while we were waiting for
3678 // the write lock; fall through and insert into the existing tree.
3679 }
3680
3681 // Check whether the root itself needs to be split before descending.
3682 // Tree.searchSplitsAllowed(): if rootIN.needsSplitting()
3683 // call splitRoot first.
3684 self.split_root_if_needed(lsn)?;
3685
3686 // Recursively insert, splitting children proactively as we descend
3687 // (forceSplit / searchSplitsAllowed pattern).
3688 let root_arc = self.get_root().unwrap();
3689 let result = Self::insert_recursive(
3690 &root_arc,
3691 key,
3692 data,
3693 lsn,
3694 self.max_entries_per_node,
3695 self.key_comparator.as_ref(),
3696 self.key_prefixing,
3697 self.in_list_listener.as_ref(),
3698 )?;
3699
3700 // Update the memory counter for new inserts.
3701 // IN.updateMemorySize(delta) → MemoryBudget.updateTreeMemoryUsage(delta).
3702 // LN_OVERHEAD = 48 bytes (approximate fixed overhead per entry).
3703 if result && let Some(counter) = &self.memory_counter {
3704 let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3705 counter.fetch_add(delta, Ordering::Relaxed);
3706 }
3707
3708 Ok(result)
3709 }
3710
3711 /// Recovery-redo variant of [`Tree::insert`] that accepts `&[u8]` slices.
3712 ///
3713 /// Eliminates the two intermediate `Vec<u8>` allocations that the normal
3714 /// insert path requires at the `redo_ln` call site (one for the key, one
3715 /// for the data). The compressed key suffix and the data bytes are each
3716 /// materialised into their `BinEntry` slots exactly once.
3717 ///
3718 /// Semantics are identical to `insert`:
3719 /// - Updates the existing slot when the key is already present.
3720 /// - Inserts a new sorted entry when the key is absent.
3721 /// - Triggers the same root-split and proactive-split logic.
3722 ///
3723 /// `data` should be the raw value bytes, or an empty slice for a
3724 /// deletion (which should not normally arrive here during redo, but is
3725 /// handled gracefully).
3726 ///
3727 /// Wave 11-K optimisation (Fix 1).
3728 pub fn redo_insert(
3729 &self,
3730 key: &[u8],
3731 data: &[u8],
3732 lsn: Lsn,
3733 ) -> Result<bool, TreeError> {
3734 let key_len = key.len();
3735 let data_len = data.len();
3736 let data_opt: Option<&[u8]> =
3737 if data.is_empty() { None } else { Some(data) };
3738
3739 // First-key path: initialise a two-level tree from scratch.
3740 {
3741 let mut root_guard = self.root.write();
3742 if root_guard.is_none() {
3743 // Pre-allocate the BIN's entries Vec using the redo capacity
3744 // hint (Fix 3). Without the hint the first BIN starts at
3745 // capacity 1 and doubles on each insert; with the hint it
3746 // starts at min(hint, max_entries) entries, eliminating
3747 // ~log2(max_entries) Vec-resize doublings.
3748 let initial_cap = if self.redo_capacity_hint > 0 {
3749 self.redo_capacity_hint.min(self.max_entries_per_node)
3750 } else {
3751 1
3752 };
3753 let mut initial_entries = Vec::with_capacity(initial_cap);
3754 initial_entries.push(BinEntry {
3755 data: data_opt.map(|d| d.to_vec()),
3756 known_deleted: false,
3757 dirty: false,
3758 expiration_time: 0,
3759 });
3760 let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
3761 node_id: generate_node_id(),
3762 level: BIN_LEVEL,
3763 entries: initial_entries,
3764 key_prefix: Vec::new(),
3765 dirty: true,
3766 is_delta: false,
3767 last_full_lsn: NULL_LSN,
3768 last_delta_lsn: NULL_LSN,
3769 generation: 0,
3770 parent: None,
3771 // St-H6: use true to match the engine-wide hours-only
3772 // invariant (JE BIN.java default; matches tree.rs:980).
3773 expiration_in_hours: true,
3774 cursor_count: 0,
3775 prohibit_next_delta: false,
3776 lsn_rep: LsnRep::from_lsns(&[lsn]),
3777 keys: KeyRep::from_keys(vec![key.to_vec()]), // T-2
3778 compact_max_key_length: self.compact_max_key_length,
3779 })));
3780
3781 let root_arc =
3782 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3783 node_id: generate_node_id(),
3784 level: MAIN_LEVEL | 2,
3785 entries: vec![InEntry { key: vec![] }],
3786 // T-4: the single resident child at slot 0.
3787 targets: TargetRep::Sparse(vec![(0, bin.clone())]),
3788 dirty: true,
3789 generation: 0,
3790 parent: None,
3791 lsn_rep: LsnRep::from_lsns(&[lsn]),
3792 })));
3793
3794 {
3795 let mut g = bin.write();
3796 g.set_parent(Some(Arc::downgrade(&root_arc)));
3797 }
3798
3799 *root_guard = Some(root_arc);
3800
3801 if let Some(counter) = &self.memory_counter {
3802 let delta =
3803 (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3804 counter.fetch_add(delta, Ordering::Relaxed);
3805 }
3806 return Ok(true);
3807 }
3808 }
3809
3810 self.split_root_if_needed(lsn)?;
3811
3812 let root_arc = self.get_root().unwrap();
3813 let result = Self::redo_insert_recursive(
3814 &root_arc,
3815 key,
3816 data_opt,
3817 lsn,
3818 self.max_entries_per_node,
3819 self.key_comparator.as_ref(),
3820 self.key_prefixing,
3821 )?;
3822
3823 if result && let Some(counter) = &self.memory_counter {
3824 let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
3825 counter.fetch_add(delta, Ordering::Relaxed);
3826 }
3827
3828 Ok(result)
3829 }
3830
3831 /// Splits the root node if it is full (needsSplitting).
3832 ///
3833 ///
3834 /// ```text
3835 /// 1. Save oldRoot (the current root IN or BIN).
3836 /// 2. Create newRoot at oldRoot.level + 1.
3837 /// 3. Insert oldRoot into newRoot at slot 0 with a virtual (empty) key.
3838 /// 4. Call split_node on oldRoot, passing newRoot as parent.
3839 /// 5. Replace tree root with newRoot.
3840 /// ```
3841 fn split_root_if_needed(&self, lsn: Lsn) -> Result<(), TreeError> {
3842 // Hold `self.root.write()` across the needs_split check and the
3843 // root promotion, mirroring the first-key path fix and matching
3844 // the broader insert/split serialisation discipline.
3845 //
3846 // With the previous read-then-write pattern, two concurrent
3847 // splitters could each observe needs_split == true, then take()
3848 // and install in turn, with the second wrapping the first's
3849 // already-promoted root in its own new IN. Each level wraps the
3850 // previous, producing a chain of one-child internal nodes. No
3851 // data is lost (every entry is still reachable) but the tree
3852 // becomes unnecessarily deep, and the imbalance can compound
3853 // under heavy concurrent insertion.
3854 let mut root_guard = self.root.write();
3855 let needs_split = match root_guard.as_ref() {
3856 Some(arc) => {
3857 let g = arc.read();
3858 g.get_n_entries() >= self.max_entries_per_node
3859 }
3860 None => false,
3861 };
3862 if !needs_split {
3863 return Ok(());
3864 }
3865
3866 // Create a fresh new root one level above the current root.
3867 let old_root_arc = root_guard.take().expect("checked Some above");
3868 let old_root_level = {
3869 let g = old_root_arc.read();
3870 g.level()
3871 };
3872
3873 // newRoot = new IN(level = oldRoot.level + 1) with slot 0 = oldRoot.
3874 // The key at slot 0 is the virtual key (empty slice) following the
3875 // convention that entry-zero in an upper IN compares as -infinity.
3876 let new_root_arc =
3877 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
3878 node_id: generate_node_id(),
3879 level: old_root_level + 1,
3880 entries: vec![InEntry { key: vec![] }],
3881 // T-4: slot 0's resident child is the old root.
3882 targets: TargetRep::Sparse(vec![(0, old_root_arc.clone())]),
3883 dirty: true,
3884 generation: 0,
3885 parent: None,
3886 lsn_rep: LsnRep::from_lsns(&[lsn]),
3887 })));
3888
3889 // Update the old root's parent pointer to the new root.
3890 {
3891 let mut g = old_root_arc.write();
3892 g.set_parent(Some(Arc::downgrade(&new_root_arc)));
3893 }
3894
3895 // Install the new root before calling split_child so split_child
3896 // (which itself takes parent.write()) can run unencumbered.
3897 *root_guard = Some(new_root_arc.clone());
3898 drop(root_guard);
3899
3900 // Now split the old root (which is now child at slot 0 in new_root).
3901 Self::split_child(
3902 &new_root_arc,
3903 0, // child is at slot 0
3904 self.max_entries_per_node,
3905 lsn,
3906 SplitHint::Normal,
3907 &[], // no insertion key at root-init time
3908 self.key_comparator.as_ref(),
3909 self.key_prefixing,
3910 self.in_list_listener.as_ref(),
3911 )?;
3912
3913 // EVICTOR-RECLAIM-1: register the freshly-promoted root IN with the
3914 // evictor's LRU (JE Tree.splitRoot adds the new root to the INList).
3915 // split_child above already registers the new sibling.
3916 let new_root_id = match &*new_root_arc.read() {
3917 TreeNode::Internal(n) => n.node_id,
3918 TreeNode::Bottom(b) => b.node_id,
3919 };
3920 self.note_added(new_root_id);
3921
3922 self.root_splits.fetch_add(1, Ordering::Relaxed);
3923 Ok(())
3924 }
3925
3926 /// Splits the child at `child_index` in `parent`.
3927 ///
3928 /// . This implementation always keeps the **left** half in the
3929 /// existing child node (`child_arc`) and puts the right half in the new
3930 /// sibling, regardless of where the `identifierKey` falls. JE's
3931 /// `IN.splitInternal` (`idKeyIndex` logic ~line 4172) can place either
3932 /// half in the existing node; Noxu's preemptive-split discipline ensures
3933 /// the parent always has a free slot at split time (the split is done on
3934 /// the way *down*, before the parent fills up), so the safe simplification
3935 /// of always using the left half is correct here — no routing information
3936 /// is lost. This comment replaces the previous incorrect claim that
3937 /// `idKeyIndex` drove the choice.
3938 ///
3939 /// Note: does not emit a split log entry; split nodes are marked dirty
3940 /// and flushed at the next checkpoint (flush_dirty_bins/upper_ins).
3941 ///
3942 /// ```text
3943 /// 1. splitIndex = child.nEntries / 2 (or 1 / n-1 for splitSpecial)
3944 /// 2. Create newSibling at the same level.
3945 /// 3. Move entries [splitIndex..nEntries) to newSibling.
3946 /// 4. Update parent slot childIndex -> child (left half),
3947 /// insert newSibling with newIdKey after childIndex.
3948 /// ```
3949 fn split_child(
3950 parent: &Arc<RwLock<TreeNode>>,
3951 child_index: usize,
3952 max_entries: usize,
3953 lsn: Lsn,
3954 hint: SplitHint,
3955 insert_key: &[u8],
3956 key_comparator: Option<&KeyComparatorFn>,
3957 key_prefixing: bool,
3958 listener: Option<&Arc<dyn InListListener>>,
3959 ) -> Result<(), TreeError> {
3960 // The split is performed under `parent.write()` for the entire
3961 // duration. This is a deliberate choice for correctness:
3962 //
3963 // - Without it, between dropping `child.write()` (after installing
3964 // the left half) and acquiring `parent.write()` (to install the
3965 // sibling), a concurrent descender can pick `child_arc` from the
3966 // parent (still pointing at it), descend, take `child.write()`
3967 // and insert a key. Whether the descender's key belongs in the
3968 // left half (now in `child`) or the right half (which will be
3969 // in the new sibling) is determined by the parent's split key —
3970 // but the parent doesn't know about the split key yet, so the
3971 // descender's routing decision is based on stale data. If the
3972 // descender's key falls in the right half, it lands in `child`
3973 // (left half) where a future search will not find it: the
3974 // future search descends from the root, the parent now has the
3975 // sibling installed, the search routes the key to the sibling,
3976 // the sibling does not contain the key — silently lost.
3977 //
3978 // - Holding `parent.write()` throughout serialises split_child
3979 // against every descender that wants `parent.read()`. A
3980 // descender already holding `parent.read()` (latch coupling
3981 // from above) keeps split_child waiting at this lock until it
3982 // has finished its own work. Combined, the split + sibling
3983 // install is atomic with respect to descents.
3984 //
3985 // - Splits are infrequent compared to inserts (~ once per
3986 // max_entries new keys) so the extra serialisation here does
3987 // not dominate.
3988 //
3989 // Reproducer that exercises this race:
3990 // crates/noxu-db/tests/concurrent_commits_stress.rs.
3991 let mut parent_write_guard = parent.write();
3992
3993 // Extract the child Arc from the parent slot.
3994 let child_arc = match &*parent_write_guard {
3995 TreeNode::Internal(p) => {
3996 p.get_child(child_index).ok_or(TreeError::SplitRequired)?
3997 }
3998 TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
3999 };
4000
4001 // Gather all entries from the child plus split metadata, AND
4002 // perform the in-place left-half install, all under a single
4003 // write lock on the child. See the earlier comment on the race
4004 // this avoids inside split_child.
4005 let mut child_guard = child_arc.write();
4006 let child_level = child_guard.level();
4007 // St-H6: capture the splitting BIN's expiration_in_hours flag BEFORE
4008 // drop(child_guard) so the right-half sibling inherits it.
4009 // JE: BIN.java::setExpiration calls setExpirationInHours(hours) to
4010 // propagate the flag on split/clone; the Rust split was hardcoding
4011 // false instead of inheriting — this caused hours-granularity TTL
4012 // entries in the right sibling to be read with in_hours=false, making
4013 // the hours-since-epoch value compare as seconds-since-epoch (far in
4014 // the past) and every right-sibling TTL record appear expired.
4015 let bin_expiration_in_hours: bool = match &*child_guard {
4016 TreeNode::Bottom(b) => b.expiration_in_hours,
4017 // Internal nodes do not carry per-entry TTL; default to true
4018 // (the engine-wide invariant for any BIN that may hold TTL data).
4019 TreeNode::Internal(_) => true,
4020 };
4021 // T-2/T-5: the compact-key threshold the new sibling BIN inherits.
4022 // (Only consumed when the child is a BIN; an upper-IN split produces
4023 // upper-IN siblings, which have no compact key rep.)
4024 let bin_compact_max_key_length: i32 = match &*child_guard {
4025 TreeNode::Bottom(b) => b.compact_max_key_length,
4026 TreeNode::Internal(_) => INKeyRep_DEFAULT_MAX_KEY_LENGTH,
4027 };
4028 let (all_entries, bin_old_prefix) = match &*child_guard {
4029 TreeNode::Internal(n) => {
4030 // T-4: capture the parallel resident-child array alongside the
4031 // entries so children travel with their slots through the
4032 // split (JE `IN.split` copies `entryTargets`).
4033 let children: Vec<Option<ChildArc>> =
4034 (0..n.entries.len()).map(|i| n.get_child(i)).collect();
4035 // T-3: capture the parallel per-slot LSNs so they travel with
4036 // their slots (JE `IN.split` copies `entryLsnByteArray`).
4037 let lsns: Vec<Lsn> =
4038 (0..n.entries.len()).map(|i| n.get_lsn(i)).collect();
4039 (
4040 SplitEntries::Internal(n.entries.clone(), children, lsns),
4041 Vec::new(),
4042 )
4043 }
4044 TreeNode::Bottom(b) => {
4045 // Decompress to full keys.
4046 let full: Vec<BinEntry> = (0..b.entries.len())
4047 .map(|i| BinEntry {
4048 data: b.entries[i].data.clone(),
4049 known_deleted: b.entries[i].known_deleted,
4050 dirty: b.entries[i].dirty,
4051 expiration_time: b.entries[i].expiration_time,
4052 })
4053 .collect();
4054 let lsns: Vec<Lsn> =
4055 (0..b.entries.len()).map(|i| b.get_lsn(i)).collect();
4056 // T-2: carry FULL keys through the split; the new BINs
4057 // recompute their own prefix from them.
4058 let full_keys: Vec<Vec<u8>> = (0..b.entries.len())
4059 .map(|i| b.get_full_key(i).unwrap_or_default())
4060 .collect();
4061 (
4062 SplitEntries::Bottom(full, lsns, full_keys),
4063 b.key_prefix.clone(),
4064 )
4065 }
4066 };
4067
4068 // Determine split point — JE `IN.splitSpecial` / `IN.splitInternal`.
4069 //
4070 // Normal midpoint: `n_entries / 2`.
4071 // AllLeft: insertion key is at position 0 on every descend level.
4072 // → split_index = 1 (left half keeps n-1 entries; new right sibling
4073 // gets only the former-first slot, then the insertion fills it).
4074 // This matches JE: `if (leftSide && index == 0) splitInternal(…, 1)`.
4075 // AllRight: insertion key is at the last position on every level.
4076 // → split_index = n_entries - 1 (left half keeps all but one entry).
4077 // JE: `else if (!leftSide && index == nEntries-1) splitInternal(…, nEntries-1)`.
4078 //
4079 // Ref: `IN.java` splitSpecial ~line 4129, splitInternal ~line 4159.
4080 let n_entries = all_entries.len();
4081 let split_index = if n_entries >= 2 {
4082 // Find where insert_key falls in the child.
4083 let insert_idx = {
4084 let mut idx = 0usize;
4085 for i in 1..n_entries {
4086 let ord = match key_comparator {
4087 Some(cmp) => cmp(all_entries.get_key(i), insert_key),
4088 None => all_entries.get_key(i).cmp(insert_key),
4089 };
4090 if ord != std::cmp::Ordering::Greater {
4091 idx = i;
4092 } else {
4093 break;
4094 }
4095 }
4096 idx
4097 };
4098 match hint {
4099 SplitHint::AllLeft if insert_idx == 0 => 1,
4100 SplitHint::AllRight if insert_idx == n_entries - 1 => {
4101 n_entries - 1
4102 }
4103 _ => n_entries / 2,
4104 }
4105 } else {
4106 n_entries / 2
4107 };
4108
4109 // newIdKey — the full key of the first entry of the right half.
4110 // For BIN: entries are already full keys after decompression above.
4111 // For IN: entries carry full keys directly.
4112 let new_id_key = all_entries.get_key(split_index).to_vec();
4113 // Suppress unused-variable warning when no BIN is involved.
4114 let _ = &bin_old_prefix;
4115
4116 // Divide into left and right halves.
4117 let left_entries = all_entries.slice(0, split_index);
4118 let right_entries = all_entries.slice(split_index, n_entries);
4119
4120 // Install the left half into `child_arc` (still under the same
4121 // write lock) and mark the node dirty.
4122 match (&mut *child_guard, &left_entries) {
4123 (TreeNode::Internal(n), SplitEntries::Internal(le, lc, ll)) => {
4124 n.entries = le.clone();
4125 // T-4: reinstall the (now-shorter) left child array.
4126 n.targets = TargetRep::None;
4127 for (i, c) in lc.iter().enumerate() {
4128 if let Some(child) = c {
4129 n.set_child(i, Some(child.clone()));
4130 }
4131 }
4132 // T-3: reinstall the (now-shorter) left LSN array.
4133 n.lsn_rep = LsnRep::from_lsns(ll);
4134 }
4135 (TreeNode::Bottom(b), SplitEntries::Bottom(le, ll, lk)) => {
4136 // Reset prefix; keys arrive as FULL keys (no prefix yet).
4137 b.key_prefix = Vec::new();
4138 // Pre-allocate at max_entries capacity so the left half
4139 // does not need to reallocate on the next insert (Fix 3).
4140 let mut left = Vec::with_capacity(max_entries);
4141 left.extend_from_slice(le);
4142 b.entries = left;
4143 // T-3: reinstall the left LSN array.
4144 b.lsn_rep = LsnRep::from_lsns(ll);
4145 // T-2: reinstall the left key rep from the full keys (Default;
4146 // recompute_key_prefix below compresses + compacts).
4147 b.keys = KeyRep::from_keys(lk.clone());
4148 // Recompute prefix on each half after split (only when
4149 // key_prefixing is enabled for this database).
4150 // JE: IN.computeKeyPrefix returns null when
4151 // databaseImpl.getKeyPrefixing() is false.
4152 // Ref: IN.java computeKeyPrefix ~line 2456.
4153 if key_prefixing && b.entries.len() >= 2 {
4154 b.recompute_key_prefix();
4155 } else {
4156 b.keys.compact(b.compact_max_key_length); // T-2
4157 }
4158 }
4159 _ => return Err(TreeError::SplitRequired),
4160 }
4161 child_guard.set_dirty(true);
4162 drop(child_guard);
4163
4164 // Create the new right-half sibling.
4165 // Parent pointer will be wired in when it is inserted into the parent.
4166 let new_sibling = match right_entries {
4167 SplitEntries::Internal(re, rc, rl) => {
4168 let mut rin = InNodeStub {
4169 node_id: generate_node_id(),
4170 level: child_level,
4171 entries: re,
4172 targets: TargetRep::None,
4173 dirty: true,
4174 generation: 0,
4175 parent: None, // set below
4176 // T-3: the right half's per-slot LSNs.
4177 lsn_rep: LsnRep::from_lsns(&rl),
4178 };
4179 // T-4: install the right half's resident children.
4180 for (i, c) in rc.into_iter().enumerate() {
4181 if c.is_some() {
4182 rin.set_child(i, c);
4183 }
4184 }
4185 Arc::new(RwLock::new(TreeNode::Internal(rin)))
4186 }
4187 SplitEntries::Bottom(re, rl, rk) => {
4188 // Entries arrive as FULL keys; build BinStub with no prefix
4189 // then recompute key prefix for the new sibling.
4190 // Pre-allocate at max_entries capacity so the right half
4191 // does not need to reallocate on the next insert (Fix 3).
4192 let mut right = Vec::with_capacity(max_entries);
4193 right.extend(re);
4194 let mut sibling_bin = BinStub {
4195 node_id: generate_node_id(),
4196 level: child_level,
4197 entries: right,
4198 key_prefix: Vec::new(),
4199 dirty: true,
4200 is_delta: false,
4201 last_full_lsn: NULL_LSN,
4202 last_delta_lsn: NULL_LSN,
4203 generation: 0,
4204 parent: None, // set below
4205 // St-H6 fix: inherit the splitting BIN's flag so that
4206 // is_expired() uses the correct granularity for entries
4207 // that were already in the BIN before the split.
4208 // JE reference: BIN.java::split() propagates
4209 // expirationInHours via setExpirationInHours(hours).
4210 expiration_in_hours: bin_expiration_in_hours,
4211 cursor_count: 0,
4212 prohibit_next_delta: false,
4213 // T-3: the right half's per-slot LSNs.
4214 lsn_rep: LsnRep::from_lsns(&rl),
4215 // T-2: full keys (Default); recompute/compact below.
4216 keys: KeyRep::from_keys(rk),
4217 compact_max_key_length: bin_compact_max_key_length,
4218 };
4219 // St-H6 debug guard: the sibling must carry the same flag as
4220 // the splitting BIN so that in_hours-resolution entries are
4221 // never silently expired by a mismatched false flag.
4222 debug_assert_eq!(
4223 sibling_bin.expiration_in_hours, bin_expiration_in_hours,
4224 "St-H6 invariant: sibling BIN expiration_in_hours must \
4225 match the splitting BIN (got {}, expected {})",
4226 sibling_bin.expiration_in_hours, bin_expiration_in_hours
4227 );
4228
4229 if key_prefixing && sibling_bin.entries.len() >= 2 {
4230 sibling_bin.recompute_key_prefix();
4231 } else {
4232 sibling_bin.keys.compact(bin_compact_max_key_length); // T-2
4233 }
4234 Arc::new(RwLock::new(TreeNode::Bottom(sibling_bin)))
4235 }
4236 };
4237
4238 // Note: the child (left half) was marked dirty earlier under the
4239 // same write lock that installed left_entries; no need to re-take
4240 // the write lock here.
4241
4242 // Insert the new sibling into the parent after child_index.
4243 // We already hold `parent.write()` (taken at the top of the
4244 // function); operate on it directly rather than re-acquiring.
4245 match &mut *parent_write_guard {
4246 TreeNode::Internal(p) => {
4247 let insert_pos = child_index + 1;
4248 // T-4: insert the parent slot and set its cached child via the
4249 // node-level INTargetRep (shifting existing children).
4250 p.insert_entry(
4251 insert_pos,
4252 new_id_key,
4253 lsn,
4254 Some(new_sibling.clone()),
4255 );
4256 // Parent is dirty because it gained a new entry.
4257 p.dirty = true;
4258 }
4259 TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
4260 }
4261
4262 // Wire the new sibling's parent pointer to the parent node
4263 // before releasing parent_write_guard, so a future descent that
4264 // takes parent.read() and finds the sibling immediately sees a
4265 // fully-wired parent pointer.
4266 {
4267 let mut g = new_sibling.write();
4268 g.set_parent(Some(Arc::downgrade(parent)));
4269 }
4270 // T-4: when an upper IN split, the children that moved into the new
4271 // sibling must have their parent back-pointers re-wired to the
4272 // sibling (JE re-parents moved targets in IN.split).
4273 {
4274 let sg = new_sibling.read();
4275 if let TreeNode::Internal(sn) = &*sg {
4276 let moved = sn.resident_children();
4277 drop(sg);
4278 for child in moved {
4279 let mut cg = child.write();
4280 cg.set_parent(Some(Arc::downgrade(&new_sibling)));
4281 }
4282 }
4283 }
4284 drop(parent_write_guard);
4285
4286 // EVICTOR-RECLAIM-1: register the freshly-split sibling with the
4287 // evictor's LRU (JE IN.splitInternal calls inList.add(newSibling)).
4288 // Without this, split-created BINs/INs are invisible to the evictor:
4289 // the policy lists never receive them, every evict_batch phase quota
4290 // is 0, and eviction reclaims nothing under pressure even though the
4291 // nodes are fully resident. Only the very first root+BIN (the
4292 // first-key path) and re-fetched nodes were ever registered.
4293 if let Some(l) = listener {
4294 let sibling_id = match &*new_sibling.read() {
4295 TreeNode::Internal(n) => n.node_id,
4296 TreeNode::Bottom(b) => b.node_id,
4297 };
4298 l.note_ins_added(sibling_id);
4299 }
4300
4301 Ok(())
4302 }
4303
4304 /// Recursive insert with preemptive splitting.
4305 ///
4306 /// Top-down traversal in `Tree.forceSplit` +
4307 /// `Tree.searchSplitsAllowed`:
4308 ///
4309 /// 1. At an upper IN: find which child slot covers `key`, split the child
4310 /// proactively if it is full (so we always have room to insert the split
4311 /// key into the parent), then recurse into the appropriate child.
4312 /// 2. At a BIN: insert the key/data directly.
4313 ///
4314 /// This implements the "preemptive splitting" strategy from the: we split
4315 /// children on the way down so we never need to walk back up.
4316 fn insert_recursive(
4317 node_arc: &Arc<RwLock<TreeNode>>,
4318 key: Vec<u8>,
4319 data: Vec<u8>,
4320 lsn: Lsn,
4321 max_entries: usize,
4322 key_comparator: Option<&KeyComparatorFn>,
4323 key_prefixing: bool,
4324 listener: Option<&Arc<dyn InListListener>>,
4325 ) -> Result<bool, TreeError> {
4326 Self::insert_recursive_inner(
4327 node_arc,
4328 key,
4329 data,
4330 lsn,
4331 max_entries,
4332 key_comparator,
4333 key_prefixing,
4334 true, // all_left_so_far
4335 true, // all_right_so_far
4336 listener,
4337 )
4338 }
4339
4340 /// Inner recursive helper that threads `allLeftSideDescent` /
4341 /// `allRightSideDescent` from `Tree.forceSplit` (JE ~line 1912).
4342 ///
4343 /// Both flags start `true` at the root and are cleared as soon as the
4344 /// descent takes a non-leftmost / non-rightmost child slot. At split
4345 /// time they are forwarded to `split_child` which uses them to pick the
4346 /// `splitSpecial` split index (JE `IN.splitSpecial` ~line 4129).
4347 #[allow(clippy::too_many_arguments)]
4348 fn insert_recursive_inner(
4349 node_arc: &Arc<RwLock<TreeNode>>,
4350 key: Vec<u8>,
4351 data: Vec<u8>,
4352 lsn: Lsn,
4353 max_entries: usize,
4354 key_comparator: Option<&KeyComparatorFn>,
4355 key_prefixing: bool,
4356 all_left_so_far: bool,
4357 all_right_so_far: bool,
4358 listener: Option<&Arc<dyn InListListener>>,
4359 ) -> Result<bool, TreeError> {
4360 // Determine if this is a BIN (leaf level).
4361 //
4362 // We hold a read lock on `node_arc` (the parent of any descent we
4363 // do below) for the duration of this call, releasing it just
4364 // before returning. That achieves *latch coupling*: a concurrent
4365 // `split_child(parent, …)` that wants to reorganise our subtree
4366 // ultimately needs `parent.write()` to install the new sibling,
4367 // and that write blocks until our read lock is dropped. Without
4368 // this, the descender-vs-splitter race goes:
4369 //
4370 // T_X: at root, picks child_arc (BIN), drops root read lock.
4371 // T_Y: at root, runs split_child(root, …): takes child_arc.write(),
4372 // installs left half [E1..E5], creates sibling [E6..E10],
4373 // takes root.write() and inserts the sibling.
4374 // T_X: now takes child_arc.write() and inserts a key whose
4375 // sort order falls in the right half. The key lands in
4376 // child_arc (left half) but a future search descending
4377 // from the root routes that key to the new sibling and
4378 // does not find it — silently lost.
4379 //
4380 // Reproducer: noxu-db/tests/concurrent_commits_stress.rs
4381 // (32 threads × 100 keys, ~1–6 lost writes per run before this fix;
4382 // occasionally hundreds when an entire BIN is orphaned).
4383 let parent_guard = node_arc.read();
4384 let is_bin = parent_guard.is_bin();
4385
4386 if is_bin {
4387 // BIN: drop the read lock and take the write lock; this is
4388 // safe because the *outer* call frame still holds a read
4389 // lock on this BIN's parent (or this is the root, in which
4390 // case the first-key path has already initialised it). A
4391 // concurrent split_child(parent, …) cannot run while the
4392 // outer parent.read() is held, so the BIN cannot be
4393 // restructured between dropping our read lock and acquiring
4394 // our write lock.
4395 drop(parent_guard);
4396 let mut guard = node_arc.write();
4397 match &mut *guard {
4398 TreeNode::Bottom(bin) => {
4399 let is_new = if let Some(cmp) = key_comparator {
4400 // Comparator-based insert: no prefix compression.
4401 let (_idx, new) =
4402 bin.insert_cmp(key, lsn, Some(data), cmp.as_ref());
4403 new
4404 } else if key_prefixing {
4405 // insert_with_prefix handles prefix recomputation when
4406 // the new key shrinks the existing prefix, and also
4407 // initialises the prefix when 2 entries are present for
4408 // the first time.
4409 let (_idx, new) =
4410 bin.insert_with_prefix(key, lsn, Some(data));
4411 new
4412 } else {
4413 // key_prefixing disabled: store full key, no prefix.
4414 // JE: IN.computeKeyPrefix returns null when
4415 // databaseImpl.getKeyPrefixing() is false.
4416 // Ref: IN.java computeKeyPrefix ~line 2456.
4417 let (_idx, new) = bin.insert_raw(key, lsn, Some(data));
4418 new
4419 };
4420 // Mark dirty after any modification.
4421 bin.dirty = true;
4422 Ok(is_new)
4423 }
4424 TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4425 }
4426 } else {
4427 // Upper IN: find the child slot that covers key.
4428 // Index = parent.findEntry(key, false, false)
4429 // Entry zero in an upper IN has a virtual key (-infinity), so
4430 // any real key is routed to at least slot 0.
4431 let (child_index, n_entries_at_level, child_arc) =
4432 match &*parent_guard {
4433 TreeNode::Internal(n) => {
4434 // Binary search for the largest key <= search key.
4435 // Slot 0 always matches (virtual key = -infinity).
4436 let mut idx = 0usize;
4437 for (i, entry) in n.entries.iter().enumerate() {
4438 if i == 0 {
4439 idx = 0;
4440 } else {
4441 let ord = match key_comparator {
4442 Some(cmp) => cmp(
4443 entry.key.as_slice(),
4444 key.as_slice(),
4445 ),
4446 None => {
4447 entry.key.as_slice().cmp(key.as_slice())
4448 }
4449 };
4450 if ord != std::cmp::Ordering::Greater {
4451 idx = i;
4452 } else {
4453 break;
4454 }
4455 }
4456 }
4457 let child =
4458 n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4459 (idx, n.entries.len(), child)
4460 }
4461 TreeNode::Bottom(_) => {
4462 return Err(TreeError::SplitRequired);
4463 }
4464 };
4465
4466 // Update the descent-side flags (JE `Tree.forceSplit` ~1959).
4467 // `allLeftSideDescent` ← still true only if we chose slot 0.
4468 // `allRightSideDescent` ← still true only if we chose the last slot.
4469 let all_left = all_left_so_far && child_index == 0;
4470 let all_right = all_right_so_far
4471 && child_index == n_entries_at_level.saturating_sub(1);
4472
4473 // Proactively split the child if it is full.
4474 // If (child.needsSplitting()) child.split(parent, ...)
4475 let child_full = {
4476 let g = child_arc.read();
4477 g.get_n_entries() >= max_entries
4478 };
4479
4480 if child_full {
4481 // Build the splitSpecial hint from the accumulated flags.
4482 // JE `Tree.forceSplit` ~line 2010:
4483 // if (allLeftSideDescent || allRightSideDescent)
4484 // child.splitSpecial(parent, index, grandParent,
4485 // maxTreeEntriesPerNode, key, allLeftSideDescent)
4486 let hint = match (all_left, all_right) {
4487 (true, _) => SplitHint::AllLeft,
4488 (_, true) => SplitHint::AllRight,
4489 _ => SplitHint::Normal,
4490 };
4491 // split_child(parent, …) needs parent.write(); we must
4492 // drop our parent read lock before calling it.
4493 drop(parent_guard);
4494 Self::split_child(
4495 node_arc,
4496 child_index,
4497 max_entries,
4498 lsn,
4499 hint,
4500 &key,
4501 key_comparator,
4502 key_prefixing,
4503 listener,
4504 )?;
4505
4506 // After the split, re-find which child now covers key.
4507 // Re-enter at the top of the inner function; carry the
4508 // flags (the new topology doesn't invalidate them — we
4509 // still know the overall descent direction).
4510 return Self::insert_recursive_inner(
4511 node_arc,
4512 key,
4513 data,
4514 lsn,
4515 max_entries,
4516 key_comparator,
4517 key_prefixing,
4518 all_left_so_far,
4519 all_right_so_far,
4520 listener,
4521 );
4522 }
4523
4524 // Descend into the child while still holding parent_guard.
4525 // The recursive call will hold child.read() before this
4526 // returns, then drop it; combined with our parent_guard,
4527 // the latch coupling chain is preserved on the way down and
4528 // unwound on the way back up.
4529 let r = Self::insert_recursive_inner(
4530 &child_arc,
4531 key,
4532 data,
4533 lsn,
4534 max_entries,
4535 key_comparator,
4536 key_prefixing,
4537 all_left,
4538 all_right,
4539 listener,
4540 );
4541 drop(parent_guard);
4542 r
4543 }
4544 }
4545
4546 /// Slice-based variant of [`Tree::insert_recursive`] for the recovery redo path.
4547 ///
4548 /// Accepts `key: &[u8]` and `data: Option<&[u8]>` instead of owned
4549 /// `Vec<u8>` values. At the BIN leaf, calls
4550 /// [`BinStub::insert_with_prefix_slice`] which copies bytes into the
4551 /// `BinEntry` exactly once.
4552 ///
4553 /// For the comparator path (custom key comparator), falls back to
4554 /// `insert_cmp` with a one-time `to_vec()` conversion — that path is
4555 /// rare in practice (sorted-dup databases only) and is not on the
4556 /// W11 hot path.
4557 ///
4558 /// Wave 11-K optimisation (Fix 1).
4559 fn redo_insert_recursive(
4560 node_arc: &Arc<RwLock<TreeNode>>,
4561 key: &[u8],
4562 data: Option<&[u8]>,
4563 lsn: Lsn,
4564 max_entries: usize,
4565 key_comparator: Option<&KeyComparatorFn>,
4566 key_prefixing: bool,
4567 ) -> Result<bool, TreeError> {
4568 Self::redo_insert_recursive_inner(
4569 node_arc,
4570 key,
4571 data,
4572 lsn,
4573 max_entries,
4574 key_comparator,
4575 key_prefixing,
4576 true,
4577 true,
4578 )
4579 }
4580
4581 #[allow(clippy::too_many_arguments)]
4582 fn redo_insert_recursive_inner(
4583 node_arc: &Arc<RwLock<TreeNode>>,
4584 key: &[u8],
4585 data: Option<&[u8]>,
4586 lsn: Lsn,
4587 max_entries: usize,
4588 key_comparator: Option<&KeyComparatorFn>,
4589 key_prefixing: bool,
4590 all_left_so_far: bool,
4591 all_right_so_far: bool,
4592 ) -> Result<bool, TreeError> {
4593 let parent_guard = node_arc.read();
4594 let is_bin = parent_guard.is_bin();
4595
4596 if is_bin {
4597 drop(parent_guard);
4598 let mut guard = node_arc.write();
4599 match &mut *guard {
4600 TreeNode::Bottom(bin) => {
4601 // REC-F2: JE redo currency check
4602 // (RecoveryManager.redo() line ~2512/2544). A logged LN
4603 // is applied only when logrecLsn > treeLsn. If the slot
4604 // already holds an equal-or-newer LSN, skip the overwrite
4605 // so an out-of-order (older-LSN) redo cannot revert
4606 // committed data or reset the slot LSN backward. This
4607 // makes redo genuinely idempotent regardless of
4608 // redo/undo phase order. Deletes never reach this path
4609 // (redo_ln routes Delete through tree.delete), so the JE
4610 // "lsnCmp == 0 && isDeletion -> set KD" sub-case does not
4611 // apply here.
4612 let cmp_ref = key_comparator.map(|c| {
4613 c.as_ref()
4614 as &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering
4615 });
4616 if let Some(slot_lsn) =
4617 bin.redo_slot_lsn(key, cmp_ref, key_prefixing)
4618 && lsn <= slot_lsn
4619 {
4620 // Tree already holds an equal-or-newer version.
4621 return Ok(false);
4622 }
4623 let is_new = if let Some(cmp) = key_comparator {
4624 // Comparator path: fall back to owned-Vec variant.
4625 let (_idx, new) = bin.insert_cmp(
4626 key.to_vec(),
4627 lsn,
4628 data.map(|d| d.to_vec()),
4629 cmp.as_ref(),
4630 );
4631 new
4632 } else if key_prefixing {
4633 let (_idx, new) =
4634 bin.insert_with_prefix_slice(key, lsn, data);
4635 new
4636 } else {
4637 // key_prefixing disabled: store full key verbatim.
4638 // Ref: IN.java computeKeyPrefix ~line 2456.
4639 let (_idx, new) = bin.insert_raw(
4640 key.to_vec(),
4641 lsn,
4642 data.map(|d| d.to_vec()),
4643 );
4644 new
4645 };
4646 bin.dirty = true;
4647 Ok(is_new)
4648 }
4649 TreeNode::Internal(_) => Err(TreeError::SplitRequired),
4650 }
4651 } else {
4652 let (child_index, n_entries_at_level, child_arc) =
4653 match &*parent_guard {
4654 TreeNode::Internal(n) => {
4655 let mut idx = 0usize;
4656 for (i, entry) in n.entries.iter().enumerate() {
4657 if i == 0 {
4658 idx = 0;
4659 } else {
4660 let ord = match key_comparator {
4661 Some(cmp) => cmp(entry.key.as_slice(), key),
4662 None => entry.key.as_slice().cmp(key),
4663 };
4664 if ord != std::cmp::Ordering::Greater {
4665 idx = i;
4666 } else {
4667 break;
4668 }
4669 }
4670 }
4671 let child =
4672 n.get_child(idx).ok_or(TreeError::SplitRequired)?;
4673 (idx, n.entries.len(), child)
4674 }
4675 TreeNode::Bottom(_) => {
4676 return Err(TreeError::SplitRequired);
4677 }
4678 };
4679
4680 let all_left = all_left_so_far && child_index == 0;
4681 let all_right = all_right_so_far
4682 && child_index == n_entries_at_level.saturating_sub(1);
4683
4684 let child_full = {
4685 let g = child_arc.read();
4686 g.get_n_entries() >= max_entries
4687 };
4688
4689 if child_full {
4690 let hint = match (all_left, all_right) {
4691 (true, _) => SplitHint::AllLeft,
4692 (_, true) => SplitHint::AllRight,
4693 _ => SplitHint::Normal,
4694 };
4695 drop(parent_guard);
4696 Self::split_child(
4697 node_arc,
4698 child_index,
4699 max_entries,
4700 lsn,
4701 hint,
4702 key,
4703 key_comparator,
4704 key_prefixing,
4705 // Recovery redo path: the listener is not active during
4706 // log replay (the evictor is wired AFTER recovery, and
4707 // the INList is rebuilt separately). EVICTOR-RECLAIM-1
4708 // registration happens on the live insert path.
4709 None,
4710 )?;
4711 return Self::redo_insert_recursive_inner(
4712 node_arc,
4713 key,
4714 data,
4715 lsn,
4716 max_entries,
4717 key_comparator,
4718 key_prefixing,
4719 all_left_so_far,
4720 all_right_so_far,
4721 );
4722 }
4723
4724 let r = Self::redo_insert_recursive_inner(
4725 &child_arc,
4726 key,
4727 data,
4728 lsn,
4729 max_entries,
4730 key_comparator,
4731 key_prefixing,
4732 all_left,
4733 all_right,
4734 );
4735 drop(parent_guard);
4736 r
4737 }
4738 }
4739
4740 /// Pre-warm the tree's internal `Vec<BinEntry>` capacity before a redo
4741 /// pass that will insert approximately `n` records.
4742 ///
4743 /// If the tree is empty, this is a no-op (there is no BIN yet to reserve
4744 /// capacity on). If the tree already has a root BIN (from a previous
4745 /// checkpoint), reserves `n.min(max_entries_per_node)` additional slots
4746 /// in that BIN's entries vector, eliminating the resize-double cycle
4747 /// during the redo loop.
4748 ///
4749 /// Wave 11-K optimisation (Fix 3).
4750 pub fn reserve_redo_capacity(&self, n: usize) {
4751 if n == 0 {
4752 return;
4753 }
4754 let root = match self.get_root() {
4755 Some(r) => r,
4756 None => return,
4757 };
4758 // Descend to the leftmost BIN and reserve there.
4759 let mut arc = root;
4760 loop {
4761 let guard = arc.read();
4762 match &*guard {
4763 TreeNode::Bottom(bin_guard) => {
4764 let additional = n
4765 .min(self.max_entries_per_node)
4766 .saturating_sub(bin_guard.entries.len());
4767 drop(guard);
4768 let mut wguard = arc.write();
4769 if let TreeNode::Bottom(bin) = &mut *wguard {
4770 bin.entries.reserve(additional);
4771 }
4772 return;
4773 }
4774 TreeNode::Internal(inner) => {
4775 let child = inner.get_child(0);
4776 drop(guard);
4777 match child {
4778 Some(c) => arc = c,
4779 None => return,
4780 }
4781 }
4782 }
4783 }
4784 }
4785
4786 /// Get the first (leftmost) BIN in the tree.
4787 ///
4788 /// Descends to the leftmost BIN by
4789 /// always following the first child slot at each upper IN level.
4790 pub fn get_first_node(&self) -> Option<SearchResult> {
4791 let mut guard: parking_lot::ArcRwLockReadGuard<
4792 parking_lot::RawRwLock,
4793 TreeNode,
4794 > = self.get_root()?.read_arc();
4795
4796 loop {
4797 if guard.is_bin() {
4798 let n = guard.get_n_entries();
4799 if n == 0 {
4800 return None;
4801 }
4802 // TREE-F1: return the first LIVE slot, skipping known_deleted
4803 // slots (CursorImpl.java:2062-2064). If the leftmost BIN is
4804 // entirely KD during the reconstitution window the cursor's
4805 // get_first falls through to its cross-BIN advance.
4806 if let TreeNode::Bottom(b) = &*guard {
4807 match (0..b.entries.len()).find(|&i| b.slot_is_live(i)) {
4808 Some(i) => {
4809 return Some(SearchResult::with_values(
4810 true, i as i32, false,
4811 ));
4812 }
4813 None => return None,
4814 }
4815 }
4816 return Some(SearchResult::with_values(true, 0, false));
4817 }
4818
4819 // Capture the leftmost child Arc while holding `guard`, then
4820 // hand-over-hand: take the child read lock before releasing
4821 // the parent's. Same race fix as `Tree::search`.
4822 let next_arc = match &*guard {
4823 TreeNode::Internal(n_node) => n_node.get_child(0)?,
4824 _ => return None,
4825 };
4826 let next_guard = next_arc.read_arc();
4827 drop(guard);
4828 guard = next_guard;
4829 }
4830 }
4831
4832 /// Get the last (rightmost) BIN in the tree.
4833 ///
4834 /// Descends to the rightmost BIN by
4835 /// always following the last child slot at each upper IN level.
4836 pub fn get_last_node(&self) -> Option<SearchResult> {
4837 let mut guard: parking_lot::ArcRwLockReadGuard<
4838 parking_lot::RawRwLock,
4839 TreeNode,
4840 > = self.get_root()?.read_arc();
4841
4842 loop {
4843 if guard.is_bin() {
4844 let n = guard.get_n_entries();
4845 if n == 0 {
4846 return None;
4847 }
4848 // TREE-F1: return the last LIVE slot, skipping known_deleted
4849 // slots (CursorImpl.java:2062-2064).
4850 if let TreeNode::Bottom(b) = &*guard {
4851 match (0..b.entries.len())
4852 .rev()
4853 .find(|&i| b.slot_is_live(i))
4854 {
4855 Some(i) => {
4856 return Some(SearchResult::with_values(
4857 true, i as i32, false,
4858 ));
4859 }
4860 None => return None,
4861 }
4862 }
4863 return Some(SearchResult::with_values(
4864 true,
4865 (n - 1) as i32,
4866 false,
4867 ));
4868 }
4869
4870 // Capture the rightmost child Arc while holding `guard`, then
4871 // hand-over-hand: take the child read lock before releasing
4872 // the parent's. Same race fix as `Tree::search`.
4873 let next_arc = match &*guard {
4874 TreeNode::Internal(n_node) => {
4875 n_node.get_child(n_node.entries.len().saturating_sub(1))?
4876 }
4877 _ => return None,
4878 };
4879 let next_guard = next_arc.read_arc();
4880 drop(guard);
4881 guard = next_guard;
4882 }
4883 }
4884
4885 /// Returns the number of root splits that have occurred.
4886 pub fn get_root_splits(&self) -> u64 {
4887 self.root_splits.load(Ordering::Relaxed)
4888 }
4889
4890 /// Returns the number of relatches required.
4891 pub fn get_relatches_required(&self) -> u64 {
4892 self.relatches_required.load(Ordering::Relaxed)
4893 }
4894
4895 /// Delete a key from the tree.
4896 ///
4897 /// Traverses the tree to find the BIN that should contain the key, then
4898 /// removes the entry. Returns true if the key was found and removed.
4899 ///
4900 /// Delete path in `Tree` from the.
4901 ///
4902 /// In-memory removal only — WAL logging for deletes is handled by the
4903 /// cursor layer (`cursor_impl.rs::log_ln_write`) before this is called,
4904 /// matching separation between LN logging and tree mutation.
4905 pub fn delete(&self, key: &[u8]) -> bool {
4906 let root = match self.get_root() {
4907 Some(r) => r,
4908 None => return false,
4909 };
4910
4911 // F8 consistency: insert accounts key + data + BIN_ENTRY_OVERHEAD; delete must
4912 // subtract the SAME (data_len was previously omitted, leaking
4913 // data_len from the cache counter on every delete and biasing the
4914 // evictor's over-budget view). Peek the data length before deleting.
4915 let data_len = if self.memory_counter.is_some() {
4916 self.search_with_data(key)
4917 .filter(|sf| sf.found)
4918 .and_then(|sf| sf.data.as_ref().map(|d| d.len()))
4919 .unwrap_or(0)
4920 } else {
4921 0
4922 };
4923
4924 let deleted =
4925 Self::delete_recursive(&root, key, self.key_comparator.as_ref());
4926
4927 // Update the memory counter when an entry is removed.
4928 // IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage(-delta).
4929 if deleted && let Some(counter) = &self.memory_counter {
4930 let delta = (key.len() + data_len + BIN_ENTRY_OVERHEAD) as i64;
4931 counter.fetch_sub(delta, Ordering::Relaxed);
4932 }
4933
4934 deleted
4935 }
4936
4937 /// Recursive helper for `delete`: descend to the BIN that holds `key`
4938 /// and remove it.
4939 fn delete_recursive(
4940 node_arc: &Arc<RwLock<TreeNode>>,
4941 key: &[u8],
4942 key_comparator: Option<&KeyComparatorFn>,
4943 ) -> bool {
4944 // Latch coupling, mirroring `insert_recursive`. Without this,
4945 // delete has the same "BIN split out from under us" race: thread
4946 // A finds child_arc as the target BIN under parent.read(), drops
4947 // the lock, and another thread runs split_child(parent, …) that
4948 // moves the target key into the new sibling. A then takes
4949 // child_arc.write(), looks for the key in the (now left-half)
4950 // BIN, doesn't find it, and returns `false`. The caller treats
4951 // the `false` as "key was not present", but the key is actually
4952 // still in the tree (in the sibling). Subsequent operations
4953 // observe a stale record that should have been deleted —
4954 // semantically a lost delete.
4955 let parent_guard = node_arc.read();
4956 let is_bin = parent_guard.is_bin();
4957 let child_arc = if !is_bin {
4958 match &*parent_guard {
4959 TreeNode::Internal(n) => {
4960 // Find child slot with largest key <= search key
4961 let mut idx = 0usize;
4962 for (i, entry) in n.entries.iter().enumerate() {
4963 if i == 0 {
4964 idx = 0;
4965 } else {
4966 let ord = match key_comparator {
4967 Some(cmp) => cmp(entry.key.as_slice(), key),
4968 None => entry.key.as_slice().cmp(key),
4969 };
4970 if ord != std::cmp::Ordering::Greater {
4971 idx = i;
4972 } else {
4973 break;
4974 }
4975 }
4976 }
4977 n.get_child(idx)
4978 }
4979 _ => None,
4980 }
4981 } else {
4982 None
4983 };
4984
4985 if is_bin {
4986 // Drop the read lock before taking the write lock; the outer
4987 // call frame still holds the parent read lock so a concurrent
4988 // split_child cannot run on this BIN's parent until we unwind.
4989 drop(parent_guard);
4990 let mut g = node_arc.write();
4991 match &mut *g {
4992 TreeNode::Bottom(bin) => {
4993 if let Some(cmp) = key_comparator {
4994 bin.delete_cmp(key, cmp.as_ref())
4995 } else {
4996 // Entries store compressed (suffix) keys when key_prefix
4997 // is non-empty. Compress the search key before comparing.
4998 //
4999 // The caller is not required to ensure that `key`
5000 // shares this BIN's learned `key_prefix` — a stray
5001 // delete of a key that was never present (or that
5002 // sits under a different prefix) is legal and must
5003 // simply return `false`. Calling `compress_key`
5004 // unconditionally would `debug_assert!`-panic on
5005 // such inputs, so guard it the same way the cursor
5006 // path does.
5007 if !bin.key_prefix.is_empty()
5008 && !key.starts_with(bin.key_prefix.as_slice())
5009 {
5010 return false;
5011 }
5012 let suffix = bin.compress_key(key);
5013 match bin.key_binary_search(suffix.as_slice()) {
5014 Ok(idx) => {
5015 bin.entries.remove(idx);
5016 bin.keys.remove(idx); // T-2
5017 bin.lsn_rep.remove_shift(idx); // T-3
5018 // Mark dirty after any modification.
5019 bin.dirty = true;
5020 true
5021 }
5022 Err(_) => false,
5023 }
5024 }
5025 }
5026 _ => false,
5027 }
5028 } else {
5029 // Descend with parent_guard still held; the recursion will
5030 // hold its own read lock and drop ours after it returns.
5031 let r = match child_arc {
5032 Some(child) => {
5033 Self::delete_recursive(&child, key, key_comparator)
5034 }
5035 None => false,
5036 };
5037 drop(parent_guard);
5038 r
5039 }
5040 }
5041
5042 // ========================================================================
5043 // B-tree Merge / Compress
5044 // ========================================================================
5045
5046 /// Merge under-full sibling BIN pairs and remove empty subtrees.
5047 ///
5048 /// `INCompressor` / `Tree.compressInternal()` logic.
5049 ///
5050 /// merges two adjacent siblings when their combined entry count is
5051 /// ≤ `max_entries_per_node` (the merge threshold equal to the node
5052 /// capacity). The left sibling's entries are prepended into the right
5053 /// sibling; the parent key slot pointing at the left sibling is then
5054 /// removed from the parent IN with `deleteEntry`. If the parent IN
5055 /// becomes empty after the removal the process repeats recursively up
5056 /// the tree.
5057 ///
5058 /// This implementation performs a single post-order walk so that each
5059 /// level is compressed after all its children have been compressed.
5060 pub fn compress(&self) {
5061 let root = match self.get_root() {
5062 Some(r) => r,
5063 None => return,
5064 };
5065 Self::compress_node(&root, self.max_entries_per_node);
5066 }
5067
    /// Recursive post-order compress helper.
    ///
    /// Visits children first (post-order), then scans adjacent child
    /// pairs in the current IN and merges them when the merge condition
    /// holds: `left.n_entries + right.n_entries <= max_entries`.
    ///
    /// A merge prepends the left child's contents into the right child,
    /// empties the left child, re-points the parent's left slot (`i`) at the
    /// merged right child and removes the parent's right slot (`i + 1`), so
    /// the left slot's key is the one that survives.
    ///
    /// The scan over the level is repeated until a full pass performs no
    /// merge, so pairs that only become mergeable after an earlier merge are
    /// picked up by a later pass.
    ///
    /// Returns immediately when `node_arc` is a BIN (nothing below it to
    /// merge).
    ///
    /// NOTE(review): every latch is taken and released per step rather than
    /// held across a whole merge; this looks like it assumes no concurrent
    /// mutators while compression runs — confirm against callers.
    fn compress_node(node_arc: &Arc<RwLock<TreeNode>>, max_entries: usize) {
        // Collect child arcs to recurse without holding the node lock.
        let children: Vec<Arc<RwLock<TreeNode>>> = {
            let g = node_arc.read();
            match &*g {
                TreeNode::Internal(n) => n.resident_children(),
                // BINs are leaves; nothing to compress at this level.
                TreeNode::Bottom(_) => return,
            }
        };

        // Post-order: recurse into every child before working on this level.
        for child in &children {
            Self::compress_node(child, max_entries);
        }

        // Compress the current IN level: merge adjacent under-full children.
        // Repeat until a full pass produces no merges.
        loop {
            // Entry count at the start of this pass. It is not refreshed
            // after a merge; the explicit `updated_n` check at the bottom of
            // the scan and the `None` handling of `get_child` cover the
            // slots that disappear in the meantime.
            let n_entries = {
                let g = node_arc.read();
                g.get_n_entries()
            };

            let mut merged_any = false;

            // `i` is the index of the *left* candidate; right is at `i+1`.
            let mut i = 0usize;
            while i + 1 < n_entries {
                // Fetch left and right child arcs; a pair with a missing
                // child on either side is skipped.
                let (left_arc, right_arc) = {
                    let g = node_arc.read();
                    match &*g {
                        TreeNode::Internal(p) => {
                            let l = p.get_child(i);
                            let r = p.get_child(i + 1);
                            match (l, r) {
                                (Some(l), Some(r)) => (l, r),
                                _ => {
                                    i += 1;
                                    continue;
                                }
                            }
                        }
                        TreeNode::Bottom(_) => return,
                    }
                };

                let left_n = { left_arc.read().get_n_entries() };
                let right_n = { right_arc.read().get_n_entries() };

                // merge condition: combined count fits within one node.
                if left_n + right_n > max_entries {
                    i += 1;
                    continue;
                }

                // Determine node kind from left child.
                let left_is_bin = { left_arc.read().is_bin() };

                if left_is_bin {
                    // BIN merge: snapshot left's entries, LSNs and FULL
                    // (decompressed) keys, prepend them to the same three
                    // snapshots of the right BIN, install the result in the
                    // right BIN with an empty prefix, and finally recompute
                    // the merged BIN's prefix.

                    // Per-slot entry payloads of the left BIN (keys are
                    // captured separately below).
                    let left_full_entries: Vec<BinEntry> = {
                        {
                            let g = left_arc.read();
                            match &*g {
                                TreeNode::Bottom(b) => (0..b.entries.len())
                                    .map(|j| BinEntry {
                                        data: b.entries[j].data.clone(),
                                        known_deleted: b.entries[j]
                                            .known_deleted,
                                        dirty: b.entries[j].dirty,
                                        expiration_time: b.entries[j]
                                            .expiration_time,
                                    })
                                    .collect(),
                                _ => {
                                    i += 1;
                                    continue;
                                }
                            }
                        }
                    };
                    // T-3 / T-2: capture left's per-slot LSNs and FULL keys.
                    // This is a second, separate read latch on the left BIN.
                    let (left_full_lsns, left_full_keys): (
                        Vec<Lsn>,
                        Vec<Vec<u8>>,
                    ) = {
                        let g = left_arc.read();
                        match &*g {
                            TreeNode::Bottom(b) => (
                                (0..b.entries.len())
                                    .map(|j| b.get_lsn(j))
                                    .collect(),
                                (0..b.entries.len())
                                    .map(|j| {
                                        b.get_full_key(j).unwrap_or_default()
                                    })
                                    .collect(),
                            ),
                            _ => (Vec::new(), Vec::new()),
                        }
                    };
                    {
                        {
                            let mut g = right_arc.write();
                            match &mut *g {
                                TreeNode::Bottom(rb) => {
                                    // Per-slot entry payloads of the right
                                    // BIN (keys are captured separately
                                    // below).
                                    let right_full: Vec<BinEntry> = (0..rb
                                        .entries
                                        .len())
                                        .map(|j| BinEntry {
                                            data: rb.entries[j].data.clone(),
                                            known_deleted: rb.entries[j]
                                                .known_deleted,
                                            dirty: rb.entries[j].dirty,
                                            expiration_time: rb.entries[j]
                                                .expiration_time,
                                        })
                                        .collect();
                                    // T-3 / T-2: right's per-slot LSNs + keys.
                                    let right_full_lsns: Vec<Lsn> =
                                        (0..rb.entries.len())
                                            .map(|j| rb.get_lsn(j))
                                            .collect();
                                    let right_full_keys: Vec<Vec<u8>> =
                                        (0..rb.entries.len())
                                            .map(|j| {
                                                rb.get_full_key(j)
                                                    .unwrap_or_default()
                                            })
                                            .collect();
                                    // Left entries are all smaller; prepend.
                                    let mut combined = left_full_entries;
                                    combined.extend(right_full);
                                    let mut combined_lsns = left_full_lsns;
                                    combined_lsns.extend(right_full_lsns);
                                    let mut combined_keys = left_full_keys;
                                    combined_keys.extend(right_full_keys);
                                    // Reset prefix and assign full keys.
                                    rb.key_prefix = Vec::new();
                                    rb.entries = combined;
                                    // T-3: rebuild the merged LSN array.
                                    rb.lsn_rep =
                                        LsnRep::from_lsns(&combined_lsns);
                                    // T-2: rebuild the merged key rep (Default;
                                    // recompute below compresses + compacts).
                                    rb.keys = KeyRep::from_keys(combined_keys);
                                    // Recompute prefix on merged BIN; with
                                    // fewer than two entries only the key
                                    // rep is compacted.
                                    if rb.entries.len() >= 2 {
                                        rb.recompute_key_prefix();
                                    } else {
                                        rb.keys
                                            .compact(rb.compact_max_key_length);
                                    }
                                    rb.dirty = true;
                                }
                                _ => {
                                    i += 1;
                                    continue;
                                }
                            }
                        }
                    }
                    // Clear the now-merged left BIN.
                    {
                        let mut g = left_arc.write();
                        if let TreeNode::Bottom(lb) = &mut *g {
                            lb.entries.clear();
                            lb.lsn_rep = LsnRep::Empty; // T-3
                            lb.keys = KeyRep::new(); // T-2
                            lb.key_prefix = Vec::new();
                            lb.dirty = true;
                        }
                    }
                } else {
                    // Upper-IN merge: prepend left's InEntries into right.
                    // T-4: capture left's resident children alongside its
                    // entries so they travel into the merged right IN.
                    let (left_in_entries, left_children): (
                        Vec<InEntry>,
                        Vec<Option<ChildArc>>,
                    ) = {
                        let g = left_arc.read();
                        match &*g {
                            TreeNode::Internal(n) => {
                                let children = (0..n.entries.len())
                                    .map(|j| n.get_child(j))
                                    .collect();
                                (n.entries.clone(), children)
                            }
                            _ => {
                                i += 1;
                                continue;
                            }
                        }
                    };
                    // T-3: capture left's per-slot LSNs.
                    let left_in_lsns: Vec<Lsn> = {
                        let g = left_arc.read();
                        match &*g {
                            TreeNode::Internal(n) => (0..n.entries.len())
                                .map(|j| n.get_lsn(j))
                                .collect(),
                            _ => Vec::new(),
                        }
                    };
                    // Offset at which right's original slots land in the
                    // merged node.
                    let n_left = left_in_entries.len();
                    {
                        {
                            let mut g = right_arc.write();
                            match &mut *g {
                                TreeNode::Internal(rn) => {
                                    // Snapshot right's existing children, then
                                    // rebuild the merged entry + target arrays
                                    // (left half first, then right half).
                                    let right_children: Vec<Option<ChildArc>> =
                                        (0..rn.entries.len())
                                            .map(|j| rn.get_child(j))
                                            .collect();
                                    // T-3: snapshot right's LSNs too.
                                    let right_in_lsns: Vec<Lsn> =
                                        (0..rn.entries.len())
                                            .map(|j| rn.get_lsn(j))
                                            .collect();
                                    let mut combined = left_in_entries.clone();
                                    combined.append(&mut rn.entries);
                                    rn.entries = combined;
                                    // T-3: rebuild the merged LSN array.
                                    let mut combined_lsns =
                                        left_in_lsns.clone();
                                    combined_lsns.extend(right_in_lsns);
                                    rn.lsn_rep =
                                        LsnRep::from_lsns(&combined_lsns);
                                    // Start from an empty target array and
                                    // re-install only the children that were
                                    // present in either half.
                                    rn.targets = TargetRep::None;
                                    for (j, c) in
                                        left_children.iter().enumerate()
                                    {
                                        if let Some(child) = c {
                                            rn.set_child(
                                                j,
                                                Some(child.clone()),
                                            );
                                        }
                                    }
                                    for (j, c) in
                                        right_children.into_iter().enumerate()
                                    {
                                        if c.is_some() {
                                            rn.set_child(n_left + j, c);
                                        }
                                    }
                                    rn.dirty = true;
                                }
                                _ => {
                                    i += 1;
                                    continue;
                                }
                            }
                        }
                    }
                    // Update parent pointers for moved children.
                    for child in left_children.into_iter().flatten() {
                        let mut cg = child.write();
                        cg.set_parent(Some(Arc::downgrade(&right_arc)));
                    }
                    // Clear the now-merged left IN.
                    {
                        let mut g = left_arc.write();
                        if let TreeNode::Internal(ln) = &mut *g {
                            ln.entries.clear();
                            ln.lsn_rep = LsnRep::Empty; // T-3
                            ln.targets = TargetRep::None;
                            ln.dirty = true;
                        }
                    }
                }

                // Remove the right sibling's parent slot and update
                // the left slot to point at the merged right child.
                //
                // We keep the LEFT slot's key (which is the correct minimum for
                // the merged BIN's range) and remove the RIGHT slot (i+1).
                // This avoids having to update the parent key when i == 0.
                {
                    {
                        let mut g = node_arc.write();
                        match &mut *g {
                            TreeNode::Internal(p) => {
                                // Update left slot (i) to point at right_arc
                                // (which now contains the merged entries).
                                if i < p.entries.len() {
                                    p.set_child(i, Some(right_arc.clone()));
                                }
                                // Remove right slot (i+1) — it is now redundant.
                                // T-4: remove_entry shifts the child array too.
                                if i + 1 < p.entries.len() {
                                    p.remove_entry(i + 1);
                                }
                                p.dirty = true;
                            }
                            TreeNode::Bottom(_) => return,
                        }
                    }
                }

                merged_any = true;
                // Move past the merged node, which now sits at slot `i`. The
                // next pair examined in this pass is therefore the one that
                // starts at the merged node's new right neighbour (the old
                // slot i+2, now at i+1); the merged node itself is compared
                // with that neighbour only on the next pass of the outer
                // loop.
                i += 1;
                let updated_n = { node_arc.read().get_n_entries() };
                if i + 1 >= updated_n {
                    break;
                }
            }

            if !merged_any {
                break;
            }
        }
    }
5403
5404 // ========================================================================
5405 // BIN slot compression
5406 // ========================================================================
5407
5408 /// Compress deleted slots from a BIN node, then prune it from its parent
5409 /// IN when it becomes empty.
5410 ///
5411 /// (the in-place slot-removal
5412 /// path, NOT the sibling-merge path handled by `compress()`).
5413 ///
5414 /// # Algorithm
5415 ///
5416 /// 1. If the BIN is a delta, skip — deltas cannot be compressed.
5417 /// 2. Remove all slots where `entry.known_deleted` is true. This mirrors
5418 /// `bin.compress(!bin.shouldLogDelta(), localTracker)`.
5419 /// 3. If the BIN is now empty, remove it from its parent IN. This mirrors
5420 /// `pruneBIN(db, binRef, idKey)` → `tree.delete(idKey)`.
5421 ///
5422 /// # Arguments
5423 ///
5424 /// * `bin_arc` — the BIN to compress (must be a `TreeNode::Bottom`).
5425 ///
5426 /// # Returns
5427 ///
5428 /// `true` if compression made progress (slots were removed or the BIN was
5429 /// pruned), `false` if the BIN was skipped (delta, no cursors issue, etc.).
5430 pub fn compress_bin(&self, bin_arc: &Arc<RwLock<TreeNode>>) -> bool {
5431 // ---- Step 1: collect metadata without holding the write lock ----
5432 let (is_delta, n_entries, id_key) = {
5433 {
5434 let g = bin_arc.read();
5435 match &*g {
5436 TreeNode::Bottom(b) => {
5437 // Identifier key = first full key in the BIN
5438 // (the: bin.getIdentifierKey()).
5439 let id_key = b.get_full_key(0);
5440 (b.is_delta, b.entries.len(), id_key)
5441 }
5442 _ => return false, // not a BIN
5443 }
5444 }
5445 };
5446
5447 // If (bin.isBINDelta()) return; — deltas cannot be compressed.
5448 if is_delta {
5449 return false;
5450 }
5451
5452 // ---- Step 2: remove known-deleted slots) ----
5453 // We compress dirty slots too (compress_dirty_slots = true) because
5454 // we are not writing a BIN-delta here.
5455 let removed_any = {
5456 {
5457 let mut g = bin_arc.write();
5458 match &mut *g {
5459 TreeNode::Bottom(b) => {
5460 let before = b.entries.len();
5461 // BIN.compress(): walk backwards to remove
5462 // deleted slots without index confusion.
5463 //
5464 // ponytail: IC-3 — we remove `known_deleted` slots
5465 // without consulting the lock manager's per-record
5466 // write-lock state (JE BIN.compress inspects the
5467 // cursor/lock state). The lock manager lives in a
5468 // DIFFERENT crate (noxu-txn); the tree layer has no
5469 // access to it, so a cross-crate write-lock check is
5470 // out of scope here. This is SAFE in the current
5471 // design because the only slots that reach here with
5472 // `known_deleted == true` are committed deletes:
5473 // * the dbi write path (cursor_impl.rs delete())
5474 // PHYSICALLY removes the slot via tree.delete()
5475 // while holding the txn write lock — it never
5476 // leaves a write-locked `known_deleted` tombstone
5477 // in a BinStub; and
5478 // * the only writer of BinStub.known_deleted == true
5479 // is BIN-delta / recovery replay, which only
5480 // replays already-committed deletes.
5481 // The compressor daemon
5482 // (environment_impl.rs: collect_bins_with_known_deleted
5483 // → compress_bin) therefore only ever sees committed
5484 // (unlocked) defunct slots. See
5485 // docs/src/operations/known-limitations.md (IC-3) for
5486 // the upgrade path if a future write path ever leaves
5487 // an uncommitted write-locked tombstone in a BinStub.
5488 let mut j = b.entries.len();
5489 while j > 0 {
5490 j -= 1;
5491 if b.entries[j].known_deleted {
5492 // JE `IN.deleteEntry` (IN.java:3466): removing a
5493 // DIRTY slot must prohibit the next delta — a
5494 // delta only carries dirty slots, so the removal
5495 // would otherwise be silently lost. Force a
5496 // full BIN on the next log.
5497 if b.entries[j].dirty {
5498 b.prohibit_next_delta = true;
5499 }
5500 b.entries.remove(j);
5501 b.keys.remove(j); // T-2
5502 b.lsn_rep.remove_shift(j); // T-3
5503 b.dirty = true;
5504 }
5505 }
5506 // Recompute prefix after slot removal, since the
5507 // remaining keys may share a longer common prefix.
5508 // After compress(), call recalcKeyPrefix().
5509 if b.entries.len() >= 2 {
5510 b.recompute_key_prefix();
5511 } else if b.entries.len() < 2 {
5512 b.key_prefix = Vec::new();
5513 }
5514 b.entries.len() < before
5515 }
5516 _ => false,
5517 }
5518 }
5519 };
5520
5521 // ---- Step 3: prune empty BIN from parent ----
5522 // If (empty) pruneBIN(db, binRef, idKey) → tree.delete(idKey).
5523 // We only prune when the BIN is actually empty after compression.
5524 let now_empty = { bin_arc.read().get_n_entries() == 0 };
5525
5526 if now_empty {
5527 // pruneBIN re-descends to the SPECIFIC empty BIN and removes its
5528 // parent-IN slot ONLY IF the BIN is still empty (and has no
5529 // cursors and is not a delta) UNDER THE PARENT LATCH.
5530 //
5531 // We must NOT use `self.delete(&id_key)` here (IC-1): that
5532 // re-descends by key and removes whatever live entry now matches
5533 // `id_key`. Between reading `now_empty` (a fresh read lock taken
5534 // after the compression write lock was dropped) and acting on it,
5535 // a concurrent insert can repopulate this BIN; `self.delete` would
5536 // then drop a LIVE entry — tree corruption / lost write.
5537 //
5538 // JE `INCompressor.pruneBIN` (INCompressor.java ~line 502-510)
5539 // calls `tree.delete(idKey)`, and JE `Tree.delete` /
5540 // `searchDeletableSubTree` (Tree.java ~line 755-800) re-validates
5541 // `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort) and
5542 // `bin.nCursors() > 0` → CURSORS_EXIST (abort) while holding the
5543 // parent (branch) latch. `prune_empty_bin` reproduces exactly
5544 // that re-validation. See `prune_empty_bin` below.
5545 //
5546 // Note: we only attempt the prune if n_entries was > 0 before
5547 // compression (an already-empty BIN we never populated is left
5548 // alone, matching the pre-existing guard).
5549 if let Some(key) = id_key
5550 && n_entries > 0
5551 {
5552 self.prune_empty_bin(&key);
5553 }
5554 return true;
5555 }
5556
5557 removed_any
5558 }
5559
5560 /// Re-descend to the leaf BIN that should contain `id_key` and remove its
5561 /// parent-IN child slot ONLY IF the BIN is still safe to prune.
5562 ///
5563 /// This is the faithful port of JE `Tree.delete(idKey)` /
5564 /// `Tree.searchDeletableSubTree` (Tree.java ~line 755-800) as invoked by
5565 /// `INCompressor.pruneBIN` (INCompressor.java ~line 502-510). JE takes the
5566 /// branch-parent latch, re-descends to the specific empty BIN, and aborts
5567 /// the prune (removing NOTHING) if any of the following changed since the
5568 /// compressor observed the BIN as empty:
5569 ///
5570 /// * `bin.getNEntries() != 0` → `NodeNotEmptyException` (a concurrent
5571 /// insert repopulated the BIN — IC-1: we must NOT delete a live entry).
5572 /// * `bin.isBINDelta()` → `unexpectedState` (deltas are never empty).
5573 /// * `bin.nCursors() > 0` → `CursorsExistException` (a cursor is parked
5574 /// on the empty BIN; requeue rather than orphan the cursor).
5575 ///
5576 /// The re-check and the slot removal both happen while holding the
5577 /// **parent IN write latch**. Holding the parent write latch blocks every
5578 /// descender (insert / delete take `parent.read()` hand-over-hand), so a
5579 /// concurrent insert cannot reach the BIN between our re-check and the
5580 /// slot removal — the TOCTOU window IC-1 describes is closed.
5581 ///
5582 /// Returns `true` iff a parent-IN slot was removed, `false` otherwise
5583 /// (BIN repopulated, has a cursor, is a delta, vanished, or is the root —
5584 /// in every `false` case NOTHING is removed).
5585 pub fn prune_empty_bin(&self, id_key: &[u8]) -> bool {
5586 let root = match self.get_root() {
5587 Some(r) => r,
5588 None => return false,
5589 };
5590
5591 // If the root itself is the BIN (single-BIN tree) there is no parent
5592 // IN to remove a slot from. JE's searchDeletableSubTree returns null
5593 // ("the entire tree is empty") and keeps the root BIN; we do the same.
5594 if root.read().is_bin() {
5595 return false;
5596 }
5597
5598 // Descend by id_key tracking the IN that is the *parent of the leaf
5599 // BIN* and the child index within it. Hand-over-hand read coupling
5600 // keeps the descent consistent with concurrent splits, exactly like
5601 // `get_parent_bin_for_child_ln`.
5602 let (parent_arc, child_index) = {
5603 let mut parent_arc: Arc<RwLock<TreeNode>> = root.clone();
5604 let mut guard: parking_lot::ArcRwLockReadGuard<
5605 parking_lot::RawRwLock,
5606 TreeNode,
5607 > = root.read_arc();
5608 loop {
5609 let (next_arc, idx) = match &*guard {
5610 TreeNode::Internal(n) => {
5611 if n.entries.is_empty() {
5612 return false;
5613 }
5614 let idx = self.upper_in_floor_index(&n.entries, id_key);
5615 match n.get_child(idx) {
5616 Some(c) => (c, idx),
5617 None => return false,
5618 }
5619 }
5620 TreeNode::Bottom(_) => {
5621 unreachable!("is_bin checked before / below")
5622 }
5623 };
5624 // Is the next node the leaf BIN? If so, `guard`'s node is the
5625 // parent IN we want and `idx` is the child slot.
5626 if next_arc.read().is_bin() {
5627 drop(guard);
5628 break (parent_arc, idx);
5629 }
5630 let next_guard = next_arc.read_arc();
5631 drop(guard);
5632 parent_arc = next_arc;
5633 guard = next_guard;
5634 }
5635 };
5636
5637 // ---- Re-validate and remove the slot UNDER THE PARENT WRITE LATCH ----
5638 // Holding parent.write() excludes all descenders (they need
5639 // parent.read()), so the BIN cannot be repopulated between the
5640 // re-check and the slot removal.
5641 let mut parent_guard = parent_arc.write();
5642 let pruned_bin_id;
5643 let removed_key_len = match &mut *parent_guard {
5644 TreeNode::Internal(p) => {
5645 let child = match p.get_child(child_index) {
5646 Some(c) => c,
5647 None => return false, // slot already vacated / invalid
5648 };
5649 // Re-validate the child BIN under the parent latch.
5650 {
5651 let cg = child.read();
5652 match &*cg {
5653 TreeNode::Bottom(b) => {
5654 // JE: bin.getNEntries() != 0 → NODE_NOT_EMPTY (abort).
5655 if !b.entries.is_empty() {
5656 return false;
5657 }
5658 // JE: bin.isBINDelta() → unexpectedState (abort).
5659 if b.is_delta {
5660 return false;
5661 }
5662 // JE: bin.nCursors() > 0 → CURSORS_EXIST (abort).
5663 if b.cursor_count > 0 {
5664 return false;
5665 }
5666 pruned_bin_id = b.node_id;
5667 }
5668 // A concurrent split could in principle have replaced
5669 // the child with an IN; never prune in that case.
5670 TreeNode::Internal(_) => return false,
5671 }
5672 }
5673 // Safe to prune: remove the BIN's slot from the parent IN.
5674 // Mirrors the parent-slot removal `Tree.delete` performs for
5675 // an empty BIN (Tree.java deleteEntry under the branch latch).
5676 // T-4: remove_entry shifts the node-level child array too.
5677 let removed = p.remove_entry(child_index);
5678 p.dirty = true;
5679 removed.key.len()
5680 }
5681 TreeNode::Bottom(_) => return false,
5682 };
5683 drop(parent_guard);
5684
5685 // JE: removing the BIN slot detaches the BIN from the tree; the
5686 // evictor must drop it from its LRU lists (Evictor.remove).
5687 self.note_removed(pruned_bin_id);
5688
5689 // Preserve the memory-counter bookkeeping that `self.delete` performed
5690 // (IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage).
5691 // The pruned slot's key plus the fixed per-entry overhead matches the
5692 // `delete` accounting (key.len() + BIN_ENTRY_OVERHEAD).
5693 if let Some(counter) = &self.memory_counter {
5694 let delta = (removed_key_len + BIN_ENTRY_OVERHEAD) as i64;
5695 counter.fetch_sub(delta, Ordering::Relaxed);
5696 }
5697
5698 true
5699 }
5700
5701 /// Detach the resident child node `node_id` from its parent IN, dropping
5702 /// the strong `Arc` so the node is actually freed from memory, and return
5703 /// the heap bytes reclaimed (0 if not found / not detachable).
5704 ///
5705 /// This is the faithful port of JE `IN.detachNode(idx, updateLsn, newLsn)`
5706 /// (IN.java ~4019) as called from `Evictor.evict` (Evictor.java ~3035):
5707 /// `evict` measures `target.getBudgetedMemorySize()` and then
5708 /// `parent.detachNode(index, ...)` does `setTarget(idx, null)` to drop the
5709 /// child reference and `getInMemoryINs().remove(child)` to drop it from
5710 /// the INList.
5711 ///
5712 /// EV-13: before this method existed, the evictor credited
5713 /// `node_size_fn(node_id)` bytes back to the budget and removed the node
5714 /// from the LRU lists, but the parent's `InEntry.child` still held a
5715 /// strong `Arc` — so the node was never dropped from the heap. The budget
5716 /// over-credited (claimed bytes freed that were not), `cache_usage`
5717 /// drifted below reality, and the evictor under-fired. Detaching here
5718 /// drops the `Arc` for real and credits exactly the measured size.
5719 ///
5720 /// The detach happens **under the parent IN write latch** (JE detaches
5721 /// under the parent's latch), so no concurrent descender can re-cache the
5722 /// child between measurement and detach. The slot (key + LSN) is kept —
5723 /// only the in-memory `child` target is cleared — matching JE's
5724 /// `setTarget(idx, null)` which leaves the `ChildReference` LSN intact so
5725 /// the node can be re-fetched from the log later.
5726 ///
5727 /// Returns `0` if the node is not a resident child of any IN (e.g. it is
5728 /// the root, already detached, or was pinned and could not be latched).
5729 pub fn detach_node_by_id(&self, node_id: u64) -> u64 {
5730 let root = match self.get_root() {
5731 Some(r) => r,
5732 None => return 0,
5733 };
5734
5735 // The root has no parent IN to detach from (JE evicts the root via a
5736 // separate evictRoot path; we keep the root resident here).
5737 let root_id = {
5738 let g = root.read();
5739 match &*g {
5740 TreeNode::Internal(n) => n.node_id,
5741 TreeNode::Bottom(b) => b.node_id,
5742 }
5743 };
5744 if root_id == node_id {
5745 return 0;
5746 }
5747
5748 // Locate the parent IN and the child slot index.
5749 let (parent_arc, child_index) =
5750 match Self::find_parent_of_node_id(&root, node_id) {
5751 Some(p) => p,
5752 None => return 0,
5753 };
5754
5755 // ---- Measure + detach UNDER THE PARENT WRITE LATCH ----
5756 // Holding parent.write() excludes all descenders (they take
5757 // parent.read() hand-over-hand), so the child cannot be re-cached or
5758 // re-pinned between the measurement and the detach. Mirrors JE
5759 // detachNode running under the parent latch held by Evictor.evict.
5760 let mut parent_guard = parent_arc.write();
5761 let TreeNode::Internal(p) = &mut *parent_guard else {
5762 return 0; // parent is not an IN (concurrent restructure)
5763 };
5764 if child_index >= p.entries.len() {
5765 return 0;
5766 }
5767 // T-4: detach the cached child via the node-level INTargetRep, leaving
5768 // the slot's key/LSN intact for re-fetch (JE IN.setTarget(idx, null)).
5769 let child = match p.take_child(child_index) {
5770 Some(c) => c, // child Arc removed from the slot
5771 None => return 0, // already detached
5772 };
5773
5774 // Measure the child's real heap footprint while we still hold it.
5775 // JE: long evictedBytes = target.getBudgetedMemorySize().
5776 let freed = child.read().budgeted_memory_size();
5777
5778 // EV-14 re-fetch correctness: the parent slot LSN must point at the
5779 // child's CURRENT on-disk version so `child_at_or_fetch` re-reads the
5780 // right bytes (JE `IN.updateEntry(idx, newLsn)` is called whenever a
5781 // child is logged; the parent slot LSN tracks the child's LSN). The
5782 // evictor only fully evicts/detaches a CLEAN BIN (it logs+clears dirty
5783 // BINs via flush_dirty_node_to_log first, which sets `last_full_lsn`),
5784 // so the child's authoritative LSN is its `last_full_lsn`. Stamp it
5785 // into the parent slot before dropping the child; if it is null (the
5786 // child was never logged) leave the existing slot LSN intact rather
5787 // than writing a null — a never-logged clean child cannot occur on
5788 // the evict path, but be conservative.
5789 let child_full_lsn = match &*child.read() {
5790 TreeNode::Bottom(b) => b.last_full_lsn,
5791 TreeNode::Internal(_) => NULL_LSN,
5792 };
5793 if child_full_lsn != NULL_LSN {
5794 p.set_lsn(child_index, child_full_lsn);
5795 }
5796
5797 // Mark the parent dirty: the slot's in-memory target changed (JE
5798 // detachNode sets dirty when updateLsn; we conservatively mark dirty
5799 // so the parent is re-logged with the now-non-resident slot).
5800 p.dirty = true;
5801
5802 // Drop the strong Arc explicitly so the node is freed now (the slot's
5803 // `child` is already None). If any other resident path still held a
5804 // strong reference this would not free — but the tree is the sole
5805 // strong owner of a cached child, so this drops the last strong ref.
5806 drop(parent_guard);
5807 drop(child);
5808
5809 // JE: getInMemoryINs().remove(child) — drop it from the evictor LRU.
5810 self.note_removed(node_id);
5811
5812 // NOTE: the live tree-memory counter (`memory_counter`) is the SAME
5813 // `Arc<AtomicI64>` the evictor's Arbiter uses as `cache_usage`. The
5814 // evictor decrements it once via `Arbiter::release_memory(bytes)` for
5815 // the full eviction batch, so detach must NOT decrement here too —
5816 // that would double-credit and drive `cache_usage` below reality
5817 // (the very drift EV-13 fixes, in the other direction). We only
5818 // measure-and-free; the caller does the single counter update.
5819 freed
5820 }
5821
5822 /// Evict the root IN of this tree (EV-14).
5823 ///
5824 /// Faithful port of JE `Evictor.evictRoot` (Evictor.java:3050-3110) plus
5825 /// the `RootEvictor.doWork` + `Tree.withRootLatchedExclusive` framing
5826 /// (Evictor.java:2529-2576, Tree.java:508-517). Unlike a normal IN, the
5827 /// root has no parent slot to detach from; instead the *tree's* root
5828 /// reference is the equivalent of the `RootChildReference`, so eviction:
5829 ///
5830 /// 1. Latches the root reference exclusively (`rootLatch.acquireExclusive`
5831 /// via `withRootLatchedExclusive`).
5832 /// 2. Re-checks that the root is still resident and still evictable
5833 /// (no resident children, no pinned BIN — JE `RootEvictor.doWork`
5834 /// re-latches and re-checks `rootIN == target && rootIN.isRoot()`).
5835 /// 3. If the root is dirty, LOGS it first so the on-disk version is
5836 /// current and updates `root_log_lsn` to the new LSN (JE
5837 /// `evictRoot`: `long newLsn = target.log(...); rootRef.setLsn(newLsn)`).
5838 /// 4. Clears the in-memory root (`rootRef.clearTarget()` — JE leaves the
5839 /// `ChildReference` LSN intact; here `root_log_lsn` is that LSN) and
5840 /// `note_removed`s it from the evictor LRU (JE `inList.remove(target)`).
5841 ///
5842 /// On the next access `fetch_root_from_log` re-materializes the root from
5843 /// `root_log_lsn` (JE `Tree.getRootINRootAlreadyLatched` →
5844 /// `root.fetchTarget`).
5845 ///
5846 /// # Conditions (eviction is REFUSED, returning `None`, when)
5847 ///
5848 /// * there is no log manager wired (the root could never be re-fetched),
5849 /// * the tree has no resident root (already evicted),
5850 /// * the root has any resident child (JE only evicts a childless root —
5851 /// the `hasCachedChildren` skip in `processTarget`; a root with cached
5852 /// children would orphan them, the EV-6 invariant),
5853 /// * the root is a BIN pinned by a cursor (`cursor_count > 0`),
5854 /// * the root is dirty but we have no clean persisted version AND logging
5855 /// it fails, or
5856 /// * the root is clean but `root_log_lsn` is null (never logged — cannot
5857 /// be re-fetched; happens only for a brand-new unlogged tree).
5858 ///
5859 /// Returns `Some((freed_bytes, was_dirty))` on success, where `freed_bytes`
5860 /// is the root's measured heap footprint (JE
5861 /// `target.getBudgetedMemorySize()`) and `was_dirty` reports whether the
5862 /// root had to be logged (JE `rootEvictor.flushed`, which drives
5863 /// `nDirtyNodesEvicted` and `modifyDbRoot`).
5864 pub fn evict_root(&self, db_id: u64) -> Option<(u64, bool)> {
5865 // A root with no re-fetch path must never be made non-resident.
5866 self.log_manager.as_ref()?;
5867
5868 // JE `Tree.withRootLatchedExclusive(rootEvictor)`: hold the root latch
5869 // exclusively across the whole evict so no descender or splitter can
5870 // observe/install a half-evicted root. Acquiring `self.root.write()`
5871 // is the Noxu equivalent (it is the lock guarding the root pointer).
5872 let mut root_slot = self.root.write();
5873 let root_arc = root_slot.as_ref()?.clone();
5874
5875 // JE `RootEvictor.doWork`: re-latch the target and re-check the
5876 // conditions. We hold the node guard for the duration.
5877 let node_guard = root_arc.write();
5878
5879 // EV-6 / JE `processTarget` hasCachedChildren skip: a root with any
5880 // resident child must NOT be evicted (it would orphan the child).
5881 // EV-14 only evicts an *idle* root whose children are already
5882 // non-resident (or which is itself a leaf BIN).
5883 let (node_id, was_dirty, freed) = match &*node_guard {
5884 TreeNode::Internal(n) => {
5885 if !n.resident_children().is_empty() {
5886 return None; // has cached children — keep resident
5887 }
5888 (n.node_id, n.dirty, node_guard.budgeted_memory_size())
5889 }
5890 TreeNode::Bottom(b) => {
5891 if b.cursor_count > 0 {
5892 return None; // pinned by a cursor — keep resident
5893 }
5894 (
5895 b.node_id,
5896 b.dirty || b.dirty_count() > 0,
5897 node_guard.budgeted_memory_size(),
5898 )
5899 }
5900 };
5901
5902 // If dirty, log the root first so the on-disk version is current,
5903 // then record the new LSN as the root's re-fetch point (JE
5904 // `evictRoot`: target.log(...) + rootRef.setLsn(newLsn)).
5905 if was_dirty {
5906 let lm = self.log_manager.as_ref()?; // checked above; re-borrow
5907 let node_bytes = node_guard.write_to_bytes();
5908 let is_bin = node_guard.is_bin();
5909 let entry = noxu_log::entry::in_log_entry::InLogEntry::new(
5910 db_id, NULL_LSN, // prev_full_lsn
5911 NULL_LSN, // prev_delta_lsn
5912 node_bytes,
5913 );
5914 let mut buf = bytes::BytesMut::with_capacity(entry.log_size());
5915 entry.write_to_log(&mut buf);
5916 let entry_type = if is_bin {
5917 noxu_log::LogEntryType::BIN
5918 } else {
5919 noxu_log::LogEntryType::IN
5920 };
5921 // flush_required = true so the root's bytes are durable before we
5922 // drop the in-memory copy (JE logs synchronously in evictRoot).
5923 let new_lsn = match lm.log(
5924 entry_type,
5925 &buf,
5926 noxu_log::Provisional::No,
5927 true, // flush_required
5928 false, // fsync at next checkpoint
5929 ) {
5930 Ok(l) => l,
5931 Err(_) => return None, // could not log — keep the root resident
5932 };
5933 *self.root_log_lsn.write() = new_lsn;
5934 } else {
5935 // Clean root: it must already be re-fetchable. If it was never
5936 // logged (root_log_lsn null) we cannot evict it safely.
5937 if *self.root_log_lsn.read() == NULL_LSN {
5938 return None;
5939 }
5940 }
5941
5942 // JE `rootRef.clearTarget()` + `inList.remove(target)`: drop the
5943 // in-memory root and remove it from the evictor LRU. The root_log_lsn
5944 // is the surviving `ChildReference` LSN used to re-fetch it.
5945 drop(node_guard);
5946 *root_slot = None;
5947 drop(root_slot);
5948 self.note_removed(node_id);
5949
5950 Some((freed, was_dirty))
5951 }
5952
5953 /// Re-materialize an evicted root IN from its persisted `root_log_lsn`
5954 /// (EV-14, piece B).
5955 /// Faithful to JE `Tree.getRootINRootAlreadyLatched` (Tree.java:477-516)
5956 /// which calls `root.fetchTarget(database, null)` when the in-memory
5957 /// target is null. Idempotent and cheap when the root is already
5958 /// resident: returns the resident root without touching the log.
5959 ///
5960 /// Returns `None` only when the tree is genuinely empty (no resident root
5961 /// AND `root_log_lsn` is null) or when the re-fetch fails (no log manager,
5962 /// log read error, deserialize failure) — callers then see an empty tree,
5963 /// never wrong data.
5964 pub fn fetch_root_from_log(&self) -> Option<Arc<RwLock<TreeNode>>> {
5965 // Fast path: root already resident.
5966 if let Some(r) = self.root.read().clone() {
5967 return Some(r);
5968 }
5969 // Take the write lock and re-check (another thread may have re-fetched
5970 // it while we waited — JE upgrades the root latch the same way).
5971 let mut root_slot = self.root.write();
5972 if let Some(r) = root_slot.as_ref() {
5973 return Some(r.clone());
5974 }
5975 let log_lsn = *self.root_log_lsn.read();
5976 let node = self.fetch_node_from_log(log_lsn)?;
5977 let node_id = node.node_id();
5978 let arc = Arc::new(RwLock::new(node));
5979 *root_slot = Some(arc.clone());
5980 drop(root_slot);
5981 // JE: a fetched IN is added back to the INList (Evictor LRU).
5982 self.note_added(node_id);
5983 Some(arc)
5984 }
5985
5986 /// Return the resident child Arc for slot `idx` of `parent_arc`, fetching
5987 /// it from its slot LSN and installing it if it is not resident (EV-14 /
5988 /// EV-13 re-fetch on descent).
5989 ///
5990 /// Faithful to JE `ChildReference.fetchTarget` (and `IN.fetchTarget`):
5991 /// when a slot's in-memory target is null but its LSN is valid, the node
5992 /// is read back from the log and cached in the slot. Installing the
5993 /// fetched child requires the parent EX-latch, so this takes the parent
5994 /// write lock; the fast path (child already resident) takes only a read
5995 /// lock.
5996 ///
5997 /// Returns `None` only when the slot index is out of range, the slot has
5998 /// no valid LSN, or the log read/deserialize fails — callers then treat
5999 /// the descent as terminating in an empty subtree, never wrong data.
6000 fn child_at_or_fetch(
6001 &self,
6002 parent_arc: &Arc<RwLock<TreeNode>>,
6003 idx: usize,
6004 ) -> Option<ChildArc> {
6005 // Fast path: child already cached (read lock only).
6006 {
6007 let g = parent_arc.read();
6008 if let TreeNode::Internal(n) = &*g {
6009 if let Some(c) = n.get_child(idx) {
6010 return Some(c);
6011 }
6012 } else {
6013 return None; // BINs have no IN children
6014 }
6015 }
6016 // Slow path: fetch the child from its slot LSN under the parent
6017 // EX-latch (JE installs the fetched target under the IN latch).
6018 let mut g = parent_arc.write();
6019 let TreeNode::Internal(n) = &mut *g else {
6020 return None;
6021 };
6022 // Re-check: another thread may have fetched it while we upgraded.
6023 if let Some(c) = n.get_child(idx) {
6024 return Some(c);
6025 }
6026 if idx >= n.entries.len() {
6027 return None;
6028 }
6029 let child_lsn = n.get_lsn(idx);
6030 let node = self.fetch_node_from_log(child_lsn)?;
6031 let node_id = node.node_id();
6032 let arc: ChildArc = Arc::new(RwLock::new(node));
6033 n.set_child(idx, Some(arc.clone()));
6034 drop(g);
6035 // JE: a fetched IN is added back to the INList (Evictor LRU).
6036 self.note_added(node_id);
6037 Some(arc)
6038 }
6039
6040 /// Check whether a BIN node is a candidate for slot compression and,
6041 /// if so, trigger `compress_bin`.
6042 ///
6043 /// from (the opportunistic / lazy compression path).
6044 ///
6045 /// # Algorithm
6046 ///
6047 /// 1. Skip the BIN if it is a delta or has no defunct (known-deleted) slots.
6048 /// 2. If compression succeeds and the BIN becomes empty, it is pruned.
6049 ///
6050 /// # Returns
6051 ///
6052 /// `true` if compression was triggered (regardless of whether any slots
6053 /// were actually removed), `false` if the BIN does not need compression.
6054 pub fn maybe_compress_bin_and_parent(
6055 &self,
6056 bin_arc: &Arc<RwLock<TreeNode>>,
6057 ) -> bool {
6058 // Check whether the BIN has any deleted slots worth compressing.
6059 // lazyCompress: skip deltas and BINs with no defunct slots.
6060 let should_compress = {
6061 {
6062 let g = bin_arc.read();
6063 match &*g {
6064 TreeNode::Bottom(b) => {
6065 // Skip deltas (the: !in.isBIN() || in.isBINDelta()).
6066 if b.is_delta {
6067 false
6068 } else {
6069 // Check for any known-deleted slot
6070 // (the: for (int i=0; i < bin.getNEntries(); i++) {
6071 // if (bin.isDefunct(i)) { ... break; }
6072 // }).
6073 b.entries.iter().any(|e| e.known_deleted)
6074 }
6075 }
6076 _ => false,
6077 }
6078 }
6079 };
6080
6081 if !should_compress {
6082 return false;
6083 }
6084
6085 self.compress_bin(bin_arc)
6086 }
6087
6088 // ========================================================================
6089 // Latch-coupling validation
6090 // ========================================================================
6091
6092 /// Validate that `parent.entries[child_index].child` still points at
6093 /// `child_arc` after acquiring the child's latch.
6094 ///
6095 /// Re-latch validation step inside the
6096 /// `Tree.searchSplitsAllowed`: after a concurrent split the parent
6097 /// slot that previously held the child may have changed. Callers that
6098 /// plan to mutate the child must verify the parent-child link is still
6099 /// intact before proceeding.
6100 ///
6101 /// Returns `true` if the parent-child link is intact.
6102 pub fn validate_parent_child(
6103 parent: &Arc<RwLock<TreeNode>>,
6104 child_index: usize,
6105 child_arc: &Arc<RwLock<TreeNode>>,
6106 ) -> bool {
6107 let g = parent.read();
6108 match &*g {
6109 TreeNode::Internal(p) => match p.child_ref(child_index) {
6110 Some(stored) => Arc::ptr_eq(stored, child_arc),
6111 None => false,
6112 },
6113 TreeNode::Bottom(_) => false,
6114 }
6115 }
6116
/// Search for the BIN that should contain `key`, descending from the root
/// with hand-over-hand ("latch-coupled") read guards.
///
/// While the chosen child is resident, the child's `read_arc()` guard is
/// taken *before* the parent's guard is dropped. A writer that needs
/// `parent.write()` (for example a concurrent split of that parent)
/// therefore has to wait until this search has entered the child. There
/// is no validate-and-restart loop on this path and
/// `validate_parent_child` is not called.
///
/// When `get_child` returns `None` for the chosen slot, the parent guard
/// is released *first* and the child is then obtained through
/// `child_at_or_fetch`; that fallback step is not latch-coupled.
///
/// # Returns
///
/// * `Some(SearchResult)` once a BIN is reached. The `found` flag is set
///   when `find_entry` reports a non-negative value with the
///   `EXACT_MATCH` bit set; the low 16 bits of that value are passed on
///   as the index.
/// * `None` if the tree has no root, if an internal node on the path has
///   no entries, or if `child_at_or_fetch` yields no child.
///
/// The original documentation describes this method as a public
/// counterpart of [`Tree::search`], kept for callers (tests) that want to
/// exercise the coupling explicitly.
pub fn search_with_coupling(&self, key: &[u8]) -> Option<SearchResult> {
    let root = self.get_root()?;
    let mut guard: parking_lot::ArcRwLockReadGuard<
        parking_lot::RawRwLock,
        TreeNode,
    > = root.read_arc();

    loop {
        if guard.is_bin() {
            // Reached the bottom level: look the key up inside the BIN.
            let index = guard.find_entry(key, true, true);
            let found = index >= 0 && (index & EXACT_MATCH != 0);
            return Some(SearchResult::with_values(
                found,
                index & 0xFFFF,
                false,
            ));
        }

        // Keep an owning handle to the current node: the fallback path
        // below needs it after `guard` has been dropped.
        let parent_arc =
            parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
        let next_idx = match &*guard {
            TreeNode::Internal(n) => {
                if n.entries.is_empty() {
                    return None;
                }
                let idx = self.upper_in_floor_index(&n.entries, key);
                match n.get_child(idx) {
                    Some(c) => {
                        // Hand-over-hand: lock the child, then release
                        // the parent.
                        let next_guard = c.read_arc();
                        drop(guard);
                        guard = next_guard;
                        continue;
                    }
                    None => idx, // EV-14/EV-13: re-fetch below.
                }
            }
            TreeNode::Bottom(_) => {
                unreachable!("is_bin() returned false above")
            }
        };
        // Fallback for a slot whose child `get_child` did not return
        // (presumably a non-resident child — confirm against
        // `child_at_or_fetch`). The parent read guard is released BEFORE
        // the child is fetched, so this step is not hand-over-hand; a
        // writer can take `parent.write()` in between.
        drop(guard);
        let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
        guard = child.read_arc();
    }
}
6193
6194 // ========================================================================
6195 // BIN-Delta reconstitution
6196 // ========================================================================
6197
6198 /// Increments the cursor-pin count on a BIN node.
6199 ///
6200 /// Called by `CursorImpl` when it positions on (or enters) a BIN.
6201 /// The evictor will not select a BIN with `cursor_count > 0` for eviction
6202 /// (`RealNodeInfo.pin_count`), matching `BIN.incrementCursorCount()`.
6203 pub fn pin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6204 let mut guard = bin_arc.write();
6205 if let TreeNode::Bottom(ref mut stub) = *guard {
6206 stub.cursor_count += 1;
6207 }
6208 }
6209
6210 /// Decrements the cursor-pin count on a BIN node.
6211 ///
6212 /// Called by `CursorImpl` when it moves away from or closes on a BIN.
6213 /// Uses `saturating_sub` to guard against an accidental double-unpin.
6214 /// Matching `BIN.decrementCursorCount()`.
6215 pub fn unpin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
6216 let mut guard = bin_arc.write();
6217 if let TreeNode::Bottom(ref mut stub) = *guard {
6218 stub.cursor_count = stub.cursor_count.saturating_sub(1);
6219 }
6220 }
6221
/// Returns `true` if the given `BinStub` is a BIN-delta (not a full BIN).
///
/// This is a plain read of the stub's `is_delta` flag; no lock is taken
/// here, the caller already holds a reference to the stub.
///
/// JE counterpart: `IN.isBINDelta()`.
pub fn bin_is_delta(bin: &BinStub) -> bool {
    bin.is_delta
}
6228
6229 /// Merge delta entries into a full BIN's entry list.
6230 ///
6231 /// - For each delta entry: if a matching key already exists in `bin`,
6232 /// replace it (delta is authoritative).
6233 /// - Otherwise insert the delta entry in sorted position.
6234 ///
6235 /// Delta entries carry **full** keys (prefix already prepended by the
6236 /// caller). After applying all delta entries the BIN's prefix is
6237 /// recomputed so the final state is consistent.
6238 ///
6239 /// All delta entries are considered to be the most-recently-dirtied
6240 /// state, exactly as in where delta slots supersede full-BIN slots.
6241 pub fn apply_delta_to_bin(
6242 bin: &mut BinStub,
6243 delta_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
6244 ) {
6245 for (full_key, lsn, data) in delta_entries {
6246 // `full_key` is a full (uncompressed) key here.
6247 bin.insert_with_prefix(full_key, lsn, data);
6248 }
6249 bin.dirty = true;
6250 }
6251
6252 /// Reconstitute a BIN-delta into a full BIN.
6253 ///
6254 /// from the:
6255 ///
6256 /// 1. Extract the delta entries from `self` (this BIN-delta), decompressing
6257 /// them to full keys.
6258 /// 2. Apply them onto `base` (the previously logged full BIN) via
6259 /// `apply_delta_to_bin`.
6260 /// 3. Copy `base`'s merged entries and prefix back into `self`.
6261 /// 4. Clear the `is_delta` flag so subsequent code treats `self` as
6262 /// a full BIN.
6263 ///
6264 /// After this call `self` is a full BIN; `base` should be discarded.
6265 pub fn mutate_to_full_bin(delta: &mut BinStub, mut base: BinStub) {
6266 // Decompress delta entries to full keys before applying.
6267 let delta_full_entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)> = (0
6268 ..delta.entries.len())
6269 .map(|i| {
6270 (
6271 delta.get_full_key(i).unwrap_or_default(),
6272 delta.get_lsn(i),
6273 delta.entries[i].data.clone(),
6274 )
6275 })
6276 .collect();
6277 // reconstituteBIN + resetContent + setBINDelta(false).
6278 Self::apply_delta_to_bin(&mut base, delta_full_entries);
6279 delta.entries = base.entries;
6280 delta.lsn_rep = base.lsn_rep; // T-3
6281 delta.keys = base.keys; // T-2
6282 delta.key_prefix = base.key_prefix;
6283 delta.is_delta = false;
6284 delta.dirty = true;
6285 }
6286
6287 /// Read an IN/BIN log entry at `log_lsn` and deserialise it into a
6288 /// `TreeNode`, ready to be installed as a (re-fetched) resident node.
6289 ///
6290 /// JE `LogManager.getLogEntry(lsn)` + `IN.readFromLog` as used by
6291 /// `ChildReference.fetchTarget` (the path that re-materializes a
6292 /// non-resident node from its persisted LSN on descent) and by
6293 /// `Tree.getRootINRootAlreadyLatched` for the root. The freshly-fetched
6294 /// node has no resident children (`TargetRep::None`); its own children, if
6295 /// any, are re-fetched on demand the same way when the descent reaches
6296 /// them.
6297 ///
6298 /// Returns `None` if the LSN is null, the log read fails, the entry is not
6299 /// an IN/BIN, or deserialisation fails (the caller treats this as "node
6300 /// unavailable" rather than panicking, matching the graceful-degradation
6301 /// policy of `mutate_to_full_bin_from_log`).
6302 fn fetch_node_from_log(&self, log_lsn: Lsn) -> Option<TreeNode> {
6303 if log_lsn == NULL_LSN {
6304 return None;
6305 }
6306 let lm = self.log_manager.as_ref()?;
6307 let (entry_type, payload) = lm.read_entry(log_lsn).ok()?;
6308 // The on-disk payload is an `InLogEntry` body (db_id | prev_full_lsn
6309 // | prev_delta_lsn | len | node_data). The recovery scanner strips
6310 // this header before calling `recover_in_redo`; re-fetch must do the
6311 // same so `deserialize_*` sees the bare node bytes. JE
6312 // `INLogEntry.readEntry` parses the same wrapper.
6313 let in_entry =
6314 noxu_log::entry::in_log_entry::InLogEntry::read_from_log(&payload)
6315 .ok()?;
6316 let node_data = &in_entry.node_data;
6317 use noxu_log::LogEntryType;
6318 match entry_type {
6319 LogEntryType::BIN => {
6320 Self::deserialize_bin(node_data).map(TreeNode::Bottom)
6321 }
6322 LogEntryType::IN => {
6323 Self::deserialize_upper_in(node_data).map(TreeNode::Internal)
6324 }
6325 // BIN-deltas are never logged as the *root* version and are
6326 // reconstituted by the BIN-delta path, not here.
6327 _ => {
6328 log::warn!(
6329 "fetch_node_from_log: expected IN/BIN entry at LSN {:?}, \
6330 got {:?}",
6331 log_lsn,
6332 entry_type
6333 );
6334 None
6335 }
6336 }
6337 }
6338
6339 /// Reconstitute a BIN-delta into a full BIN by reading the base from log.
6340 ///
6341 /// — the
6342 /// single-argument overload that calls `fetchFullBIN(databaseImpl)` to
6343 /// read the last full BIN from the log manager automatically.
6344 ///
6345 /// Algorithm:
6346 /// 1. If `delta.last_full_lsn == NULL_LSN`, the BIN was never written as a
6347 /// full entry; there is no base to merge so the delta IS the full BIN.
6348 /// Clear `is_delta` and return.
6349 /// 2. Read the full-BIN log entry at `delta.last_full_lsn` using
6350 /// `log_manager.read_entry(lsn)`.
6351 /// 3. Deserialize the payload with `BinStub::deserialize_full()`.
6352 /// 4. Delegate to `Self::mutate_to_full_bin(delta, base)` to merge and
6353 /// replace `delta`'s contents.
6354 ///
6355 /// On any read / parse failure the function falls back to clearing the
6356 /// `is_delta` flag without merging, so the caller always gets a non-delta
6357 /// BIN (possibly missing some old slots). This mirrors the
6358 /// `EnvironmentFailureException` path but gracefully degrades instead of
6359 /// panicking.
6360 ///
6361 /// `BIN.fetchFullBIN(dbImpl)` + `BIN.mutateToFullBIN(boolean)`.
6362 pub fn mutate_to_full_bin_from_log(
6363 delta: &mut BinStub,
6364 log_manager: &noxu_log::LogManager,
6365 ) {
6366 if !delta.is_delta {
6367 // Already a full BIN; nothing to do.
6368 return;
6369 }
6370
6371 if delta.last_full_lsn == NULL_LSN {
6372 // BIN has never been logged as a full entry — the in-memory delta
6373 // is effectively the full state. During recovery this path is
6374 // harmless.
6375 delta.is_delta = false;
6376 return;
6377 }
6378
6379 // Read the full-BIN log entry at last_full_lsn.
6380 // `envImpl.getLogManager().getEntryHandleFileNotFound(lsn)`.
6381 match log_manager.read_entry(delta.last_full_lsn) {
6382 Ok((entry_type, payload)) => {
6383 use noxu_log::LogEntryType;
6384 if entry_type == LogEntryType::BIN {
6385 if let Some(mut base) = BinStub::deserialize_full(&payload)
6386 {
6387 // Set the base's last_full_lsn so it is preserved
6388 // into the merged result.
6389 base.last_full_lsn = delta.last_full_lsn;
6390 Self::mutate_to_full_bin(delta, base);
6391 return;
6392 }
6393 // Deserialization failed — fall through to graceful degradation.
6394 log::warn!(
6395 "mutate_to_full_bin_from_log: failed to deserialize \
6396 full BIN at LSN {:?}; keeping delta as-is",
6397 delta.last_full_lsn
6398 );
6399 } else {
6400 log::warn!(
6401 "mutate_to_full_bin_from_log: expected BIN entry at \
6402 LSN {:?}, got {:?}",
6403 delta.last_full_lsn,
6404 entry_type
6405 );
6406 }
6407 }
6408 Err(e) => {
6409 log::warn!(
6410 "mutate_to_full_bin_from_log: failed to read log at \
6411 LSN {:?}: {}",
6412 delta.last_full_lsn,
6413 e
6414 );
6415 }
6416 }
6417
6418 // Graceful degradation: promote the delta to a "full" BIN without
6419 // the base slots. The BIN will be re-logged as a full BIN at the
6420 // next checkpoint.
6421 delta.is_delta = false;
6422 delta.dirty = true;
6423 }
6424
6425 // ========================================================================
6426 // getNextBin / getPrevBin
6427 // ========================================================================
6428
6429 /// Return the entries of the BIN immediately to the right of the BIN
6430 /// that contains (or would contain) `current_key`.
6431 ///
6432 /// → `Tree.getNextIN(forward=true)`.
6433 ///
6434 /// # Algorithm
6435 /// 1. Build a root-to-BIN path for `current_key`.
6436 /// 2. Walk the path back up looking for a parent that has a slot to the
6437 /// right of the slot we descended through.
6438 /// 3. When found, descend to the leftmost BIN of that sibling subtree.
6439 /// 4. If no such parent exists, return `None` (no next BIN).
6440 pub fn get_next_bin(
6441 &self,
6442 current_key: &[u8],
6443 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6444 let root = self.get_root()?;
6445 self.get_adjacent_bin(&root, current_key, true)
6446 }
6447
6448 /// Return the entries of the BIN immediately to the left of the BIN
6449 /// that contains (or would contain) `current_key`.
6450 ///
6451 /// → `Tree.getNextIN(forward=false)`.
6452 pub fn get_prev_bin(
6453 &self,
6454 current_key: &[u8],
6455 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6456 let root = self.get_root()?;
6457 self.get_adjacent_bin(&root, current_key, false)
6458 }
6459
6460 /// Core implementation shared by `get_next_bin` and `get_prev_bin`.
6461 ///
6462 /// Builds the path from `root` down to the BIN for `current_key`
6463 /// (each element records the parent arc, the slot index taken,
6464 /// and the child Arc reached) using `read_arc()` hand-over-hand
6465 /// latch coupling.
6466 ///
6467 /// The ascent re-acquires the parent's read lock one level at a
6468 /// time. To handle a concurrent split that completes between
6469 /// path capture and ascent, we validate that the slot still
6470 /// holds the child Arc we descended through. If the slot
6471 /// mismatches we retry the whole operation from root with a
6472 /// short pause between attempts. The retry budget is generous
6473 /// (`MAX_ASCENT_ATTEMPTS`) so that the typical case of a few
6474 /// cascading splits between two BIN-level cursor steps is
6475 /// absorbed without surfacing as a false end-of-iteration.
6476 /// After exhausting the budget we conservatively return `None`,
6477 /// signalling "no adjacent BIN found"; the cursor will then
6478 /// either restart its scan or report end-of-iteration. The
6479 /// budget is finite so a pathological workload (a thread
6480 /// permanently splitting under us) cannot livelock the lookup.
6481 /// JE `Tree.getNextIN` / `Tree.getPrevIN`.
6482 ///
6483 /// R3 fix (2026-06-16): converted from `static fn` to `&self` so that the
6484 /// IN-level descent uses `self.upper_in_floor_index` (comparator-aware)
6485 /// instead of a raw byte `<=`. Without this, databases with a custom
6486 /// comparator (secondary indexes, sorted-dup) could descend to the wrong
6487 /// child → wrong adjacent BIN → incorrect cursor iteration across BIN
6488 /// boundaries. Mirrors `Tree.getNextIN`/`Tree.getPrevIN` using the
6489 /// comparator-aware `IN.findEntry`.
6490 fn get_adjacent_bin(
6491 &self,
6492 root: &Arc<RwLock<TreeNode>>,
6493 current_key: &[u8],
6494 forward: bool,
6495 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6496 const MAX_ASCENT_ATTEMPTS: u32 = 8;
6497 for attempt in 0..MAX_ASCENT_ATTEMPTS {
6498 match self.get_adjacent_bin_attempt(root, current_key, forward) {
6499 AdjacentBinOutcome::Found(v) => return Some(v),
6500 AdjacentBinOutcome::NoAdjacent => return None,
6501 AdjacentBinOutcome::SplitRaceRetry => {
6502 // Brief pause to let the splitter finish.
6503 if attempt + 1 < MAX_ASCENT_ATTEMPTS {
6504 std::thread::yield_now();
6505 }
6506 }
6507 }
6508 }
6509 // Exhausted retry budget. Signal "no adjacent" so the
6510 // cursor can fall back to its end-of-iteration path.
6511 None
6512 }
6513
6514 /// One attempt at `get_adjacent_bin`. The tri-state return
6515 /// value distinguishes "no adjacent BIN exists" (which the
6516 /// caller should propagate as end-of-iteration) from "a
6517 /// concurrent split invalidated our path" (which the caller
6518 /// should retry from root).
6519 fn get_adjacent_bin_attempt(
6520 &self,
6521 root: &Arc<RwLock<TreeNode>>,
6522 current_key: &[u8],
6523 forward: bool,
6524 ) -> AdjacentBinOutcome {
6525 // Path entry: (parent_arc, slot_idx_taken, child_arc_reached).
6526 // The child Arc lets the ascent validate that the slot still
6527 // points to the same node we descended through.
6528 let mut path: Vec<(
6529 Arc<RwLock<TreeNode>>,
6530 usize,
6531 Arc<RwLock<TreeNode>>,
6532 )> = Vec::new();
6533
6534 let mut guard: parking_lot::ArcRwLockReadGuard<
6535 parking_lot::RawRwLock,
6536 TreeNode,
6537 > = root.read_arc();
6538 loop {
6539 if guard.is_bin() {
6540 break;
6541 }
6542
6543 let (next_arc, slot_idx) = match &*guard {
6544 TreeNode::Internal(n) => {
6545 if n.entries.is_empty() {
6546 return AdjacentBinOutcome::NoAdjacent;
6547 }
6548 // R3 fix: use comparator-aware upper_in_floor_index so
6549 // that custom-comparator / sorted-dup databases descend
6550 // to the correct child. Mirrors JE Tree.getNextIN which
6551 // uses IN.findEntry (comparator-aware) not raw byte order.
6552 let idx =
6553 self.upper_in_floor_index(&n.entries, current_key);
6554 let child = match n.get_child(idx) {
6555 Some(c) => c,
6556 None => return AdjacentBinOutcome::NoAdjacent,
6557 };
6558 (child, idx)
6559 }
6560 TreeNode::Bottom(_) => unreachable!(),
6561 };
6562
6563 // Record the parent and the child we are about to enter
6564 // — the child Arc lets the ascent validate the slot.
6565 let parent_arc =
6566 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
6567 path.push((parent_arc, slot_idx, Arc::clone(&next_arc)));
6568
6569 // Hand-over-hand: take child read lock BEFORE releasing parent.
6570 let next_guard = next_arc.read_arc();
6571 drop(guard);
6572 guard = next_guard;
6573 }
6574 drop(guard);
6575
6576 // Ascend the path. At each level, validate that
6577 // `parent.entries[taken_idx].child == descended_child` before
6578 // trusting `taken_idx` as a coordinate. If not, return
6579 // `SplitRaceRetry` so the caller restarts from root.
6580 while let Some((parent_arc, taken_idx, descended_child)) = path.pop() {
6581 let parent_guard = parent_arc.read();
6582 let (n_entries, slot_still_valid) = match &*parent_guard {
6583 TreeNode::Internal(p) => {
6584 let n = p.entries.len();
6585 let valid = p
6586 .child_ref(taken_idx)
6587 .is_some_and(|c| Arc::ptr_eq(c, &descended_child));
6588 (n, valid)
6589 }
6590 _ => return AdjacentBinOutcome::NoAdjacent,
6591 };
6592 drop(parent_guard);
6593
6594 if !slot_still_valid {
6595 return AdjacentBinOutcome::SplitRaceRetry;
6596 }
6597
6598 let sibling_idx = if forward {
6599 taken_idx + 1
6600 } else if taken_idx == 0 {
6601 // No left sibling at this level — ascend further.
6602 continue;
6603 } else {
6604 taken_idx - 1
6605 };
6606
6607 if forward && sibling_idx >= n_entries {
6608 // No right sibling at this level — ascend further.
6609 continue;
6610 }
6611
6612 // Found a sibling slot — fetch the sibling child arc.
6613 let sibling_arc = {
6614 let g = parent_arc.read();
6615 match &*g {
6616 TreeNode::Internal(p) => match p.get_child(sibling_idx) {
6617 Some(c) => c,
6618 None => return AdjacentBinOutcome::NoAdjacent,
6619 },
6620 _ => return AdjacentBinOutcome::NoAdjacent,
6621 }
6622 };
6623
6624 // Descend to the leftmost (forward) or rightmost (!forward) BIN.
6625 return match Self::descend_to_edge_bin(&sibling_arc, forward) {
6626 Some(v) => AdjacentBinOutcome::Found(v),
6627 None => AdjacentBinOutcome::NoAdjacent,
6628 };
6629 }
6630
6631 // Exhausted path without finding a sibling → no adjacent BIN.
6632 AdjacentBinOutcome::NoAdjacent
6633 }
6634
6635 /// Descend to the leftmost BIN (`forward = true`) or rightmost BIN
6636 /// (`forward = false`) in the sub-tree rooted at `node_arc`.
6637 ///
6638 /// `Tree.searchSubTree(SearchType.LEFT / RIGHT, targetLevel)`.
6639 fn descend_to_edge_bin(
6640 node_arc: &Arc<RwLock<TreeNode>>,
6641 forward: bool,
6642 ) -> Option<Vec<(BinEntry, Lsn, Vec<u8>)>> {
6643 // Hand-over-hand latch coupling — see Tree::search.
6644 let mut guard: parking_lot::ArcRwLockReadGuard<
6645 parking_lot::RawRwLock,
6646 TreeNode,
6647 > = node_arc.read_arc();
6648
6649 loop {
6650 if guard.is_bin() {
6651 return match &*guard {
6652 TreeNode::Bottom(b) => {
6653 // Return entries with full (decompressed) keys so that
6654 // callers always work with complete keys.
6655 //
6656 // TREE-F1: KD slots are NOT filtered here — the BIN's
6657 // slot indices are returned verbatim so the cursor can
6658 // skip KD slots itself (CursorImpl getNext loop;
6659 // CursorImpl.java:2062-2064) and continue to the next
6660 // BIN when an edge BIN is entirely KD during the
6661 // BIN-delta reconstitution window.
6662 let full_entries: Vec<(BinEntry, Lsn, Vec<u8>)> = (0
6663 ..b.entries.len())
6664 .map(|i| {
6665 (
6666 BinEntry {
6667 data: b.entries[i].data.clone(),
6668 known_deleted: b.entries[i]
6669 .known_deleted,
6670 dirty: b.entries[i].dirty,
6671 expiration_time: b.entries[i]
6672 .expiration_time,
6673 },
6674 b.get_lsn(i),
6675 b.get_full_key(i).unwrap_or_default(),
6676 )
6677 })
6678 .collect();
6679 Some(full_entries)
6680 }
6681 _ => None,
6682 };
6683 }
6684
6685 let next = match &*guard {
6686 TreeNode::Internal(n) => {
6687 if forward {
6688 n.get_child(0)?
6689 } else {
6690 n.get_child(n.entries.len().saturating_sub(1))?
6691 }
6692 }
6693 _ => return None,
6694 };
6695 // Take child read lock BEFORE releasing parent's.
6696 let next_guard = next.read_arc();
6697 drop(guard);
6698 guard = next_guard;
6699 }
6700 }
6701}
6702
6703// ============================================================================
6704// Tree statistics
6705// ============================================================================
6706
/// Structural statistics gathered by walking the tree.
///
/// Produced by `Tree::collect_stats`, which visits the nodes reachable
/// through `resident_children()`. All counters are zero for an empty
/// tree (`Default`).
///
/// JE counterpart: `TreeWalkerStatsAccumulator`.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct TreeStats {
    /// Number of BINs (bottom internal nodes).
    pub n_bins: u64,
    /// Number of upper INs.
    pub n_ins: u64,
    /// Total number of entries across all visited nodes (BIN slots and
    /// upper-IN slots are summed together).
    pub n_entries: u64,
    /// Height of the tree (0 = empty, 1 = root is a BIN, 2 = one level
    /// above BINs, …).
    pub height: u32,
}
6721
6722impl Tree {
6723 /// Walks the entire tree and collects structural statistics.
6724 ///
6725 /// `TreeWalkerStatsAccumulator` pattern — performs a simple
6726 /// recursive DFS and counts INs, BINs, entries, and tree height.
6727 pub fn collect_stats(&self) -> TreeStats {
6728 let mut stats = TreeStats::default();
6729 if let Some(root) = self.get_root() {
6730 Self::collect_stats_recursive(&root, &mut stats, 0);
6731 }
6732 stats
6733 }
6734
6735 fn collect_stats_recursive(
6736 node_arc: &Arc<RwLock<TreeNode>>,
6737 stats: &mut TreeStats,
6738 depth: u32,
6739 ) {
6740 let guard = node_arc.read();
6741
6742 let current_height = depth + 1;
6743 if current_height > stats.height {
6744 stats.height = current_height;
6745 }
6746
6747 match &*guard {
6748 TreeNode::Bottom(b) => {
6749 stats.n_bins += 1;
6750 stats.n_entries += b.entries.len() as u64;
6751 }
6752 TreeNode::Internal(n) => {
6753 stats.n_ins += 1;
6754 stats.n_entries += n.entries.len() as u64;
6755 // Collect child arcs before releasing the guard.
6756 let children: Vec<Arc<RwLock<TreeNode>>> =
6757 n.resident_children();
6758 // Release guard before recursing to avoid lock ordering issues.
6759 drop(guard);
6760 for child in children {
6761 Self::collect_stats_recursive(&child, stats, depth + 1);
6762 }
6763 }
6764 }
6765 }
6766
6767 /// Collects all dirty BINs as (Arc to node, db_id) pairs.
6768 ///
6769 /// The checkpoint path calls this to enumerate BINs that need to be
6770 /// logged. For each dirty BIN the checkpoint decides — based on the
6771 /// BIN-delta threshold — whether to write a full `BIN` entry or a
6772 /// `BINDelta` entry.
6773 ///
6774 /// `Checkpointer.processINList()` which iterates the dirty
6775 /// IN list accumulated during normal operation.
6776 pub fn collect_dirty_bins(
6777 &self,
6778 db_id: u64,
6779 ) -> Vec<(u64, Arc<RwLock<TreeNode>>)> {
6780 let mut result = Vec::new();
6781 if let Some(root) = self.get_root() {
6782 Self::collect_dirty_bins_recursive(&root, db_id, &mut result);
6783 }
6784 result
6785 }
6786
6787 fn collect_dirty_bins_recursive(
6788 node_arc: &Arc<RwLock<TreeNode>>,
6789 db_id: u64,
6790 out: &mut Vec<(u64, Arc<RwLock<TreeNode>>)>,
6791 ) {
6792 let guard = node_arc.read();
6793 match &*guard {
6794 TreeNode::Bottom(b) => {
6795 // Include this BIN if it is dirty or has any dirty slots.
6796 if b.dirty || b.dirty_count() > 0 {
6797 out.push((db_id, Arc::clone(node_arc)));
6798 }
6799 }
6800 TreeNode::Internal(n) => {
6801 let children: Vec<Arc<RwLock<TreeNode>>> =
6802 n.resident_children();
6803 drop(guard);
6804 for child in children {
6805 Self::collect_dirty_bins_recursive(&child, db_id, out);
6806 } // guard already dropped
6807 }
6808 }
6809 }
6810
6811 /// Collect all BINs that have at least one `known_deleted` slot.
6812 ///
6813 /// INCompressor queue-drain scan in the: the daemon iterates
6814 /// the in-memory IN list and identifies BINs that still hold zombie deleted
6815 /// slots. Each returned `Arc` can be passed directly to `compress_bin()`.
6816 pub fn collect_bins_with_known_deleted(
6817 &self,
6818 ) -> Vec<Arc<RwLock<TreeNode>>> {
6819 let mut result = Vec::new();
6820 if let Some(root) = self.get_root() {
6821 Self::collect_bins_with_known_deleted_recursive(&root, &mut result);
6822 }
6823 result
6824 }
6825
6826 fn collect_bins_with_known_deleted_recursive(
6827 node_arc: &Arc<RwLock<TreeNode>>,
6828 out: &mut Vec<Arc<RwLock<TreeNode>>>,
6829 ) {
6830 let guard = node_arc.read();
6831 match &*guard {
6832 TreeNode::Bottom(b) => {
6833 if b.entries.iter().any(|e| e.known_deleted) {
6834 out.push(Arc::clone(node_arc));
6835 }
6836 }
6837 TreeNode::Internal(n) => {
6838 let children: Vec<Arc<RwLock<TreeNode>>> =
6839 n.resident_children();
6840 drop(guard);
6841 for child in children {
6842 Self::collect_bins_with_known_deleted_recursive(
6843 &child, out,
6844 );
6845 }
6846 }
6847 }
6848 }
6849
6850 /// Collect all dirty upper (non-BIN) internal nodes, sorted ascending by
6851 /// level (bottom-up order, BIN level excluded).
6852 ///
6853 /// Serialise an upper-IN node (level > 1) by node_id for off-heap storage.
6854 ///
6855 /// Traverses the tree to find the internal node whose matches,
6856 /// then calls to produce a compact byte
6857 /// representation. Returns if the node is not found or is a BIN
6858 /// (BINs are not upper INs).
6859 ///
6860 /// Mirrors `OffHeapAllocator` serialises the same bytes that would be written
6861 /// to the log, allowing the evictor to store upper-INs off-heap and avoid
6862 /// log-file reads on the next traversal.
6863 pub fn serialize_upper_in(&self, node_id: u64) -> Option<Vec<u8>> {
6864 let root = self.get_root()?;
6865 Self::find_and_serialize_upper_in(&root, node_id)
6866 }
6867
6868 fn find_and_serialize_upper_in(
6869 node_arc: &Arc<RwLock<TreeNode>>,
6870 target_id: u64,
6871 ) -> Option<Vec<u8>> {
6872 let guard = node_arc.read();
6873 match &*guard {
6874 TreeNode::Bottom(_) => None, // BINs are not upper INs
6875 TreeNode::Internal(n) => {
6876 if n.node_id == target_id {
6877 // Serialise InNodeStub for off-heap storage.
6878 // Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE)
6879 // then per-entry: key_len(u32BE) | key | lsn(u64BE)
6880 let mut buf = Vec::new();
6881 buf.extend_from_slice(&n.node_id.to_be_bytes());
6882 buf.extend_from_slice(&n.level.to_be_bytes());
6883 buf.extend_from_slice(
6884 &(n.entries.len() as u32).to_be_bytes(),
6885 );
6886 for (i, e) in n.entries.iter().enumerate() {
6887 buf.extend_from_slice(
6888 &(e.key.len() as u32).to_be_bytes(),
6889 );
6890 buf.extend_from_slice(&e.key);
6891 buf.extend_from_slice(
6892 &n.get_lsn(i).as_u64().to_be_bytes(),
6893 );
6894 }
6895 return Some(buf);
6896 }
6897 // Recurse into children before releasing the guard so we
6898 // hold the minimum read-lock duration.
6899 let children: Vec<Arc<RwLock<TreeNode>>> =
6900 n.resident_children();
6901 drop(guard);
6902 for child in &children {
6903 if let Some(bytes) =
6904 Self::find_and_serialize_upper_in(child, target_id)
6905 {
6906 return Some(bytes);
6907 }
6908 }
6909 None
6910 }
6911 }
6912 }
6913
6914 /// Upper-IN traversal in `Checkpointer.processINList()` from
6915 /// — visits all `TreeNode::Internal` nodes whose `dirty` flag is set
6916 /// and returns them together with their level, sorted lowest-level-first
6917 /// so the checkpointer can log them bottom-up. The root is always the
6918 /// last entry (highest level), which must be logged `Provisional::No`.
6919 pub fn collect_dirty_upper_ins(
6920 &self,
6921 _db_id: u64,
6922 ) -> Vec<(i32, Arc<RwLock<TreeNode>>)> {
6923 let mut result: Vec<(i32, Arc<RwLock<TreeNode>>)> = Vec::new();
6924 if let Some(root) = self.get_root() {
6925 Self::collect_dirty_upper_ins_recursive(&root, &mut result);
6926 }
6927 result.sort_by_key(|(level, _)| *level);
6928 result
6929 }
6930
6931 fn collect_dirty_upper_ins_recursive(
6932 node_arc: &Arc<RwLock<TreeNode>>,
6933 out: &mut Vec<(i32, Arc<RwLock<TreeNode>>)>,
6934 ) {
6935 let guard = node_arc.read();
6936 match &*guard {
6937 TreeNode::Bottom(_) => {
6938 // BINs are handled by flush_dirty_bins_internal; skip here.
6939 }
6940 TreeNode::Internal(n) => {
6941 let is_dirty = n.dirty;
6942 // REC-AA: return the node's ACTUAL tree level (n.level, in
6943 // MAIN_LEVEL|n units), not a root-relative depth. The level
6944 // must be on the same scale as a BIN's `level` (BIN_LEVEL =
6945 // MAIN_LEVEL|1) so that the checkpointer's flush-level
6946 // computation and the evictor's `node_level < flush_level`
6947 // comparison are meaningful. With a root-relative depth the
6948 // root had the SMALLEST value (0) and the IN above the BINs
6949 // the LARGEST, inverting the provisional/non-provisional
6950 // boundary; with n.level the root has the largest level, as JE
6951 // expects.
6952 let level = n.level;
6953 let children: Vec<Arc<RwLock<TreeNode>>> =
6954 n.resident_children();
6955 drop(guard);
6956 // Recurse into children first (bottom-up ordering).
6957 for child in &children {
6958 Self::collect_dirty_upper_ins_recursive(child, out);
6959 }
6960 // Add this node after children (so parent comes after all descendants).
6961 if is_dirty {
6962 out.push((level, Arc::clone(node_arc)));
6963 }
6964 }
6965 }
6966 }
6967
6968 // ========================================================================
6969 // Tree.java ports: 8 additional tree methods (Task #82)
6970 // ========================================================================
6971
/// Returns `true` if the root node is currently loaded in memory.
///
/// Takes the root latch for reading and reports whether the root slot
/// holds a node.
pub fn is_root_resident(&self) -> bool {
    self.root.read().is_some()
}
6978
/// Returns the root node `Arc` if present, or `None`.
///
/// Takes the root latch for reading and clones the stored handle; the
/// latch is released again before this method returns.
pub fn get_resident_root_in(&self) -> Option<Arc<RwLock<TreeNode>>> {
    self.root.read().clone()
}
6985
/// Returns the BIN that should contain a slot for `key` (the "parent" of
/// LN slots).
///
/// Descends from the root, choosing the slot at every upper IN with
/// `upper_in_floor_index`, until a BIN is reached.
///
/// While the chosen child is resident the descent is hand-over-hand: the
/// child's `read_arc()` guard is taken before the parent's guard is
/// dropped. When `get_child` returns `None`, the parent guard is released
/// *first* and the child is obtained through `child_at_or_fetch`; that
/// fallback step is not latch-coupled.
///
/// # Returns
///
/// * `Some(bin)` — the BIN's `Arc`, with **no** lock held. The caller
///   must take whatever lock it needs to operate on the returned BIN.
/// * `None` — the tree has no root, an internal node on the path has no
///   entries, or `child_at_or_fetch` yields no child.
pub fn get_parent_bin_for_child_ln(
    &self,
    key: &[u8],
) -> Option<Arc<RwLock<TreeNode>>> {
    let root = self.get_root()?;
    // `current_arc` always names the node that `guard` currently locks.
    let mut current_arc: Arc<RwLock<TreeNode>> = root.clone();
    let mut guard: parking_lot::ArcRwLockReadGuard<
        parking_lot::RawRwLock,
        TreeNode,
    > = root.read_arc();

    loop {
        if guard.is_bin() {
            // Release the lock before handing the BIN to the caller.
            drop(guard);
            return Some(current_arc);
        }

        // Needed by the fallback path after `guard` has been dropped.
        let parent_arc = current_arc.clone();
        let next_idx = match &*guard {
            TreeNode::Internal(n) => {
                if n.entries.is_empty() {
                    return None;
                }
                let idx = self.upper_in_floor_index(&n.entries, key);
                match n.get_child(idx) {
                    Some(c) => {
                        // Hand-over-hand: lock the child, then release
                        // the parent.
                        let next_guard = c.read_arc();
                        drop(guard);
                        current_arc = c;
                        guard = next_guard;
                        continue;
                    }
                    None => idx, // EV-14/EV-13: re-fetch below.
                }
            }
            TreeNode::Bottom(_) => {
                unreachable!("is_bin() returned false above")
            }
        };
        // Fallback for a slot whose child `get_child` did not return
        // (presumably a non-resident child — confirm against
        // `child_at_or_fetch`). The parent guard is released BEFORE the
        // child is fetched, so this step is not hand-over-hand.
        drop(guard);
        let child = self.child_at_or_fetch(&parent_arc, next_idx)?;
        let next_guard = child.read_arc();
        current_arc = child;
        guard = next_guard;
    }
}
7045
/// Returns the BIN where `key` should be inserted.
///
/// Semantically identical to `get_parent_bin_for_child_ln`; it exists as
/// a separate method only to match the expected API surface.
///
/// Implemented as a plain delegation, so the same contract applies: the
/// BIN is returned with no lock held, and `None` is returned whenever
/// `get_parent_bin_for_child_ln` returns `None`.
pub fn find_bin_for_insert(
    &self,
    key: &[u8],
) -> Option<Arc<RwLock<TreeNode>>> {
    self.get_parent_bin_for_child_ln(key)
}
7060
/// Search for a BIN, allowing splits during descent (preemptive
/// splitting).
///
/// This is a thin wrapper: it delegates to `search()` and returns that
/// method's `Option<SearchResult>` unchanged. No splitting is performed
/// here; per the original documentation the split-allowed descent is
/// carried out by `insert()` internally, and this method only exposes the
/// same result type for callers that just need to locate the BIN.
///
/// Returns `None` whenever `search()` does (documented as: the tree is
/// empty).
pub fn search_splits_allowed(&self, key: &[u8]) -> Option<SearchResult> {
    self.search(key)
}
7073
/// Traverses the resident part of the tree and returns every IN and BIN
/// node reached as a flat list.
///
/// Used by recovery to rebuild the in-memory IN list after log replay.
/// The walk is a recursive depth-first, pre-order traversal starting at
/// the root (each node is listed before its children); only children
/// reported by `resident_children()` are followed. Both Internal and
/// Bottom variants are included in the result.
///
/// Returns an empty vector when the tree has no root.
pub fn rebuild_in_list(&self) -> Vec<Arc<RwLock<TreeNode>>> {
    let mut result = Vec::new();
    if let Some(root) = self.get_root() {
        Self::rebuild_in_list_recursive(&root, &mut result);
    }
    result
}
7088
7089 fn rebuild_in_list_recursive(
7090 node_arc: &Arc<RwLock<TreeNode>>,
7091 out: &mut Vec<Arc<RwLock<TreeNode>>>,
7092 ) {
7093 // Push this node unconditionally — both INs and BINs belong in the list.
7094 out.push(Arc::clone(node_arc));
7095
7096 let guard = node_arc.read();
7097
7098 if let TreeNode::Internal(n) = &*guard {
7099 // Collect child arcs while holding the guard, then drop it before
7100 // recursing to avoid holding multiple locks simultaneously.
7101 let children: Vec<Arc<RwLock<TreeNode>>> = n.resident_children();
7102 drop(guard);
7103 for child in children {
7104 Self::rebuild_in_list_recursive(&child, out);
7105 }
7106 }
7107 // BIN nodes are leaves — no children to recurse into.
7108 }
7109
7110 /// Validates internal tree consistency.
7111 ///
7112 /// . Primarily a debug/test tool.
7113 ///
7114 /// Rules checked:
7115 /// - An empty tree (no root) is trivially valid → returns `true`.
7116 /// - A non-empty tree must have a non-null root.
7117 /// - Every Internal node must have at least one entry.
7118 /// - Every child pointer that is `Some` must be readable (lock must be
7119 /// acquirable — i.e., no poisoned locks).
7120 ///
7121 /// Returns `true` if no inconsistencies are detected, `false` otherwise.
7122 pub fn validate_in_list(&self) -> bool {
7123 match self.get_root() {
7124 None => true, // empty tree is always valid
7125 Some(root) => Self::validate_node(&root),
7126 }
7127 }
7128
7129 fn validate_node(node_arc: &Arc<RwLock<TreeNode>>) -> bool {
7130 let guard = node_arc.read();
7131
7132 match &*guard {
7133 TreeNode::Bottom(_bin) => {
7134 // BIN nodes are always structurally valid at this level.
7135 true
7136 }
7137 TreeNode::Internal(n) => {
7138 // An Internal node must have at least one entry.
7139 if n.entries.is_empty() {
7140 return false;
7141 }
7142 // Collect child arcs before dropping the guard.
7143 let children: Vec<Arc<RwLock<TreeNode>>> =
7144 n.resident_children();
7145 drop(guard);
7146 // Recursively validate every resident child.
7147 for child in children {
7148 if !Self::validate_node(&child) {
7149 return false;
7150 }
7151 }
7152 true
7153 }
7154 }
7155 }
7156
7157 /// Traverses the tree to find the parent IN that contains `child_node_id`
7158 /// as one of its child slots.
7159 ///
7160 /// . Used by the cleaner
7161 /// migration path to re-insert migrated INs after eviction/fetch.
7162 ///
7163 /// Returns `(parent_arc, slot_index)` where `slot_index` is the position
7164 /// in the parent's `entries` vector whose child matches `child_node_id`,
7165 /// or `None` if no such parent is found.
7166 pub fn get_parent_in_for_child_in(
7167 &self,
7168 child_node_id: u64,
7169 ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7170 let root = self.get_root()?;
7171 Self::find_parent_of_node_id(&root, child_node_id)
7172 }
7173
7174 /// Recursive DFS helper for `get_parent_in_for_child_in`.
7175 ///
7176 /// Scans every entry in each Internal node. When a child's node_id
7177 /// matches `target_id` the parent arc and slot index are returned.
7178 fn find_parent_of_node_id(
7179 node_arc: &Arc<RwLock<TreeNode>>,
7180 target_id: u64,
7181 ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
7182 let guard = node_arc.read();
7183
7184 let TreeNode::Internal(n) = &*guard else {
7185 // BIN nodes have no IN children — cannot be a parent of another IN.
7186 return None;
7187 };
7188
7189 // Check whether any child of this IN has the target node_id.
7190 let mut children: Vec<(usize, Arc<RwLock<TreeNode>>)> = Vec::new();
7191 for slot in 0..n.entries.len() {
7192 if let Some(child_arc) = n.child_ref(slot) {
7193 // Read the child's node_id under a separate lock (acquire child
7194 // while parent guard is still held — this is intentional for
7195 // the ID comparison only; we release both immediately after).
7196 let child_id = {
7197 let cg = child_arc.read();
7198 match &*cg {
7199 TreeNode::Internal(cn) => cn.node_id,
7200 TreeNode::Bottom(cb) => cb.node_id,
7201 }
7202 };
7203
7204 if child_id == target_id {
7205 // Found — return a clone of this node as parent.
7206 let parent_clone = Arc::clone(node_arc);
7207 return Some((parent_clone, slot));
7208 }
7209
7210 // Not found at this slot; schedule this child for recursion.
7211 children.push((slot, Arc::clone(child_arc)));
7212 }
7213 }
7214 // Release parent guard before recursing.
7215 drop(guard);
7216
7217 // Recurse into each Internal child.
7218 for (_slot, child_arc) in children {
7219 if let Some(result) =
7220 Self::find_parent_of_node_id(&child_arc, target_id)
7221 {
7222 return Some(result);
7223 }
7224 }
7225
7226 None
7227 }
7228
7229 /// Propagates the dirty flag upward from `node_arc` to the root.
7230 ///
7231 /// Implicit dirty propagation: after modifying any node,
7232 /// all ancestors on the path to the root must also be marked dirty so
7233 /// the checkpointer logs them.
7234 ///
7235 /// In this happens through `IN.setDirty(true)` calls at each level
7236 /// during split/insert callbacks. Here we walk the weak parent chain.
7237 /// Reconstitute a BIN-delta by merging it onto a base full BIN.
7238 ///
7239 /// Implements JE `BINDelta.reconstituteBIN(databaseImpl)` for the recovery
7240 /// path where the log manager is not available as a `LogManager` but as
7241 /// raw serialized bytes.
7242 ///
7243 /// Algorithm:
7244 /// 1. Deserialise `base_bytes` as a full `BinStub`.
7245 /// 2. Apply `delta_bytes` slots onto the base using `BinStub::apply_delta`
7246 /// (raw slot overlay).
7247 /// 3. Recompute key prefix so prefix-compressed entries are consistent.
7248 ///
7249 /// Returns `None` if either byte slice is malformed.
7250 ///
7251 /// JE `BINDelta.reconstituteBIN` / `BINDelta.applyDelta`
7252 /// (DRIFT-10 / Stage 3).
7253 pub fn reconstitute_bin_delta(
7254 base_bytes: &[u8],
7255 delta_bytes: &[u8],
7256 ) -> Option<BinStub> {
7257 let mut base = BinStub::deserialize_full(base_bytes)?;
7258 // Apply the delta slots onto the base.
7259 // Note: BinStub::apply_delta uses slot-index addressing into base.entries,
7260 // extending with new entries when the slot_idx >= base.entries.len().
7261 // After apply_delta we recompute the key prefix to fix prefix compression.
7262 BinStub::apply_delta(&mut base, delta_bytes)?;
7263 // Recompute prefix so prefix-compressed BINs are consistent after merge.
7264 base.recompute_key_prefix();
7265 base.is_delta = false;
7266 base.dirty = false;
7267 Some(base)
7268 }
7269
7270 pub fn propagate_dirty_to_root(node_arc: &Arc<RwLock<TreeNode>>) {
7271 let parent_weak = { node_arc.read().get_parent() };
7272
7273 if let Some(parent_arc) = parent_weak.and_then(|w| w.upgrade()) {
7274 {
7275 let mut g = parent_arc.write();
7276 g.set_dirty(true);
7277 }
7278 // Recurse further up.
7279 Self::propagate_dirty_to_root(&parent_arc);
7280 }
7281 }
7282
7283 // ========================================================================
7284 // IN-redo: JE RecoveryManager.recoverIN / recoverRootIN / recoverChildIN
7285 // ========================================================================
7286
7287 /// Deserialise an upper-IN node from bytes produced by
7288 /// `TreeNode::write_to_bytes()` / `flush_one_tree_upper_ins`.
7289 ///
7290 /// Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE) | dirty(u8)
7291 /// | per-entry: key_len(u16BE) | key | lsn(u64BE)
7292 ///
7293 /// JE `INFileReader.getIN(db)` / `IN.readFromLog`.
7294 pub fn deserialize_upper_in(bytes: &[u8]) -> Option<InNodeStub> {
7295 if bytes.len() < 13 {
7296 return None;
7297 }
7298 let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
7299 let level = i32::from_be_bytes(bytes[8..12].try_into().ok()?);
7300 let n_entries =
7301 u32::from_be_bytes(bytes[12..16].try_into().ok()?) as usize;
7302 // dirty byte (1 byte after n_entries)
7303 if bytes.len() < 17 {
7304 return None;
7305 }
7306 let mut pos = 17usize; // skip node_id(8) + level(4) + n_entries(4) + dirty(1)
7307 let mut entries = Vec::with_capacity(n_entries);
7308 let mut lsns: Vec<Lsn> = Vec::with_capacity(n_entries);
7309 for _ in 0..n_entries {
7310 if pos + 2 > bytes.len() {
7311 return None;
7312 }
7313 let key_len =
7314 u16::from_be_bytes(bytes[pos..pos + 2].try_into().ok()?)
7315 as usize;
7316 pos += 2;
7317 if pos + key_len > bytes.len() {
7318 return None;
7319 }
7320 let key = bytes[pos..pos + key_len].to_vec();
7321 pos += key_len;
7322 if pos + 8 > bytes.len() {
7323 return None;
7324 }
7325 let lsn = noxu_util::Lsn::from_u64(u64::from_be_bytes(
7326 bytes[pos..pos + 8].try_into().ok()?,
7327 ));
7328 pos += 8;
7329 entries.push(InEntry { key });
7330 lsns.push(lsn); // T-3
7331 }
7332 Some(InNodeStub {
7333 node_id,
7334 level,
7335 entries,
7336 // T-4: a freshly deserialized IN has no resident children.
7337 targets: TargetRep::None,
7338 dirty: false,
7339 generation: 0,
7340 parent: None,
7341 lsn_rep: LsnRep::from_lsns(&lsns), // T-3
7342 })
7343 }
7344
7345 /// Deserialise a BIN from bytes produced by `BinStub::serialize_full()`.
7346 ///
7347 /// Thin wrapper so the recovery path does not need to import `BinStub`
7348 /// directly from callers that only have the raw bytes.
7349 ///
7350 /// JE `INFileReader.getIN(db)` for a BIN entry.
7351 pub fn deserialize_bin(bytes: &[u8]) -> Option<BinStub> {
7352 let mut bin = BinStub::deserialize_full(bytes)?;
7353 bin.dirty = false; // freshly loaded from log — clean for now
7354 Some(bin)
7355 }
7356
7357 /// Apply a logged IN/BIN to the in-memory tree during the recovery redo pass.
7358 ///
7359 /// Implements JE `RecoveryManager.recoverIN`:
7360 /// - `is_root` nodes are handled by `recover_root_in`.
7361 /// - non-root nodes are handled by `recover_child_in`.
7362 ///
7363 /// `log_lsn` is the LSN at which this IN/BIN was logged. The currency
7364 /// check in `recover_child_in` uses this to decide whether to replace the
7365 /// in-memory slot (tree slot LSN < log_lsn → replace; equal → noop;
7366 /// greater → skip).
7367 ///
7368 /// JE `RecoveryManager.recoverIN` / `replayOneIN`
7369 /// (RecoveryManager.java ~lines 1200–1280).
7370 pub fn recover_in_redo(
7371 &self,
7372 log_lsn: noxu_util::Lsn,
7373 is_root: bool,
7374 is_bin: bool,
7375 node_data: &[u8],
7376 ) -> InRedoResult {
7377 if is_bin {
7378 let Some(bin) = Self::deserialize_bin(node_data) else {
7379 return InRedoResult::DeserializeFailed;
7380 };
7381 if is_root {
7382 self.recover_root_bin(log_lsn, bin)
7383 } else {
7384 self.recover_child_bin(log_lsn, bin)
7385 }
7386 } else {
7387 let Some(upper) = Self::deserialize_upper_in(node_data) else {
7388 return InRedoResult::DeserializeFailed;
7389 };
7390 if is_root {
7391 self.recover_root_upper_in(log_lsn, upper)
7392 } else {
7393 self.recover_child_upper_in(log_lsn, upper)
7394 }
7395 }
7396 }
7397
7398 /// Recover a root BIN.
7399 ///
7400 /// If no root exists or the existing root is older (lower LSN), install
7401 /// this BIN as the new root.
7402 ///
7403 /// JE `RecoveryManager.recoverRootIN` / `RootUpdater.doWork`
7404 /// (RecoveryManager.java ~lines 1293–1410).
7405 fn recover_root_bin(
7406 &self,
7407 log_lsn: noxu_util::Lsn,
7408 bin: BinStub,
7409 ) -> InRedoResult {
7410 let mut root_guard = self.root.write();
7411 let existing_lsn = *self.root_log_lsn.read();
7412 match &*root_guard {
7413 None => {
7414 // No root — install this BIN as the root.
7415 // JE: `root == null` case in `RootUpdater.doWork`.
7416 let node = TreeNode::Bottom(bin);
7417 *root_guard = Some(Arc::new(RwLock::new(node)));
7418 *self.root_log_lsn.write() = log_lsn;
7419 InRedoResult::Inserted
7420 }
7421 Some(_) => {
7422 // JE: `originalLsn = root.getLsn()`; replace if logLsn > originalLsn.
7423 if log_lsn > existing_lsn {
7424 let node = TreeNode::Bottom(bin);
7425 *root_guard = Some(Arc::new(RwLock::new(node)));
7426 *self.root_log_lsn.write() = log_lsn;
7427 InRedoResult::Replaced
7428 } else {
7429 InRedoResult::Skipped
7430 }
7431 }
7432 }
7433 }
7434
7435 /// Recover a root upper IN.
7436 ///
7437 /// JE `RecoveryManager.recoverRootIN` for a non-BIN root.
7438 fn recover_root_upper_in(
7439 &self,
7440 log_lsn: noxu_util::Lsn,
7441 upper: InNodeStub,
7442 ) -> InRedoResult {
7443 let mut root_guard = self.root.write();
7444 let existing_lsn = *self.root_log_lsn.read();
7445 match &*root_guard {
7446 None => {
7447 let node = TreeNode::Internal(upper);
7448 *root_guard = Some(Arc::new(RwLock::new(node)));
7449 *self.root_log_lsn.write() = log_lsn;
7450 InRedoResult::Inserted
7451 }
7452 Some(_) => {
7453 if log_lsn > existing_lsn {
7454 let node = TreeNode::Internal(upper);
7455 *root_guard = Some(Arc::new(RwLock::new(node)));
7456 *self.root_log_lsn.write() = log_lsn;
7457 InRedoResult::Replaced
7458 } else {
7459 InRedoResult::Skipped
7460 }
7461 }
7462 }
7463 }
7464
7465 /// Recover a non-root BIN.
7466 ///
7467 /// Implements the three-case currency check from JE
7468 /// `RecoveryManager.recoverChildIN`
7469 /// (RecoveryManager.java lines 1412–1500):
7470 ///
7471 /// 1. Node not in tree: skip (parent logged a later structure that already
7472 /// omits this node, or node was deleted).
7473 /// 2. Physical match (slot LSN == log_lsn): noop — already current.
7474 /// 3. Logical match: another version of the node is in the slot.
7475 /// Replace if tree slot LSN < log_lsn (tree is older), skip otherwise.
7476 fn recover_child_bin(
7477 &self,
7478 log_lsn: noxu_util::Lsn,
7479 bin: BinStub,
7480 ) -> InRedoResult {
7481 let node_id = bin.node_id;
7482 let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7483 else {
7484 // Case 1: not in tree.
7485 return InRedoResult::NotInTree;
7486 };
7487 let mut parent = parent_arc.write();
7488 let TreeNode::Internal(ref mut p) = *parent else {
7489 return InRedoResult::NotInTree;
7490 };
7491 let tree_lsn = p.get_lsn(slot); // T-3
7492 if tree_lsn == log_lsn {
7493 // Case 2: physical match — noop.
7494 InRedoResult::Skipped
7495 } else if tree_lsn < log_lsn {
7496 // Case 3: logical match, tree is older — replace.
7497 // JE `parent.recoverIN(idx, inFromLog, logLsn, lastLoggedSize)`.
7498 let new_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin)));
7499 // Set parent back-pointer on the new node.
7500 {
7501 let mut ng = new_arc.write();
7502 if let TreeNode::Bottom(ref mut b) = *ng {
7503 b.parent = Some(Arc::downgrade(&parent_arc));
7504 }
7505 }
7506 p.set_child(slot, Some(new_arc));
7507 p.set_lsn(slot, log_lsn); // T-3
7508 InRedoResult::Replaced
7509 } else {
7510 // tree_lsn > log_lsn: tree already holds a newer version.
7511 InRedoResult::Skipped
7512 }
7513 }
7514
7515 /// Recover a non-root upper IN.
7516 ///
7517 /// JE `RecoveryManager.recoverChildIN` for a non-BIN node.
7518 fn recover_child_upper_in(
7519 &self,
7520 log_lsn: noxu_util::Lsn,
7521 upper: InNodeStub,
7522 ) -> InRedoResult {
7523 let node_id = upper.node_id;
7524 let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
7525 else {
7526 return InRedoResult::NotInTree;
7527 };
7528 let mut parent = parent_arc.write();
7529 let TreeNode::Internal(ref mut p) = *parent else {
7530 return InRedoResult::NotInTree;
7531 };
7532 let tree_lsn = p.get_lsn(slot); // T-3
7533 if tree_lsn == log_lsn {
7534 InRedoResult::Skipped
7535 } else if tree_lsn < log_lsn {
7536 let new_arc = Arc::new(RwLock::new(TreeNode::Internal(upper)));
7537 {
7538 let mut ng = new_arc.write();
7539 if let TreeNode::Internal(ref mut n) = *ng {
7540 n.parent = Some(Arc::downgrade(&parent_arc));
7541 }
7542 }
7543 p.set_child(slot, Some(new_arc));
7544 p.set_lsn(slot, log_lsn); // T-3
7545 InRedoResult::Replaced
7546 } else {
7547 InRedoResult::Skipped
7548 }
7549 }
7550}
7551
/// Result of a single `recover_in_redo` call.
///
/// Tells the caller what the redo of one logged IN/BIN did to the
/// in-memory tree. JE traces the same outcomes in `RecoveryManager` debug
/// logging.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InRedoResult {
    /// Node was inserted as the new root (the tree had no root before).
    Inserted,
    /// Node replaced an older version in the tree: either the root, or a
    /// child slot whose LSN was lower than the logged LSN.
    Replaced,
    /// Node not applied: tree already holds an equal or newer version.
    Skipped,
    /// Node not found in tree (parent logged later structure that excludes
    /// it), or the node located as its parent is not an Internal node.
    NotInTree,
    /// Deserialisation of `node_data` bytes failed.
    DeserializeFailed,
}
7568
/// Global node ID counter for generating unique node IDs.
///
/// This is the SINGLE source of node-ids for the whole tree subsystem. The
/// BIN constructor (`bin.rs`) and `node.rs` route through `generate_node_id`
/// so that, after crash recovery, a freshly allocated node-id is always
/// strictly greater than every node-id present in the recovered log.
///
/// The stored value is the NEXT id to hand out; it starts at 1.
///
/// JE ref: `NodeSequence.getNextLocalNodeId` (a single per-env counter) and
/// `IN.nodeId` allocation; `NodeSequence.initRealNodeId` seeds the counter
/// from the recovered `CheckpointEnd.lastLocalNodeId`. The env seeds this
/// counter post-recovery via `seed_node_id_counter`.
static NODE_ID_COUNTER: std::sync::atomic::AtomicU64 =
    std::sync::atomic::AtomicU64::new(1);

/// Generates a unique node ID.
///
/// Atomically returns the current counter value and advances the counter
/// by one, so concurrent callers never receive the same id.
pub fn generate_node_id() -> u64 {
    NODE_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
}

/// Returns the node-id that would be generated next (without allocating it).
///
/// Used by recovery seeding and by tests to assert no node-id reuse after a
/// restart.
pub fn peek_next_node_id_counter() -> u64 {
    NODE_ID_COUNTER.load(std::sync::atomic::Ordering::SeqCst)
}

/// Seeds the node-id counter so the next generated id is `> last_node_id`.
///
/// Called by `EnvironmentImpl` after recovery with the recovered
/// `use_max_node_id`, mirroring `NodeSequence.initRealNodeId` /
/// `setLastNodeId`: post-restart allocation must never reuse a node-id that
/// is already in the log. Monotonic: never lowers the counter.
pub fn seed_node_id_counter(last_node_id: u64) {
    // `saturating_add` keeps `u64::MAX` from wrapping around to 0.
    let want_next = last_node_id.saturating_add(1);
    // `fetch_max` is the atomic "raise to at least": it stores
    // max(current, want_next) in one step, so a counter that is already
    // past the recovered floor is left unchanged.
    NODE_ID_COUNTER.fetch_max(want_next, std::sync::atomic::Ordering::SeqCst);
}
7618
7619#[cfg(test)]
7620mod tests {
7621 use super::*;
7622
7623 // ====================================================================
7624 // T-3: LsnRep packed-LSN encoding (IN.entryLsnByteArray / getLsn /
7625 // setLsnInternal, IN.java:1752-1935).
7626 // ====================================================================
7627
/// A freshly created rep with no LSN set is the `Empty` variant: it reports
/// zero bytes of memory and reads back `NULL_LSN` at the probed slots.
#[test]
fn lsnrep_empty_is_zero_bytes() {
    let empty = LsnRep::new(64);
    assert!(matches!(empty, LsnRep::Empty));
    assert_eq!(empty.memory_size(), 0);
    // Probe the first and the last slot.
    for slot in [0, 63] {
        assert_eq!(empty.get(slot), NULL_LSN);
    }
}
7637
/// Eight LSNs in the same file must pack into the `Compact` rep, read back
/// unchanged, and cost 4 bytes per slot.
#[test]
fn lsnrep_compact_roundtrip_same_file() {
    // Slot `i` holds file 7, offset 1000 + i.
    let lsn_for = |slot: u32| Lsn::new(7, 1000 + slot);

    let mut packed = LsnRep::new(8);
    for slot in 0..8u32 {
        packed.set(slot as usize, lsn_for(slot), 8);
    }
    assert!(matches!(packed, LsnRep::Compact { .. }));

    for slot in 0..8u32 {
        assert_eq!(packed.get(slot as usize), lsn_for(slot));
    }
    // 8 slots * 4 bytes = 32 bytes, half of the 8 * 8 = 64 a raw u64
    // array would need.
    assert_eq!(packed.memory_size(), 8 * 4);
}
7653
/// Mixing `NULL_LSN` slots with real LSNs of one file must keep the rep
/// `Compact` (NULL is stored via the 0xffffff file-offset sentinel, NOT
/// u64::MAX), and every slot, NULL or not, must read back unchanged.
#[test]
fn lsnrep_null_does_not_force_long() {
    let slots = [Lsn::new(3, 50), NULL_LSN, Lsn::new(3, 60), NULL_LSN];

    let mut rep = LsnRep::new(4);
    for (idx, lsn) in slots.iter().enumerate() {
        rep.set(idx, *lsn, 4);
    }
    assert!(
        matches!(rep, LsnRep::Compact { .. }),
        "NULL slots must NOT force the Long rep"
    );
    for (idx, lsn) in slots.iter().enumerate() {
        assert_eq!(rep.get(idx), *lsn);
    }
}
7672
/// Setting an LSN whose file number is lower than any stored so far must
/// re-base the array (adjustFileNumbers) while staying `Compact`, without
/// disturbing the slots written earlier.
#[test]
fn lsnrep_rebase_on_lower_file_number() {
    // Written in this order: files 10 and 12 first, then the lower file 8,
    // which forces the re-base.
    let slots = [Lsn::new(10, 5), Lsn::new(12, 6), Lsn::new(8, 7)];

    let mut rep = LsnRep::new(3);
    for (idx, lsn) in slots.iter().enumerate() {
        rep.set(idx, *lsn, 3);
    }
    assert!(matches!(rep, LsnRep::Compact { .. }));
    for (idx, lsn) in slots.iter().enumerate() {
        assert_eq!(rep.get(idx), *lsn);
    }
}
7687
/// File numbers that are more than 127 apart cannot be packed: the rep
/// must fall back to `Long` (mutateToLongArray) and still return both LSNs.
#[test]
fn lsnrep_mutates_to_long_on_wide_file_range() {
    let near = Lsn::new(1, 5);
    let far = Lsn::new(1000, 6); // diff 999 > 127 -> Long

    let mut rep = LsnRep::new(2);
    rep.set(0, near, 2);
    rep.set(1, far, 2);

    assert!(matches!(rep, LsnRep::Long(_)));
    assert_eq!(rep.get(0), near);
    assert_eq!(rep.get(1), far);
}
7699
/// An offset above MAX_FILE_OFFSET (0xfffffe) does not fit the packed
/// encoding, so the rep must switch to `Long` and keep the value intact.
#[test]
fn lsnrep_mutates_to_long_on_large_offset() {
    let oversized = Lsn::new(1, 0x00ff_ffff); // > MAX_FILE_OFFSET -> Long

    let mut rep = LsnRep::new(2);
    rep.set(0, Lsn::new(1, 10), 2);
    rep.set(1, oversized, 2);

    assert!(matches!(rep, LsnRep::Long(_)));
    assert_eq!(rep.get(1), oversized);
}
7709
/// `insert_shift` must open a slot and move the later LSNs up by one;
/// `remove_shift` must close it again (INArrayRep.copy).
#[test]
fn lsnrep_insert_and_remove_shift() {
    // Checks that the leading slots of `rep` hold exactly `expected`.
    let assert_slots = |rep: &LsnRep, expected: &[Lsn]| {
        for (idx, want) in expected.iter().enumerate() {
            assert_eq!(rep.get(idx), *want);
        }
    };

    let first = Lsn::new(2, 1);
    let second = Lsn::new(2, 2);
    let third = Lsn::new(2, 3);
    let inserted = Lsn::new(2, 99);

    let mut rep = LsnRep::from_lsns(&[first, second, third]);

    // Open a slot at index 1 and fill it.
    rep.insert_shift(1, 4);
    rep.set(1, inserted, 4);
    assert_slots(&rep, &[first, inserted, second, third]);

    // Removing slot 1 restores the original sequence.
    rep.remove_shift(1);
    assert_slots(&rep, &[first, second, third]);
}
7731
/// A new tree is empty, remembers its database id and has seen no root
/// splits.
#[test]
fn test_empty_tree() {
    let fresh = Tree::new(1, 128);
    assert!(fresh.is_empty());
    assert_eq!(fresh.get_database_id(), 1);
    assert_eq!(fresh.get_root_splits(), 0);
}
7739
/// REC-F2 reproduce-first: `redo_insert` must be idempotent w.r.t. slot
/// currency. JE RecoveryManager.redo() (line ~2512/2544) only replaces a
/// slot when logrecLsn > treeLsn, so replaying an OLDER committed LN for a
/// key must neither revert the data nor move the slot LSN backward, while a
/// strictly NEWER LSN must still replace.
#[test]
fn test_redo_insert_older_lsn_does_not_overwrite_newer_slot() {
    let tree = Tree::new(1, 128);
    let key = b"k".to_vec();

    let lsn_current = Lsn::new(5, 500);
    let lsn_stale = Lsn::new(2, 200);
    let lsn_latest = Lsn::new(9, 900);

    // Install the current version, then replay a stale one for the same key.
    tree.redo_insert(&key, b"new", lsn_current).unwrap();
    tree.redo_insert(&key, b"old", lsn_stale).unwrap();

    // The current value and LSN must survive the stale replay.
    let after_stale = tree.search_with_data(&key).expect("key present");
    assert!(after_stale.found);
    assert_eq!(
        after_stale.data.as_deref(),
        Some(&b"new"[..]),
        "older-LSN redo reverted committed data"
    );
    assert_eq!(
        after_stale.lsn,
        lsn_current.as_u64(),
        "older-LSN redo reset slot LSN backward"
    );

    // Replace-only when log_lsn > slot_lsn (JE lsnCmp > 0): a strictly
    // newer LSN still wins.
    tree.redo_insert(&key, b"newest", lsn_latest).unwrap();
    let after_latest = tree.search_with_data(&key).expect("key present");
    assert_eq!(after_latest.data.as_deref(), Some(&b"newest"[..]));
    assert_eq!(after_latest.lsn, lsn_latest.as_u64());
}
7780
/// Inserting one record into an empty tree reports a new insert, makes the
/// tree non-empty and lets `search` produce a result for the key.
#[test]
fn test_insert_single() {
    let tree = Tree::new(1, 128);
    let key = b"testkey".to_vec();

    let outcome =
        tree.insert(key.clone(), b"testdata".to_vec(), Lsn::new(1, 100));
    assert!(outcome.is_ok());
    assert!(outcome.unwrap()); // `true` means a brand-new key

    assert!(!tree.is_empty());

    // The key must now be reachable through search.
    let located = tree.search(&key);
    assert!(located.is_some());
    let hit = located.unwrap();
    assert!(hit.exact_parent_found || !hit.child_not_resident);
}
7800
/// Several distinct keys are each reported as new inserts and each remains
/// searchable afterwards.
#[test]
fn test_insert_multiple() {
    let tree = Tree::new(1, 128);

    let keys: Vec<Vec<u8>> = ["apple", "banana", "cherry", "date"]
        .iter()
        .map(|name| name.as_bytes().to_vec())
        .collect();

    for (i, key) in keys.iter().enumerate() {
        let payload = format!("data{}", i).into_bytes();
        let lsn = Lsn::new(1, 100 + (i as u32) * 10);
        let outcome = tree.insert(key.clone(), payload, lsn);
        assert!(outcome.is_ok());
        assert!(outcome.unwrap()); // every key is new
    }

    // Every inserted key must be findable.
    for key in &keys {
        assert!(tree.search(key).is_some());
    }
}
7826
/// Inserting the same key twice: the first call is a new insert (`true`),
/// the second is an update of the existing slot (`false`).
#[test]
fn test_insert_duplicate_key() {
    let tree = Tree::new(1, 128);
    let key = b"duplicate".to_vec();

    // First write creates the slot.
    let first = tree.insert(key.clone(), b"first".to_vec(), Lsn::new(1, 100));
    assert!(first.is_ok());
    assert!(first.unwrap());

    // Second write with the same key only updates it.
    let second = tree.insert(key, b"second".to_vec(), Lsn::new(1, 200));
    assert!(second.is_ok());
    assert!(!second.unwrap());
}
7846
/// Searching a tree that holds nothing yields no result.
#[test]
fn test_search_empty_tree() {
    let empty = Tree::new(1, 128);
    let missing = b"noexist".to_vec();

    assert!(empty.search(&missing).is_none());
}
7855
/// `get_first_node` / `get_last_node` return nothing for an empty tree and,
/// after three inserts, point at index 0 and index 2 respectively.
#[test]
fn test_first_and_last_node() {
    let tree = Tree::new(1, 128);

    // Nothing to report while the tree is empty.
    assert!(tree.get_first_node().is_none());
    assert!(tree.get_last_node().is_none());

    // Populate with three ordered keys.
    for (i, name) in ["a", "b", "c"].iter().enumerate() {
        let payload = format!("data{}", i).into_bytes();
        let lsn = Lsn::new(1, 100 + (i as u32) * 10);
        tree.insert(name.as_bytes().to_vec(), payload, lsn).unwrap();
    }

    // Both ends must now resolve.
    let head = tree.get_first_node();
    assert!(head.is_some());
    assert_eq!(head.unwrap().index, 0);

    let tail = tree.get_last_node();
    assert!(tail.is_some());
    assert_eq!(tail.unwrap().index, 2);
}
7881
/// Consecutively generated node ids are strictly increasing.
#[test]
fn test_node_id_generation() {
    // Array elements are evaluated left to right, i.e. in allocation order.
    let ids = [generate_node_id(), generate_node_id(), generate_node_id()];

    assert!(ids[1] > ids[0]);
    assert!(ids[2] > ids[1]);
}
7891
/// `TreeNode::is_bin` / `TreeNode::level` report the variant and the stored
/// level for both a BIN and an upper IN.
#[test]
fn test_tree_node_is_bin() {
    let bottom_stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::new(),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };
    let bottom = TreeNode::Bottom(bottom_stub);
    assert!(bottom.is_bin());
    assert_eq!(bottom.level(), BIN_LEVEL);

    let upper_stub = InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL + 2,
        entries: vec![],
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let upper = TreeNode::Internal(upper_stub);
    assert!(!upper.is_bin());
    assert_eq!(upper.level(), MAIN_LEVEL + 2);
}
7928
/// `find_entry` on a BIN holding "key0".."key4": an existing key is
/// reported at its index with the `EXACT_MATCH` bit set; a missing key
/// searched with `exact = false` reports an index without that bit.
#[test]
fn test_find_entry() {
    // Five identical live slots, keyed "key0" .. "key4".
    let entries: Vec<BinEntry> = (0..5)
        .map(|_| BinEntry {
            data: Some(vec![]),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        })
        .collect();
    let keys: Vec<Vec<u8>> =
        (0..5).map(|i| format!("key{}", i).into_bytes()).collect();

    let bin = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(keys),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    });

    // Present key: index in the low 16 bits, EXACT_MATCH flag set.
    let hit = bin.find_entry(b"key2", false, true);
    assert_eq!(hit & 0xFFFF, 2);
    assert_ne!(hit & EXACT_MATCH, 0);

    // Absent key with exact=false: "key15" sorts between key1 and key2.
    let miss = bin.find_entry(b"key15", false, false);
    assert_eq!(miss & 0xFFFF, 2);
    assert_eq!(miss & EXACT_MATCH, 0);
}
7972
/// With a fanout of 3, the fourth insert must succeed as a new insert
/// (the tree splits rather than returning an error) and the new key must
/// be found exactly afterwards.
#[test]
fn test_insert_until_full() {
    let tree = Tree::new(1, 3); // small max to exercise splits

    // Fill the node up to its maximum.
    for i in 0..3 {
        let outcome = tree.insert(
            format!("key{}", i).into_bytes(),
            format!("data{}", i).into_bytes(),
            Lsn::new(1, 100 + i),
        );
        assert!(outcome.is_ok(), "insert {} should succeed", i);
    }

    // One more than the maximum: must still succeed.
    let overflow_key = b"key3".to_vec();
    let outcome =
        tree.insert(overflow_key.clone(), b"data3".to_vec(), Lsn::new(1, 103));
    assert!(
        outcome.is_ok(),
        "insert after full should trigger split and succeed"
    );
    assert!(outcome.unwrap(), "should be a new insert");

    // The key added last must be locatable.
    let located = tree.search(&overflow_key);
    assert!(located.is_some(), "key3 must be searchable after split");
    assert!(
        located.unwrap().exact_parent_found,
        "key3 must be found exactly"
    );
}
8004
/// F8 regression: insert accounts key + data + `BIN_ENTRY_OVERHEAD`, and
/// delete must subtract the SAME amount, so inserting and then deleting one
/// record returns the memory counter to its starting value. (Previously
/// delete omitted data_len, leaking data_len per delete and biasing the
/// evictor's over-budget view.)
#[test]
fn test_memory_counter_balanced_on_insert_delete_f8() {
    use std::sync::Arc;
    use std::sync::atomic::{AtomicI64, Ordering};

    let counter = Arc::new(AtomicI64::new(0));
    let mut tree = Tree::new(1, 16);
    tree.set_memory_counter(Arc::clone(&counter));

    let key = b"a-key".to_vec();
    let data = vec![0u8; 200]; // non-trivial data length
    let expected_cost = (key.len() + data.len() + BIN_ENTRY_OVERHEAD) as i64;

    tree.insert(key.clone(), data.clone(), Lsn::new(0, 10)).unwrap();
    let after_insert = counter.load(Ordering::Relaxed);
    assert!(after_insert > 0, "insert must increase the counter");
    assert_eq!(
        after_insert,
        expected_cost,
        "insert accounts key + data + per-slot BinEntry overhead"
    );

    let deleted = tree.delete(&key);
    assert!(deleted);
    assert_eq!(
        counter.load(Ordering::Relaxed),
        0,
        "F8: delete must subtract key + data + BIN_ENTRY_OVERHEAD, returning the counter to its pre-insert value (no data_len leak)"
    );
}
8036
/// EV-13 (pass-post): detaching a node must really release the parent's
/// strong `Arc` to the child, not just report freed bytes.
///
/// Historical failure mode (per the original regression note): the evictor
/// credited the node's size and dropped it from the LRU list while the
/// parent slot still held a strong `Arc`, so the node was never freed and
/// the budget was over-credited.
///
/// The test checks three things after `detach_node_by_id`:
/// 1. the returned byte count equals the child's `budgeted_memory_size()`,
/// 2. the parent slot reports no child but still carries a non-null LSN,
/// 3. the test's own handle is the only remaining strong reference.
///
/// JE ref: `IN.detachNode` (`setTarget(idx, null)`) / `Evictor.evict`.
#[test]
fn test_ev13_detach_actually_frees_child() {
    // Small fanout so twelve inserts leave an IN root with resident children.
    let tree = Tree::new(7, 4);
    for step in 0u8..12 {
        let lsn = Lsn::new(1, u32::from(step) + 1);
        tree.insert(vec![b'a' + step], vec![step; 8], lsn).unwrap();
    }

    // The root is the parent of the node we are going to detach.
    let parent_arc = tree.get_root().expect("tree must have a root");

    // Locate the first resident child and record its slot, id and size.
    let (child_idx, bin_id, expected_bytes) = {
        let root_guard = parent_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("root must be an IN after split");
        };
        let (slot, resident) = root_in
            .first_resident_child()
            .expect("root must have a resident child");
        let child_guard = resident.read();
        let id = match &*child_guard {
            TreeNode::Bottom(b) => b.node_id,
            TreeNode::Internal(n2) => n2.node_id,
        };
        let bytes = child_guard.budgeted_memory_size();
        (slot, id, bytes)
    };

    // Keep our own strong handle so the reference count is observable
    // before and after the detach.
    let child_arc = {
        let root_guard = parent_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else { unreachable!() };
        Arc::clone(root_in.child_ref(child_idx).unwrap())
    };
    assert_eq!(
        Arc::strong_count(&child_arc),
        2,
        "precondition: parent slot + test handle hold the child"
    );

    let freed = tree.detach_node_by_id(bin_id);

    // (1) No phantom credit: reported bytes match the measured size.
    assert_eq!(
        freed, expected_bytes,
        "detach must credit the node's real measured heap size"
    );

    // (2) The slot is emptied but kept, LSN included.
    {
        let root_guard = parent_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else { unreachable!() };
        assert!(
            root_in.child_is_none(child_idx),
            "EV-13: parent slot must be detached (child == None)"
        );
        assert!(
            !root_in.get_lsn(child_idx).is_null(),
            "detach keeps the slot LSN so the node can be re-fetched"
        );
    }

    // (3) Only the test handle is left; a count of 2 here would mean the
    //     parent still owns the node (the pre-EV-13 phantom free).
    assert_eq!(
        Arc::strong_count(&child_arc),
        1,
        "EV-13: detach must drop the parent's strong Arc (no phantom free)"
    );
}
8130
/// EV-13: `detach_node_by_id` must leave the memory counter untouched.
///
/// Per the original note, the evictor owns that bookkeeping (via
/// `Arbiter::release_memory`); a second credit from the tree would push
/// `cache_usage` below reality.
#[test]
fn test_ev13_detach_does_not_touch_counter() {
    use std::sync::atomic::{AtomicI64, Ordering};

    let counter = Arc::new(AtomicI64::new(0));
    let mut tree = Tree::new(8, 4);
    tree.set_memory_counter(Arc::clone(&counter));

    for step in 0u8..12 {
        let lsn = Lsn::new(1, u32::from(step) + 1);
        tree.insert(vec![b'a' + step], vec![step; 8], lsn).unwrap();
    }
    // Snapshot taken after all inserts, before any detach.
    let before = counter.load(Ordering::Relaxed);

    // Pick the id of the first resident child of the root.
    let root = tree.get_root().unwrap();
    let bin_id = {
        let root_guard = root.read();
        let TreeNode::Internal(root_in) = &*root_guard else { unreachable!() };
        let first_child = root_in
            .resident_children()
            .into_iter()
            .next()
            .expect("resident child");
        let child_guard = first_child.read();
        match &*child_guard {
            TreeNode::Bottom(b) => b.node_id,
            TreeNode::Internal(n2) => n2.node_id,
        }
    };

    let freed = tree.detach_node_by_id(bin_id);
    assert!(freed > 0, "detach must free a resident child");

    let after = counter.load(Ordering::Relaxed);
    assert_eq!(
        after, before,
        "EV-13: detach must not change the counter (evictor credits once)"
    );
}
8174
/// EV-13: asking to detach the root, or an id that is not in the tree,
/// must do nothing and report 0 bytes.
#[test]
fn test_ev13_detach_root_or_missing_is_noop() {
    let tree = Tree::new(9, 4);
    for step in 0u8..12 {
        let lsn = Lsn::new(1, u32::from(step) + 1);
        tree.insert(vec![b'a' + step], vec![step; 8], lsn).unwrap();
    }

    // Read the root's node id, whichever variant the root happens to be.
    let root_id = {
        let root = tree.get_root().unwrap();
        let guard = root.read();
        match &*guard {
            TreeNode::Bottom(b) => b.node_id,
            TreeNode::Internal(n) => n.node_id,
        }
    };

    let freed_for_root = tree.detach_node_by_id(root_id);
    assert_eq!(
        freed_for_root, 0,
        "root has no parent IN -> detach is a no-op"
    );

    let freed_for_unknown = tree.detach_node_by_id(u64::MAX);
    assert_eq!(
        freed_for_unknown, 0,
        "unknown node id -> detach is a no-op"
    );
}
8206
/// DBI-23 (pass-post): the live `memory_counter` must APPROXIMATE the real
/// in-memory heap of the tree, not the old `key + data + 48` lower bound.
///
/// JE keeps `inMemorySize` (`IN.getBudgetedMemorySize`) in lock-step with
/// the per-node `computeMemorySize`; the over-budget arbiter sees the real
/// figure so eviction fires at the right time. The previous Noxu live
/// path undercounted each BIN slot and never accounted the node-struct
/// fixed overhead, so the counter ran below real heap and the evictor
/// under-fired.
///
/// The test asserts, in order:
/// 1. the signed counter is non-negative (so the unsigned comparisons
///    below cannot be satisfied by a wrapped negative value),
/// 2. the counter is at least `sum(key + data + BIN_ENTRY_OVERHEAD)`,
/// 3. the counter lies within `[80% of total_budgeted_memory, total]`,
///    where `total_budgeted_memory` is the walk-and-sum oracle.
#[test]
fn test_dbi23_live_counter_approximates_real_heap() {
    use std::sync::atomic::{AtomicI64, Ordering};
    let mut tree = Tree::new(42, 32);
    let counter = Arc::new(AtomicI64::new(0));
    tree.set_memory_counter(Arc::clone(&counter));

    // Insert N entries with realistic key+data sizes.
    let n = 400u32;
    for i in 0..n {
        let key = format!("key-{i:08}").into_bytes(); // 12 bytes
        let data = vec![0u8; 64]; // 64 bytes
        tree.insert(key, data, Lsn::new(1, i + 1)).unwrap();
    }

    // The counter is signed. Reject a negative value explicitly: a bare
    // `as u64` would wrap it to a huge number, letting the lower-bound
    // check pass spuriously and printing a nonsensical figure on failure.
    let live_signed = counter.load(Ordering::Relaxed);
    assert!(
        live_signed >= 0,
        "DBI-23: live counter ({live_signed}) must never be negative"
    );
    let live = live_signed as u64; // lossless: checked non-negative above
    let real = tree.total_budgeted_memory();

    // Stable floor: every slot is charged at least its key bytes, its data
    // bytes and the fixed per-slot overhead. This deliberately does NOT
    // assume any particular historical slot size.
    let new_lower_bound: u64 = (0..n)
        .map(|i| {
            let key_len = format!("key-{i:08}").len();
            (key_len + 64 + BIN_ENTRY_OVERHEAD) as u64
        })
        .sum();

    assert!(
        live >= new_lower_bound,
        "DBI-23: live counter ({live}) must be >= the per-slot-correct \
         lower bound ({new_lower_bound})"
    );

    // Within tolerance of real heap (the residual gap is the per-node
    // fixed struct overhead, intentionally not tracked incrementally).
    let lower = real * 80 / 100;
    assert!(
        live >= lower && live <= real,
        "DBI-23: live counter ({live}) must approximate real heap ({real}) \
         within tolerance [{lower}, {real}]"
    );
}
8269
/// Deleting a present key reports success once; repeating the delete
/// reports that nothing was removed.
#[test]
fn test_delete_existing_key() {
    let tree = Tree::new(1, 128);
    let key = b"remove_me";
    tree.insert(key.to_vec(), b"val".to_vec(), Lsn::new(1, 10)).unwrap();

    // First attempt removes the record.
    let removed = tree.delete(key);
    assert!(removed);

    // The key is gone, so a second attempt has nothing to delete.
    let removed_again = tree.delete(key);
    assert!(!removed_again);
}
8281
/// Deleting a key that was never inserted must report `false`, even when
/// the tree holds other keys.
#[test]
fn test_delete_nonexistent_key() {
    let tree = Tree::new(1, 128);
    let seeded = tree.insert(b"a".to_vec(), b"v".to_vec(), Lsn::new(1, 1));
    seeded.unwrap();

    let removed = tree.delete(b"zzz");
    assert!(!removed);
}
8289
/// Deleting from a tree that has never received an insert must report
/// `false`.
#[test]
fn test_delete_empty_tree() {
    let empty_tree = Tree::new(1, 128);
    let removed = empty_tree.delete(b"nothing");
    assert!(!removed);
}
8295
/// Removing every key leaves the root in place (so `is_empty()` stays
/// `false`), while `get_first_node()` yields nothing.
#[test]
fn test_delete_all_entries_makes_bin_empty() {
    let tree = Tree::new(1, 128);
    let records: [(&[u8], &[u8], u32); 2] = [(b"x", b"1", 1), (b"y", b"2", 2)];

    for (key, value, offset) in records {
        tree.insert(key.to_vec(), value.to_vec(), Lsn::new(1, offset)).unwrap();
    }
    for (key, _, _) in records {
        assert!(tree.delete(key));
    }

    // The root node is retained even though it no longer holds entries.
    assert!(!tree.is_empty());
    // With no entries left there is no first node to return.
    assert!(tree.get_first_node().is_none());
}
8310
/// A fresh tree has no root; after `set_root` the root is observable via
/// `get_root`.
#[test]
fn test_set_root_and_get_root() {
    let tree = Tree::new(1, 128);
    assert!(tree.get_root().is_none());

    // An empty BIN is enough to act as the root for this check.
    let empty_bin = BinStub {
        // identity
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        // contents
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        // state flags
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
        cursor_count: 0,
        // logging positions
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
    };
    tree.set_root(TreeNode::Bottom(empty_bin));

    assert!(tree.get_root().is_some());
}
8337
8338 // ========================================================================
8339 // Split / multi-level insert tests (new)
8340 // ========================================================================
8341
/// Inserting enough keys with a tiny fanout must split the root IN itself,
/// producing a root above level 2.
///
/// The test uses `max_entries_per_node = 4` and 20 sorted keys, then checks
/// that the root-split counter is non-zero and that the root's level is
/// strictly greater than `MAIN_LEVEL | 2`.
#[test]
fn test_insert_forces_root_split() {
    let tree = Tree::new(1, 4);

    // 20 sorted keys at fanout 4.
    for seq in 0u32..20 {
        let outcome = tree.insert(
            format!("key{:04}", seq).into_bytes(),
            format!("data{}", seq).into_bytes(),
            Lsn::new(1, 100 + seq),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", seq);
    }

    // The split counter records that the root was replaced at least once.
    let root_splits = tree.get_root_splits();
    assert!(
        root_splits > 0,
        "expected at least one root split after 20 inserts with fanout 4"
    );

    // A root above level 2 means the tree now has three or more levels.
    let root_arc = tree.get_root().unwrap();
    let root_level = root_arc.read().level();
    let level_2 = MAIN_LEVEL | 2;
    assert!(
        root_level > level_2,
        "root level {} must be > level-2 after root split",
        root_level
    );
}
8379
/// Bulk insert of 1000 keys in ascending order; every key must then be
/// found with an exact match.
#[test]
fn test_insert_many_keys() {
    let tree = Tree::new(1, 8);
    let n = 1000u32;

    // Load phase: ascending keys.
    for seq in 0..n {
        let outcome = tree.insert(
            format!("key{:08}", seq).into_bytes(),
            format!("data{}", seq).into_bytes(),
            Lsn::new(1, seq),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", seq);
    }

    // Verify phase: each key must resolve to an exact hit.
    for seq in 0..n {
        let key = format!("key{:08}", seq).into_bytes();
        let found_exactly =
            matches!(tree.search(&key), Some(hit) if hit.exact_parent_found);
        assert!(
            found_exactly,
            "key{:08} must be found after bulk insert",
            seq
        );
    }
}
8405
/// Inserts 500 keys in descending order (a simple non-sorted-ascending
/// sequence) and checks that every key is found with an exact match.
#[test]
fn test_insert_random_keys() {
    let tree = Tree::new(1, 8);
    let n = 500u32;

    // Load phase: highest key first.
    for seq in (0..n).rev() {
        let outcome = tree.insert(
            format!("rkey{:08}", seq).into_bytes(),
            format!("data{}", seq).into_bytes(),
            Lsn::new(1, seq),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", seq);
    }

    // Verify phase, in ascending order.
    for seq in 0..n {
        let key = format!("rkey{:08}", seq).into_bytes();
        let found_exactly =
            matches!(tree.search(&key), Some(hit) if hit.exact_parent_found);
        assert!(found_exactly, "rkey{:08} must be found", seq);
    }
}
8432
/// No key may be lost to a split: after 60 inserts at fanout 3, every
/// inserted key must still be found with an exact match.
#[test]
fn test_split_preserves_all_keys() {
    // Fanout 3 makes splits as frequent as possible.
    let tree = Tree::new(1, 3);
    let n = 60u32;

    // Remember every key so the verification pass is independent of the
    // formatting logic used during the load.
    let mut keys: Vec<Vec<u8>> = Vec::new();
    for seq in 0..n {
        let key = format!("sk{:04}", seq).into_bytes();
        keys.push(key.clone());
        let outcome =
            tree.insert(key, format!("d{}", seq).into_bytes(), Lsn::new(1, seq));
        assert!(outcome.is_ok(), "insert {} must not fail", seq);
    }

    // Every remembered key must have survived all intervening splits.
    for key in &keys {
        let found_exactly =
            matches!(tree.search(key), Some(hit) if hit.exact_parent_found);
        assert!(
            found_exactly,
            "key {:?} must survive all splits",
            std::str::from_utf8(key).unwrap_or("?")
        );
    }
}
8462
/// The tree must get taller as inserts force splits: after 40 keys at
/// fanout 4 the root must have split and sit above level 2.
#[test]
fn test_tree_height_grows() {
    let tree = Tree::new(1, 4);

    // 40 sorted keys at fanout 4 -- enough for the root to split.
    let n = 40u32;
    (0..n).for_each(|seq| {
        tree.insert(
            format!("hk{:08}", seq).into_bytes(),
            format!("d{}", seq).into_bytes(),
            Lsn::new(1, seq),
        )
        .unwrap();
    });

    // The split counter must show at least one root split.
    let root_splits = tree.get_root_splits();
    assert!(
        root_splits > 0,
        "expected root to have split at least once for {} keys with fanout 4",
        n
    );

    // The root must now be above level 2.
    let root_arc = tree.get_root().unwrap();
    let root_level = root_arc.read().level();
    let level_2 = MAIN_LEVEL | 2;
    assert!(
        root_level > level_2,
        "root level {} must be > {} after enough inserts",
        root_level,
        level_2
    );
}
8497
/// `find_entry` with `exact = true` on an Internal node: a present key
/// returns its slot with `EXACT_MATCH` set, an absent key returns `-1`.
#[test]
fn test_find_entry_on_internal_node() {
    // Separator keys k0..k3, one per slot.
    let entries: Vec<InEntry> = (0..4)
        .map(|i| InEntry { key: format!("k{}", i).into_bytes() })
        .collect();
    let internal = TreeNode::Internal(InNodeStub {
        node_id: 1,
        level: MAIN_LEVEL + 2,
        parent: None,
        generation: 0,
        dirty: false,
        entries,
        targets: TargetRep::None,
        lsn_rep: LsnRep::Empty,
    });

    // Present key: flag set, low 16 bits carry the slot index.
    let hit = internal.find_entry(b"k2", false, true);
    assert_ne!(hit & EXACT_MATCH, 0);
    assert_eq!(hit & 0xFFFF, 2);

    // Absent key with exact matching requested.
    let miss = internal.find_entry(b"kx", false, true);
    assert_eq!(miss, -1);
}
8524
// St-H5: with `exact = false`, `find_entry` on an Internal node must yield
// the FLOOR slot (largest entry <= key) rather than the insertion point.
// The node holds separators k0,k1,k2,k3; slot 0 is the leftmost child.
#[test]
fn test_find_entry_internal_nonexact_returns_floor() {
    let entries: Vec<InEntry> = (0..4)
        .map(|i| InEntry { key: format!("k{}", i).into_bytes() })
        .collect();
    let internal = TreeNode::Internal(InNodeStub {
        node_id: 1,
        level: MAIN_LEVEL + 2,
        parent: None,
        generation: 0,
        dirty: false,
        entries,
        targets: TargetRep::None,
        lsn_rep: LsnRep::Empty,
    });

    // Slot index lives in the low 16 bits of the result.
    let floor_slot = |probe: &[u8]| internal.find_entry(probe, false, false) & 0xFFFF;

    // Below every separator -> leftmost child.
    assert_eq!(floor_slot(b"a"), 0);
    // Strictly between k1 and k2 -> k1's slot.
    assert_eq!(floor_slot(b"k1x"), 1);
    // Above every separator -> last slot.
    assert_eq!(floor_slot(b"zzz"), 3);

    // An exact hit is still flagged and reported at its own slot.
    let exact = internal.find_entry(b"k2", false, false);
    assert_ne!(exact & EXACT_MATCH, 0);
    assert_eq!(exact & 0xFFFF, 2);
}
8556
8557 // ========================================================================
8558 // New tests: dirty tracking, generation, parent pointers, log size, stats
8559 // ========================================================================
8560
/// An insert must leave the BIN that received the entry marked dirty.
///
/// Reference behaviour cited by the original note: `Tree.insertLN()` calls
/// `bin.setDirty(true)` after each insert.
#[test]
fn test_insert_marks_bin_dirty() {
    let tree = Tree::new(1, 128);
    tree.insert(b"key1".to_vec(), b"val1".to_vec(), Lsn::new(1, 1))
        .unwrap();

    // The root is expected to be an upper IN whose slot 0 holds the BIN.
    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root");
        };
        root_in.get_child(0).unwrap()
    };

    assert!(bin_arc.read().is_dirty(), "BIN must be dirty after insert");
}
8583
/// Overwriting an existing key must leave the BIN dirty.
#[test]
fn test_update_keeps_bin_dirty() {
    let tree = Tree::new(1, 128);
    // Same key twice: the second call is an update of the first record.
    for (value, offset) in [(&b"v1"[..], 1u32), (&b"v2"[..], 2u32)] {
        tree.insert(b"k".to_vec(), value.to_vec(), Lsn::new(1, offset)).unwrap();
    }

    // Slot 0 of the (Internal) root holds the BIN.
    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root");
        };
        root_in.get_child(0).unwrap()
    };

    let bin_dirty = bin_arc.read().is_dirty();
    assert!(bin_dirty, "BIN must be dirty after update");
}
8603
/// After deleting a key the BIN must be dirty.
///
/// The BIN's dirty flag is cleared by hand after the insert so that a
/// dirty BIN at the end can only be the result of the delete itself. The
/// delete's return value is asserted too, so a silent "key not found"
/// cannot be mistaken for a dirty-tracking failure.
#[test]
fn test_delete_marks_bin_dirty() {
    let tree = Tree::new(1, 128);
    tree.insert(b"del".to_vec(), b"val".to_vec(), Lsn::new(1, 1)).unwrap();

    // Fetches the BIN held in slot 0 of the (Internal) root. The root is
    // re-read on every call, so the lookup is valid before and after the
    // delete.
    let first_bin = || {
        let root_arc = tree.get_root().unwrap();
        let root_guard = root_arc.read();
        match &*root_guard {
            TreeNode::Internal(n) => n.get_child(0).unwrap(),
            _ => panic!("expected Internal root"),
        }
    };

    // Manually clear dirty flag to verify delete re-sets it.
    {
        let bin_arc = first_bin();
        bin_arc.write().set_dirty(false);
        assert!(!bin_arc.read().is_dirty());
    }

    // The key was just inserted, so the delete must report success.
    assert!(tree.delete(b"del"), "delete of an existing key must succeed");

    let bin_arc = first_bin();
    assert!(bin_arc.read().is_dirty(), "BIN must be dirty after delete");
}
8636
/// After the first insert, the BIN's parent pointer must be set and must
/// upgrade to the very same allocation as the tree's root.
#[test]
fn test_bin_parent_pointer_set_on_initial_insert() {
    let tree = Tree::new(1, 128);
    tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();

    // Slot 0 of the (Internal) root holds the BIN.
    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root");
        };
        root_in.get_child(0).unwrap()
    };

    let parent_weak = bin_arc.read().get_parent();
    assert!(parent_weak.is_some(), "BIN must have a parent pointer");

    // Compare by pointer identity, not by value: the weak parent must lead
    // back to the root allocation itself.
    let parent_arc = parent_weak.unwrap().upgrade().unwrap();
    let same_node = Arc::ptr_eq(&parent_arc, &root_arc);
    assert!(same_node, "BIN parent must be the root IN");
}
8662
/// `set_dirty` followed by `is_dirty` must return what was stored, for both
/// the Bottom and the Internal `TreeNode` variants.
#[test]
fn test_dirty_flag_roundtrip() {
    // Bottom variant: starts clean, then dirty, then clean again.
    let mut bottom = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
    });
    assert!(!bottom.is_dirty());
    for flag in [true, false] {
        bottom.set_dirty(flag);
        assert_eq!(bottom.is_dirty(), flag);
    }

    // Internal variant: starts clean, then dirty.
    let mut upper = TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        parent: None,
        generation: 0,
        dirty: false,
        entries: vec![],
        targets: TargetRep::None,
        lsn_rep: LsnRep::Empty,
    });
    assert!(!upper.is_dirty());
    upper.set_dirty(true);
    assert!(upper.is_dirty());
}
8704
/// `set_generation` followed by `get_generation` must return what was
/// stored, for both the Bottom and the Internal `TreeNode` variants.
#[test]
fn test_generation_roundtrip() {
    // Bottom variant: initial value is readable, then an update sticks.
    let mut bottom = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
    });
    let initial = bottom.get_generation();
    assert_eq!(initial, 0);
    bottom.set_generation(42);
    let updated = bottom.get_generation();
    assert_eq!(updated, 42);

    // Internal variant: an update sticks.
    let mut upper = TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        parent: None,
        generation: 0,
        dirty: false,
        entries: vec![],
        targets: TargetRep::None,
        lsn_rep: LsnRep::Empty,
    });
    upper.set_generation(99);
    let updated = upper.get_generation();
    assert_eq!(updated, 99);
}
8743
/// `log_size()` must equal the length of what `write_to_bytes()` produces,
/// for a populated BIN and for a populated IN.
#[test]
fn test_log_size_matches_bytes_len() {
    // BIN with two slots: one carrying data, one without.
    let slot_with_data = BinEntry {
        data: Some(b"d1".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let slot_without_data = BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let bin_node = TreeNode::Bottom(BinStub {
        node_id: 7,
        level: BIN_LEVEL,
        parent: None,
        generation: 5,
        entries: vec![slot_with_data, slot_without_data],
        keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
    });
    let bin_size = bin_node.log_size();
    let bin_bytes = bin_node.write_to_bytes();
    assert_eq!(bin_size, bin_bytes.len());

    // IN with two slots: an empty key and a short key.
    let in_node = TreeNode::Internal(InNodeStub {
        node_id: 8,
        level: MAIN_LEVEL | 2,
        parent: None,
        generation: 0,
        dirty: false,
        entries: vec![InEntry { key: vec![] }, InEntry { key: b"mid".to_vec() }],
        targets: TargetRep::None,
        lsn_rep: LsnRep::Empty,
    });
    let in_size = in_node.log_size();
    let in_bytes = in_node.write_to_bytes();
    assert_eq!(in_size, in_bytes.len());
}
8797
/// The serialized form must start with the node id in big-endian order and
/// carry the dirty flag at byte offset 16.
#[test]
fn test_write_to_bytes_encodes_node_id_and_dirty() {
    // A recognisable id so a byte-order mistake is obvious in a failure.
    let node_id: u64 = 0xDEAD_BEEF_0000_0001;

    let node = TreeNode::Bottom(BinStub {
        node_id,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
    });
    let bytes = node.write_to_bytes();

    // Bytes 0..8: node id, big-endian.
    let id_bytes = &bytes[..8];
    assert_eq!(id_bytes, node_id.to_be_bytes());

    // Offset 16 = node_id (8) + level (4) + n_entries (4): the dirty flag.
    let dirty_byte = bytes[16];
    assert_eq!(dirty_byte, 1u8, "dirty flag must be 1");
}
8826
/// A BIN holding one entry must report a strictly larger `log_size()` than
/// an otherwise identical empty BIN.
#[test]
fn test_log_size_grows_with_entries() {
    // Baseline: no slots, no keys.
    let empty = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
    });

    // Same shape, plus a single data-less slot with a 12-byte key.
    let lone_slot = BinEntry {
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let with_entry = TreeNode::Bottom(BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        entries: vec![lone_slot],
        keys: KeyRep::from_keys(vec![b"longkey_here".to_vec()]),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
    });

    let populated_size = with_entry.log_size();
    let empty_size = empty.log_size();
    assert!(
        populated_size > empty_size,
        "log_size must grow when entries are added"
    );
}
8876
/// `Tree::propagate_dirty_to_root`, started at a BIN, must leave the BIN's
/// parent (here: the root IN) marked dirty.
#[test]
fn test_propagate_dirty_to_root() {
    // Hand-built two-level structure: root IN with a single BIN child.
    let bin_stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        parent: None, // wired to the root once the root exists
        generation: 0,
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin_stub)));

    let root_stub = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        parent: None,
        generation: 0,
        dirty: false,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&bin_arc))]),
        lsn_rep: LsnRep::Empty,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(root_stub)));

    // Complete the child -> parent link with a weak back-pointer.
    let back_pointer = Arc::downgrade(&root_arc);
    bin_arc.write().set_parent(Some(back_pointer));

    // Precondition: the root starts clean.
    assert!(!root_arc.read().is_dirty());

    // Walk upwards from the BIN.
    Tree::propagate_dirty_to_root(&bin_arc);

    let root_dirty = root_arc.read().is_dirty();
    assert!(
        root_dirty,
        "root must be dirty after propagate_dirty_to_root"
    );
}
8926
/// On a tree with no inserts, `collect_stats()` must equal
/// `TreeStats::default()`.
#[test]
fn test_collect_stats_empty_tree() {
    let empty_tree = Tree::new(1, 128);
    let expected = TreeStats::default();
    assert_eq!(empty_tree.collect_stats(), expected);
}
8934
/// After a single insert `collect_stats()` must report one BIN, one upper
/// IN, a height of 2 and at least one entry.
#[test]
fn test_collect_stats_single_insert() {
    let tree = Tree::new(1, 128);
    tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();

    let TreeStats { n_bins, n_ins, height, n_entries, .. } = tree.collect_stats();
    assert_eq!(n_bins, 1, "must have 1 BIN");
    assert_eq!(n_ins, 1, "must have 1 upper IN");
    assert_eq!(height, 2, "single-entry tree has height 2");
    assert!(n_entries >= 1, "must have at least 1 entry total");
}
8946
/// `collect_stats()` after many inserts: the tree must be non-trivial and
/// the reported entry count must cover every inserted key.
///
/// `n_entries` is a combined figure and the stats do not expose a BIN-only
/// entry count, so the strongest sound check is `n_entries >= n`. An
/// earlier version derived a "BIN entries" estimate as
/// `n_entries - n_ins`, but `n_ins` is a node count rather than an entry
/// count, and the resulting disjunction reduced to `n_entries >= n` anyway.
#[test]
fn test_collect_stats_many_inserts() {
    let tree = Tree::new(1, 8);
    let n = 50u32;
    for i in 0..n {
        let key = format!("sk{:04}", i).into_bytes();
        tree.insert(key, b"v".to_vec(), Lsn::new(1, i)).unwrap();
    }
    let stats = tree.collect_stats();

    // Structural sanity: the tree is non-trivial.
    assert!(stats.n_bins > 0, "must have at least one BIN");
    assert!(stats.height >= 2, "multi-entry tree has height >= 2");

    // Every inserted key occupies a slot, so the total can never be
    // smaller than the number of inserts.
    assert!(
        stats.n_entries >= u64::from(n),
        "entry count must account for all inserts"
    );
}
8971
8972 // ========================================================================
8973 // Tests: B-tree merge / compress
8974 // ========================================================================
8975
8976 /// After deleting most keys from a tree, compress() must reduce the BIN
8977 /// count by merging under-full siblings.
8978 ///
8979 /// Strategy: build a large tree (many BINs), delete almost all keys,
8980 /// then verify compress() reduces n_bins and all surviving keys remain
8981 /// findable. We do not hard-code the exact BIN counts because the
8982 /// preemptive splitting strategy determines the exact split points.
8983 #[test]
8984 fn test_compress_merges_underfull_bins() {
8985 let tree = Tree::new(1, 8);
8986
8987 // Insert 64 sorted keys to build a multi-BIN tree.
8988 let n = 64u32;
8989 let keys: Vec<Vec<u8>> =
8990 (0..n).map(|i| format!("cm{:04}", i).into_bytes()).collect();
8991 for (i, key) in keys.iter().enumerate() {
8992 tree.insert(key.clone(), vec![i as u8], Lsn::new(1, i as u32))
8993 .unwrap();
8994 }
8995
8996 let stats_full = tree.collect_stats();
8997 assert!(
8998 stats_full.n_bins >= 2,
8999 "must have multiple BINs after 64 inserts"
9000 );
9001
9002 // Delete all but 4 widely-spaced keys (one roughly per BIN pair).
9003 // We keep every 16th key: k0000, k0016, k0032, k0048.
9004 let keep: std::collections::HashSet<u32> =
9005 [0, 16, 32, 48].iter().cloned().collect();
9006 for i in 0..n {
9007 if !keep.contains(&i) {
9008 let key = format!("cm{:04}", i).into_bytes();
9009 tree.delete(&key);
9010 }
9011 }
9012
9013 let stats_sparse = tree.collect_stats();
9014 assert!(
9015 stats_sparse.n_bins >= 2,
9016 "should still have multiple BINs before compress"
9017 );
9018
9019 // compress() must reduce BIN count since most BINs now hold 0–1 entries.
9020 tree.compress();
9021
9022 let stats_after = tree.collect_stats();
9023 assert!(
9024 stats_after.n_bins < stats_sparse.n_bins,
9025 "compress must reduce BIN count (was {}, now {})",
9026 stats_sparse.n_bins,
9027 stats_after.n_bins
9028 );
9029
9030 // Surviving keys must still be findable.
9031 for i in keep {
9032 let key = format!("cm{:04}", i).into_bytes();
9033 let sr = tree.search(&key);
9034 assert!(
9035 sr.is_some() && sr.unwrap().exact_parent_found,
9036 "key cm{:04} must survive compress",
9037 i
9038 );
9039 }
9040 }
9041
9042 /// compress() preserves all entries: a full-BIN tree has fewer merges
9043 /// but all keys remain accessible.
9044 #[test]
9045 fn test_compress_no_op_when_full() {
9046 // Insert exactly max_entries worth of keys into a single BIN — no split
9047 // will have occurred yet, and the BINs will all be reasonably full.
9048 // We can't prevent splits entirely (preemptive), but we can verify that
9049 // compress() never loses entries.
9050 let tree = Tree::new(1, 8);
9051 let n = 32u32;
9052 for i in 0..n {
9053 let key = format!("fn{:04}", i).into_bytes();
9054 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9055 }
9056
9057 let stats_before = tree.collect_stats();
9058 tree.compress();
9059 let stats_after = tree.collect_stats();
9060
9061 // All keys still findable.
9062 for i in 0..n {
9063 let key = format!("fn{:04}", i).into_bytes();
9064 let sr = tree.search(&key);
9065 assert!(
9066 sr.is_some() && sr.unwrap().exact_parent_found,
9067 "key fn{:04} must be findable after compress",
9068 i
9069 );
9070 }
9071
9072 // BIN count must not increase.
9073 assert!(
9074 stats_after.n_bins <= stats_before.n_bins,
9075 "compress must not increase BIN count"
9076 );
9077 }
9078
9079 /// compress() on an empty tree must not panic.
9080 #[test]
9081 fn test_compress_empty_tree() {
9082 let tree = Tree::new(1, 4);
9083 tree.compress(); // must not panic
9084 }
9085
9086 /// After deleting all entries, compress() reduces BINs to 1.
9087 #[test]
9088 fn test_compress_removes_empty_bin_from_parent() {
9089 let tree = Tree::new(1, 4);
9090 // Insert enough keys to generate multiple BINs.
9091 let n = 16u32;
9092 for i in 0..n {
9093 let key = format!("ep{:04}", i).into_bytes();
9094 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9095 }
9096
9097 let stats_before = tree.collect_stats();
9098 assert!(stats_before.n_bins >= 2, "need multiple BINs for this test");
9099
9100 // Delete everything except the very last key.
9101 for i in 0..n - 1 {
9102 let key = format!("ep{:04}", i).into_bytes();
9103 tree.delete(&key);
9104 }
9105
9106 tree.compress();
9107
9108 let stats_after = tree.collect_stats();
9109 assert!(
9110 stats_after.n_bins < stats_before.n_bins,
9111 "compress must reduce BIN count after mass deletion"
9112 );
9113
9114 // The surviving key must still be findable.
9115 let last_key = format!("ep{:04}", n - 1).into_bytes();
9116 let sr = tree.search(&last_key);
9117 assert!(
9118 sr.is_some() && sr.unwrap().exact_parent_found,
9119 "last key must survive after compress"
9120 );
9121 }
9122
9123 // ========================================================================
9124 // IC-1: prune_empty_bin must NOT remove a live entry when the BIN was
9125 // repopulated between the compressor observing it empty and the prune.
9126 // (Tree corruption / lost-write regression test.)
9127 // ========================================================================
9128
9129 /// Find a BIN arc that is currently empty (0 entries) and is NOT the
9130 /// root, returning it together with the `id_key` the compressor would
9131 /// have captured (here we just use any key that routes to that BIN).
9132 fn first_empty_non_root_bin(tree: &Tree) -> Option<Arc<RwLock<TreeNode>>> {
9133 let root = tree.get_root()?;
9134 for node in tree.rebuild_in_list() {
9135 if Arc::ptr_eq(&node, &root) {
9136 continue; // skip root (single-BIN tree is never pruned)
9137 }
9138 let is_empty_bin = {
9139 let g = node.read();
9140 matches!(&*g, TreeNode::Bottom(b) if b.entries.is_empty())
9141 };
9142 if is_empty_bin {
9143 return Some(node);
9144 }
9145 }
9146 None
9147 }
9148
9149 /// IC-1 (fail-pre / pass-post): the old `compress_bin` prune step called
9150 /// `self.delete(&id_key)`, which re-descends by key. If a concurrent
9151 /// insert repopulated the empty BIN with a LIVE entry under that same
9152 /// `id_key`, `self.delete` would silently remove the live entry — a lost
9153 /// write. `prune_empty_bin` re-validates `n_entries == 0` under the
9154 /// parent latch and must REMOVE NOTHING when the BIN is non-empty.
9155 ///
9156 /// JE `Tree.delete` / `searchDeletableSubTree` (Tree.java ~line 755-800):
9157 /// `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort prune).
9158 #[test]
9159 fn test_ic1_prune_empty_bin_aborts_when_repopulated() {
9160 let tree = Tree::new(1, 4);
9161 let n = 16u32;
9162 for i in 0..n {
9163 let key = format!("ic{:04}", i).into_bytes();
9164 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9165 }
9166 assert!(
9167 tree.collect_stats().n_bins >= 2,
9168 "need multiple BINs for this test"
9169 );
9170
9171 // Empty out one whole BIN by deleting every key it holds. We delete
9172 // the lowest 4 keys (ic0000..ic0003) which share the first BIN, then
9173 // physically compress it so it has 0 entries.
9174 for i in 0..4 {
9175 let key = format!("ic{:04}", i).into_bytes();
9176 tree.delete(&key);
9177 }
9178
9179 // Locate the now-empty BIN and the id_key the compressor would use.
9180 let empty_bin = match first_empty_non_root_bin(&tree) {
9181 Some(b) => b,
9182 // If the layout didn't leave an isolated empty BIN, the scenario
9183 // isn't reproducible on this build; treat as vacuously passing.
9184 None => return,
9185 };
9186
9187 // SIMULATE THE RACE: a concurrent insert repopulates the empty BIN
9188 // with a LIVE entry *before* the prune runs. We insert directly into
9189 // the BIN arc to model the insert that lands after `now_empty` was
9190 // read. Pick a key that routes to this BIN.
9191 let live_key = format!("ic{:04}", 1).into_bytes(); // was deleted above
9192 {
9193 let mut g = empty_bin.write();
9194 if let TreeNode::Bottom(b) = &mut *g {
9195 // T-2/T-3: route through the insert helper so entries/keys/
9196 // lsn_rep stay in lock step.
9197 b.insert_with_prefix(
9198 live_key.clone(),
9199 Lsn::new(1, 1),
9200 Some(vec![0xAB]),
9201 );
9202 }
9203 }
9204 let id_key = {
9205 let g = empty_bin.read();
9206 match &*g {
9207 TreeNode::Bottom(b) => b.get_full_key(0).unwrap(),
9208 _ => unreachable!(),
9209 }
9210 };
9211
9212 // Prune must ABORT (return false) because the BIN is no longer empty,
9213 // and must NOT remove the live entry.
9214 let pruned = tree.prune_empty_bin(&id_key);
9215 assert!(!pruned, "IC-1: prune must abort when the BIN was repopulated");
9216
9217 // The live entry must still be present in the BIN.
9218 let still_there = {
9219 let g = empty_bin.read();
9220 match &*g {
9221 TreeNode::Bottom(b) => {
9222 b.entries.iter().enumerate().any(|(i, _)| {
9223 b.key_prefix.is_empty() && b.get_key(i) == live_key
9224 })
9225 }
9226 _ => false,
9227 }
9228 };
9229 assert!(
9230 still_there,
9231 "IC-1: prune must not remove the repopulated live entry"
9232 );
9233 }
9234
9235 /// IC-1 companion: prune_empty_bin must abort when a cursor is parked on
9236 /// the (still-empty) BIN. JE: `bin.nCursors() > 0` → CURSORS_EXIST.
9237 #[test]
9238 fn test_ic1_prune_empty_bin_aborts_with_cursor() {
9239 let tree = Tree::new(1, 4);
9240 for i in 0..16u32 {
9241 let key = format!("cu{:04}", i).into_bytes();
9242 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9243 }
9244 for i in 0..4 {
9245 let key = format!("cu{:04}", i).into_bytes();
9246 tree.delete(&key);
9247 }
9248 let empty_bin = match first_empty_non_root_bin(&tree) {
9249 Some(b) => b,
9250 None => return,
9251 };
9252 // Park a cursor on the empty BIN.
9253 Tree::pin_bin(&empty_bin);
9254 // id_key: any key routing to this BIN. Use the first deleted key.
9255 let id_key = format!("cu{:04}", 0).into_bytes();
9256 let pruned = tree.prune_empty_bin(&id_key);
9257 assert!(
9258 !pruned,
9259 "IC-1: prune must abort when a cursor is parked on the BIN"
9260 );
9261 Tree::unpin_bin(&empty_bin);
9262 }
9263
9264 /// IC-1 happy path: prune_empty_bin removes the parent slot when the BIN
9265 /// really is empty, no cursors, not a delta.
9266 #[test]
9267 fn test_ic1_prune_empty_bin_succeeds_when_truly_empty() {
9268 let tree = Tree::new(1, 4);
9269 for i in 0..16u32 {
9270 let key = format!("ok{:04}", i).into_bytes();
9271 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9272 }
9273 for i in 0..4 {
9274 let key = format!("ok{:04}", i).into_bytes();
9275 tree.delete(&key);
9276 }
9277 let bins_before = tree.collect_stats().n_bins;
9278 let empty_bin = match first_empty_non_root_bin(&tree) {
9279 Some(b) => b,
9280 None => return,
9281 };
9282 // id_key: a key that routes to this empty BIN (one of the deleted).
9283 let id_key = {
9284 // route by the lowest deleted key; it falls into the leftmost BIN.
9285 let _ = &empty_bin;
9286 format!("ok{:04}", 0).into_bytes()
9287 };
9288 let pruned = tree.prune_empty_bin(&id_key);
9289 assert!(pruned, "IC-1: prune must succeed on a truly empty BIN");
9290 let bins_after = tree.collect_stats().n_bins;
9291 assert!(
9292 bins_after < bins_before,
9293 "IC-1: pruned BIN slot must be removed from the parent (was {}, now {})",
9294 bins_before,
9295 bins_after
9296 );
9297 // Every surviving key must still be findable.
9298 for i in 4..16u32 {
9299 let key = format!("ok{:04}", i).into_bytes();
9300 assert!(
9301 tree.search(&key).is_some_and(|s| s.exact_parent_found),
9302 "surviving key ok{:04} must remain after prune",
9303 i
9304 );
9305 }
9306 }
9307
9308 // ========================================================================
9309 // Tests: latch-coupling validation (validate_parent_child /
9310 // search_with_coupling)
9311 // ========================================================================
9312
9313 /// validate_parent_child returns true when the parent slot points at the
9314 /// expected child.
9315 #[test]
9316 fn test_validate_parent_child_correct_link() {
9317 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9318 node_id: generate_node_id(),
9319 level: BIN_LEVEL,
9320 entries: vec![],
9321 key_prefix: Vec::new(),
9322 dirty: false,
9323 is_delta: false,
9324 last_full_lsn: NULL_LSN,
9325 last_delta_lsn: NULL_LSN,
9326 generation: 0,
9327 parent: None,
9328 expiration_in_hours: true,
9329 cursor_count: 0,
9330 prohibit_next_delta: false,
9331 lsn_rep: LsnRep::Empty,
9332 keys: KeyRep::new(),
9333 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9334 })));
9335
9336 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
9337 node_id: generate_node_id(),
9338 level: MAIN_LEVEL | 2,
9339 entries: vec![InEntry { key: vec![] }],
9340 targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
9341 dirty: false,
9342 generation: 0,
9343 parent: None,
9344 lsn_rep: LsnRep::Empty,
9345 })));
9346
9347 assert!(
9348 Tree::validate_parent_child(&root_arc, 0, &bin_arc),
9349 "link must be valid when parent slot 0 points at bin_arc"
9350 );
9351 }
9352
9353 /// validate_parent_child returns false when the slot index is out of range.
9354 #[test]
9355 fn test_validate_parent_child_out_of_range() {
9356 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
9357 node_id: generate_node_id(),
9358 level: MAIN_LEVEL | 2,
9359 entries: vec![],
9360 targets: TargetRep::None,
9361 dirty: false,
9362 generation: 0,
9363 parent: None,
9364 lsn_rep: LsnRep::Empty,
9365 })));
9366 let other_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9367 node_id: generate_node_id(),
9368 level: BIN_LEVEL,
9369 entries: vec![],
9370 key_prefix: Vec::new(),
9371 dirty: false,
9372 is_delta: false,
9373 last_full_lsn: NULL_LSN,
9374 last_delta_lsn: NULL_LSN,
9375 generation: 0,
9376 parent: None,
9377 expiration_in_hours: true,
9378 cursor_count: 0,
9379 prohibit_next_delta: false,
9380 lsn_rep: LsnRep::Empty,
9381 keys: KeyRep::new(),
9382 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9383 })));
9384
9385 assert!(
9386 !Tree::validate_parent_child(&root_arc, 0, &other_arc),
9387 "link must be invalid when parent has no entries"
9388 );
9389 }
9390
9391 /// validate_parent_child returns false when the slot points at a different Arc.
9392 #[test]
9393 fn test_validate_parent_child_wrong_child() {
9394 let bin_a = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9395 node_id: generate_node_id(),
9396 level: BIN_LEVEL,
9397 entries: vec![],
9398 key_prefix: Vec::new(),
9399 dirty: false,
9400 is_delta: false,
9401 last_full_lsn: NULL_LSN,
9402 last_delta_lsn: NULL_LSN,
9403 generation: 0,
9404 parent: None,
9405 expiration_in_hours: true,
9406 cursor_count: 0,
9407 prohibit_next_delta: false,
9408 lsn_rep: LsnRep::Empty,
9409 keys: KeyRep::new(),
9410 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9411 })));
9412 let bin_b = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
9413 node_id: generate_node_id(),
9414 level: BIN_LEVEL,
9415 entries: vec![],
9416 key_prefix: Vec::new(),
9417 dirty: false,
9418 is_delta: false,
9419 last_full_lsn: NULL_LSN,
9420 last_delta_lsn: NULL_LSN,
9421 generation: 0,
9422 parent: None,
9423 expiration_in_hours: true,
9424 cursor_count: 0,
9425 prohibit_next_delta: false,
9426 lsn_rep: LsnRep::Empty,
9427 keys: KeyRep::new(),
9428 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9429 })));
9430
9431 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
9432 node_id: generate_node_id(),
9433 level: MAIN_LEVEL | 2,
9434 entries: vec![InEntry { key: vec![] }],
9435 targets: TargetRep::Sparse(vec![(0, bin_a)]),
9436 dirty: false,
9437 generation: 0,
9438 parent: None,
9439 lsn_rep: LsnRep::Empty,
9440 })));
9441
9442 assert!(
9443 !Tree::validate_parent_child(&root_arc, 0, &bin_b),
9444 "link must be invalid when parent slot points at a different Arc"
9445 );
9446 }
9447
9448 /// search_with_coupling finds the same key as search().
9449 #[test]
9450 fn test_search_with_coupling_finds_existing_key() {
9451 let tree = Tree::new(1, 8);
9452 for i in 0u32..20 {
9453 let key = format!("c{:04}", i).into_bytes();
9454 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
9455 }
9456
9457 for i in 0u32..20 {
9458 let key = format!("c{:04}", i).into_bytes();
9459 let sr = tree.search_with_coupling(&key);
9460 assert!(
9461 sr.is_some() && sr.unwrap().exact_parent_found,
9462 "search_with_coupling must find c{:04}",
9463 i
9464 );
9465 }
9466 }
9467
9468 /// search_with_coupling returns false for a key not in the tree.
9469 #[test]
9470 fn test_search_with_coupling_missing_key() {
9471 let tree = Tree::new(1, 8);
9472 tree.insert(b"hello".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
9473
9474 let sr = tree.search_with_coupling(b"zzz");
9475 // The search result must either be None or have exact_parent_found=false.
9476 assert!(
9477 sr.is_none_or(|r| !r.exact_parent_found),
9478 "search_with_coupling must not find a key that was never inserted"
9479 );
9480 }
9481
9482 /// search_with_coupling on an empty tree returns None.
9483 #[test]
9484 fn test_search_with_coupling_empty_tree() {
9485 let tree = Tree::new(1, 8);
9486 assert!(tree.search_with_coupling(b"k").is_none());
9487 }
9488
9489 // ========================================================================
9490 // Tests: BIN-delta reconstitution (apply_delta_to_bin / mutate_to_full_bin)
9491 // ========================================================================
9492
9493 /// apply_delta_to_bin replaces existing entries and inserts new ones.
9494 ///
9495 /// BIN.applyDelta(): delta entries are authoritative and
9496 /// supersede full-BIN entries at the same key.
9497 #[test]
9498 fn test_apply_delta_to_bin_updates_and_inserts() {
9499 let mut base = BinStub {
9500 node_id: 1,
9501 level: BIN_LEVEL,
9502 entries: vec![
9503 BinEntry {
9504 data: Some(b"old_a".to_vec()),
9505 known_deleted: false,
9506 dirty: false,
9507 expiration_time: 0,
9508 },
9509 BinEntry {
9510 data: Some(b"old_c".to_vec()),
9511 known_deleted: false,
9512 dirty: false,
9513 expiration_time: 0,
9514 },
9515 ],
9516 key_prefix: Vec::new(),
9517 dirty: false,
9518 is_delta: false,
9519 last_full_lsn: NULL_LSN,
9520 last_delta_lsn: NULL_LSN,
9521 generation: 0,
9522 parent: None,
9523 expiration_in_hours: true,
9524 cursor_count: 0,
9525 prohibit_next_delta: false,
9526 lsn_rep: LsnRep::Empty,
9527 keys: KeyRep::from_keys(vec![b"a".to_vec(), b"c".to_vec()]),
9528 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9529 };
9530
9531 let delta_entries = vec![
9532 // Update existing key "a" with new data.
9533 (b"a".to_vec(), Lsn::new(1, 10), Some(b"new_a".to_vec())),
9534 // Insert new key "b".
9535 (b"b".to_vec(), Lsn::new(1, 20), Some(b"new_b".to_vec())),
9536 ];
9537
9538 Tree::apply_delta_to_bin(&mut base, delta_entries);
9539
9540 assert!(base.dirty, "base must be dirty after applying delta");
9541
9542 // Collect the full keys for assertions (T-2: keys live in the rep).
9543 let full_keys: Vec<Vec<u8>> = (0..base.entries.len())
9544 .map(|i| base.get_full_key(i).unwrap_or_default())
9545 .collect();
9546
9547 // "a" must be updated.
9548 let a_idx = full_keys.iter().position(|k| k == b"a").unwrap();
9549 assert_eq!(
9550 base.entries[a_idx].data.as_deref(),
9551 Some(b"new_a" as &[u8])
9552 );
9553
9554 // "b" must be newly inserted.
9555 assert!(full_keys.iter().any(|k| k == b"b"));
9556
9557 // "c" must still be present (untouched).
9558 assert!(full_keys.iter().any(|k| k == b"c"));
9559
9560 // Entries must be in sorted order.
9561 let mut sorted = full_keys.clone();
9562 sorted.sort();
9563 assert_eq!(
9564 full_keys, sorted,
9565 "entries must remain sorted after delta apply"
9566 );
9567 }
9568
9569 /// apply_delta_to_bin with an empty delta is a no-op (except dirty flag).
9570 #[test]
9571 fn test_apply_delta_to_bin_empty_delta() {
9572 let mut base = BinStub {
9573 node_id: 1,
9574 level: BIN_LEVEL,
9575 entries: vec![BinEntry {
9576 data: None,
9577 known_deleted: false,
9578 dirty: false,
9579 expiration_time: 0,
9580 }],
9581 key_prefix: Vec::new(),
9582 dirty: false,
9583 is_delta: false,
9584 last_full_lsn: NULL_LSN,
9585 last_delta_lsn: NULL_LSN,
9586 generation: 0,
9587 parent: None,
9588 expiration_in_hours: true,
9589 cursor_count: 0,
9590 prohibit_next_delta: false,
9591 lsn_rep: LsnRep::Empty,
9592 keys: KeyRep::from_keys(vec![b"x".to_vec()]),
9593 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9594 };
9595 let n_before = base.entries.len();
9596 Tree::apply_delta_to_bin(&mut base, vec![]);
9597 assert_eq!(
9598 base.entries.len(),
9599 n_before,
9600 "empty delta must not change entry count"
9601 );
9602 assert!(base.dirty, "dirty must be set even for empty delta apply");
9603 }
9604
9605 /// mutate_to_full_bin reconstitutes a full BIN from a delta + base.
9606 ///
9607 /// BIN.mutateToFullBIN(BIN fullBIN): after mutation the
9608 /// `is_delta` flag must be cleared and the entries must contain both
9609 /// base and delta data.
9610 #[test]
9611 fn test_mutate_to_full_bin_merges_delta_and_base() {
9612 let base = BinStub {
9613 node_id: 2,
9614 level: BIN_LEVEL,
9615 entries: vec![
9616 BinEntry {
9617 data: Some(b"base_aa".to_vec()),
9618 known_deleted: false,
9619 dirty: false,
9620 expiration_time: 0,
9621 },
9622 BinEntry {
9623 data: Some(b"base_cc".to_vec()),
9624 known_deleted: false,
9625 dirty: false,
9626 expiration_time: 0,
9627 },
9628 ],
9629 key_prefix: Vec::new(),
9630 dirty: false,
9631 is_delta: false,
9632 last_full_lsn: NULL_LSN,
9633 last_delta_lsn: NULL_LSN,
9634 generation: 0,
9635 parent: None,
9636 expiration_in_hours: true,
9637 cursor_count: 0,
9638 prohibit_next_delta: false,
9639 lsn_rep: LsnRep::Empty,
9640 keys: KeyRep::from_keys(vec![b"aa".to_vec(), b"cc".to_vec()]),
9641 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9642 };
9643
9644 // The delta has a new entry "bb" and overwrites "aa".
9645 let mut delta = BinStub {
9646 node_id: 2,
9647 level: BIN_LEVEL,
9648 entries: vec![
9649 BinEntry {
9650 data: Some(b"delta_aa".to_vec()),
9651 known_deleted: false,
9652 dirty: false,
9653 expiration_time: 0,
9654 },
9655 BinEntry {
9656 data: Some(b"delta_bb".to_vec()),
9657 known_deleted: false,
9658 dirty: false,
9659 expiration_time: 0,
9660 },
9661 ],
9662 key_prefix: Vec::new(),
9663 dirty: true,
9664 is_delta: true,
9665 last_full_lsn: NULL_LSN,
9666 last_delta_lsn: NULL_LSN,
9667 generation: 0,
9668 parent: None,
9669 expiration_in_hours: true,
9670 cursor_count: 0,
9671 prohibit_next_delta: false,
9672 lsn_rep: LsnRep::Empty,
9673 keys: KeyRep::from_keys(vec![b"aa".to_vec(), b"bb".to_vec()]),
9674 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9675 };
9676
9677 Tree::mutate_to_full_bin(&mut delta, base);
9678
9679 // After mutation the node must be a full BIN.
9680 assert!(
9681 !delta.is_delta,
9682 "is_delta must be false after mutate_to_full_bin"
9683 );
9684 assert!(delta.dirty, "must be dirty after mutation");
9685
9686 // Collect full keys for assertions (T-2: keys live in the rep).
9687 let dk: Vec<Vec<u8>> = (0..delta.entries.len())
9688 .map(|i| delta.get_full_key(i).unwrap_or_default())
9689 .collect();
9690
9691 // "aa" must be the delta version.
9692 let aa_idx = dk.iter().position(|k| k == b"aa").unwrap();
9693 assert_eq!(
9694 delta.entries[aa_idx].data.as_deref(),
9695 Some(b"delta_aa" as &[u8])
9696 );
9697
9698 // "bb" must be present (from delta).
9699 assert!(dk.iter().any(|k| k == b"bb"));
9700
9701 // "cc" must be present (from base).
9702 assert!(dk.iter().any(|k| k == b"cc"));
9703
9704 // Three entries total, in sorted order.
9705 assert_eq!(delta.entries.len(), 3);
9706 let mut sorted = dk.clone();
9707 sorted.sort();
9708 assert_eq!(dk, sorted, "entries must be sorted after mutation");
9709 }
9710
9711 /// is_delta flag is correctly reported by bin_is_delta().
9712 #[test]
9713 fn test_bin_is_delta_flag() {
9714 let mut bin = BinStub {
9715 node_id: 1,
9716 level: BIN_LEVEL,
9717 entries: vec![],
9718 key_prefix: Vec::new(),
9719 dirty: false,
9720 is_delta: false,
9721 last_full_lsn: NULL_LSN,
9722 last_delta_lsn: NULL_LSN,
9723 generation: 0,
9724 parent: None,
9725 expiration_in_hours: true,
9726 cursor_count: 0,
9727 prohibit_next_delta: false,
9728 lsn_rep: LsnRep::Empty,
9729 keys: KeyRep::new(),
9730 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9731 };
9732 assert!(!Tree::bin_is_delta(&bin));
9733 bin.is_delta = true;
9734 assert!(Tree::bin_is_delta(&bin));
9735 }
9736
9737 // ========================================================================
9738 // Tests: mutate_to_full_bin_from_log
9739 // ========================================================================
9740
9741 /// mutate_to_full_bin_from_log is a no-op when the BIN is already full.
9742 #[test]
9743 fn test_mutate_to_full_bin_from_log_already_full() {
9744 let dir = tempfile::tempdir().unwrap();
9745 let fm = std::sync::Arc::new(
9746 noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9747 .unwrap(),
9748 );
9749 let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9750
9751 let mut bin = BinStub {
9752 node_id: 1,
9753 level: BIN_LEVEL,
9754 entries: vec![BinEntry {
9755 data: Some(b"v1".to_vec()),
9756 known_deleted: false,
9757 dirty: false,
9758 expiration_time: 0,
9759 }],
9760 key_prefix: Vec::new(),
9761 dirty: false,
9762 is_delta: false, // already a full BIN
9763 last_full_lsn: NULL_LSN,
9764 last_delta_lsn: NULL_LSN,
9765 generation: 0,
9766 parent: None,
9767 expiration_in_hours: true,
9768 cursor_count: 0,
9769 prohibit_next_delta: false,
9770 lsn_rep: LsnRep::Empty,
9771 keys: KeyRep::from_keys(vec![b"key1".to_vec()]),
9772 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9773 };
9774
9775 Tree::mutate_to_full_bin_from_log(&mut bin, &lm);
9776
9777 // No-op: is_delta was already false, entries unchanged.
9778 assert!(!bin.is_delta);
9779 assert_eq!(bin.entries.len(), 1);
9780 }
9781
9782 /// mutate_to_full_bin_from_log with NULL_LSN promotes delta without base.
9783 ///
9784 /// When last_full_lsn is NULL_LSN the BIN has never been written as a full
9785 /// entry. The function must clear is_delta and leave the delta entries
9786 /// as-is (they are the authoritative full state).
9787 #[test]
9788 fn test_mutate_to_full_bin_from_log_null_lsn() {
9789 let dir = tempfile::tempdir().unwrap();
9790 let fm = std::sync::Arc::new(
9791 noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9792 .unwrap(),
9793 );
9794 let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9795
9796 let mut delta = BinStub {
9797 node_id: 2,
9798 level: BIN_LEVEL,
9799 entries: vec![BinEntry {
9800 data: Some(b"delta_a".to_vec()),
9801 known_deleted: false,
9802 dirty: true,
9803 expiration_time: 0,
9804 }],
9805 key_prefix: Vec::new(),
9806 dirty: true,
9807 is_delta: true,
9808 last_full_lsn: NULL_LSN, // no full BIN ever written
9809 last_delta_lsn: NULL_LSN,
9810 generation: 0,
9811 parent: None,
9812 expiration_in_hours: true,
9813 cursor_count: 0,
9814 prohibit_next_delta: false,
9815 lsn_rep: LsnRep::Empty,
9816 keys: KeyRep::from_keys(vec![b"a".to_vec()]),
9817 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9818 };
9819
9820 Tree::mutate_to_full_bin_from_log(&mut delta, &lm);
9821
9822 // is_delta must be cleared; the single delta entry is kept as-is.
9823 assert!(
9824 !delta.is_delta,
9825 "is_delta must be false after null-lsn promotion"
9826 );
9827 assert_eq!(delta.entries.len(), 1);
9828 assert_eq!(delta.entries[0].data.as_deref(), Some(b"delta_a" as &[u8]));
9829 }
9830
9831 /// mutate_to_full_bin_from_log reads full BIN from log and merges delta.
9832 ///
9833 /// Round-trip: serialize a full BIN, write it to a LogManager, record the
9834 /// LSN, then call mutate_to_full_bin_from_log on a delta referencing that
9835 /// LSN. The result must contain base-only and delta-only entries with the
9836 /// delta winning on conflicts.
9837 #[test]
9838 fn test_mutate_to_full_bin_from_log_reads_and_merges() {
9839 let dir = tempfile::tempdir().unwrap();
9840 let fm = std::sync::Arc::new(
9841 noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100)
9842 .unwrap(),
9843 );
9844 let lm = noxu_log::LogManager::new(fm, 3, 1024 * 1024, 4096);
9845
9846 // Build and serialize the full BIN that will be written to the log.
9847 let full_bin = BinStub {
9848 node_id: 42,
9849 level: BIN_LEVEL,
9850 entries: vec![
9851 BinEntry {
9852 data: Some(b"base_val".to_vec()),
9853 known_deleted: false,
9854 dirty: false,
9855 expiration_time: 0,
9856 },
9857 BinEntry {
9858 data: Some(b"base_shared".to_vec()),
9859 known_deleted: false,
9860 dirty: false,
9861 expiration_time: 0,
9862 },
9863 ],
9864 key_prefix: Vec::new(),
9865 dirty: false,
9866 is_delta: false,
9867 last_full_lsn: NULL_LSN,
9868 last_delta_lsn: NULL_LSN,
9869 generation: 0,
9870 parent: None,
9871 expiration_in_hours: true,
9872 cursor_count: 0,
9873 prohibit_next_delta: false,
9874 lsn_rep: LsnRep::Empty,
9875 keys: KeyRep::from_keys(vec![
9876 b"base_only".to_vec(),
9877 b"shared_key".to_vec(),
9878 ]),
9879 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9880 };
9881
9882 let payload = full_bin.serialize_full();
9883 let full_lsn = lm
9884 .log(
9885 noxu_log::LogEntryType::BIN,
9886 &payload,
9887 noxu_log::Provisional::No,
9888 true,
9889 false,
9890 )
9891 .expect("write full BIN to log");
9892 lm.flush_no_sync().expect("flush log");
9893
9894 // Build a delta BIN referencing the full BIN via last_full_lsn.
9895 let mut delta = BinStub {
9896 node_id: 42,
9897 level: BIN_LEVEL,
9898 entries: vec![
9899 // Overwrites "shared_key" from the base.
9900 BinEntry {
9901 data: Some(b"delta_shared".to_vec()),
9902 known_deleted: false,
9903 dirty: true,
9904 expiration_time: 0,
9905 },
9906 // New key only in the delta.
9907 BinEntry {
9908 data: Some(b"delta_val".to_vec()),
9909 known_deleted: false,
9910 dirty: true,
9911 expiration_time: 0,
9912 },
9913 ],
9914 key_prefix: Vec::new(),
9915 dirty: true,
9916 is_delta: true,
9917 last_full_lsn: full_lsn,
9918 last_delta_lsn: NULL_LSN,
9919 generation: 0,
9920 parent: None,
9921 expiration_in_hours: true,
9922 cursor_count: 0,
9923 prohibit_next_delta: false,
9924 lsn_rep: LsnRep::Empty,
9925 keys: KeyRep::from_keys(vec![
9926 b"shared_key".to_vec(),
9927 b"delta_only".to_vec(),
9928 ]),
9929 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
9930 };
9931
9932 Tree::mutate_to_full_bin_from_log(&mut delta, &lm);
9933
9934 assert!(
9935 !delta.is_delta,
9936 "is_delta must be false after log-based mutation"
9937 );
9938 assert!(delta.dirty, "must be dirty after mutation");
9939
9940 // All three distinct keys must be present.
9941 let find = |k: &[u8]| -> Option<Vec<u8>> {
9942 (0..delta.entries.len())
9943 .find(|&i| delta.get_full_key(i).as_deref() == Some(k))
9944 .and_then(|i| delta.entries[i].data.clone())
9945 };
9946
9947 assert_eq!(
9948 find(b"base_only"),
9949 Some(b"base_val".to_vec()),
9950 "base-only key must be present"
9951 );
9952 assert_eq!(
9953 find(b"shared_key"),
9954 Some(b"delta_shared".to_vec()),
9955 "delta must win on shared_key"
9956 );
9957 assert_eq!(
9958 find(b"delta_only"),
9959 Some(b"delta_val".to_vec()),
9960 "delta-only key must be present"
9961 );
9962 assert_eq!(delta.entries.len(), 3, "must have exactly 3 entries");
9963
9964 // Entries must be in sorted order (by full key).
9965 let full_keys: Vec<Vec<u8>> = (0..delta.entries.len())
9966 .map(|i| delta.get_full_key(i).unwrap())
9967 .collect();
9968 let mut sorted_keys = full_keys.clone();
9969 sorted_keys.sort();
9970 assert_eq!(full_keys, sorted_keys, "entries must be in sorted order");
9971 }
9972
9973 // ========================================================================
9974 // Tests: deserialize_full key prefix recomputation
9975 // ========================================================================
9976
9977 /// deserialize_full recomputes key prefix from loaded full keys.
9978 ///
9979 /// IN.recalcKeyPrefix() called after materializing from log:
9980 /// a BIN loaded from the log should have prefix compression applied so
9981 /// that search performance matches an in-memory BIN.
9982 #[test]
9983 fn test_deserialize_full_recomputes_key_prefix() {
9984 // Build a BIN with a known common prefix and serialize it.
9985 let mut source = BinStub {
9986 node_id: 99,
9987 level: BIN_LEVEL,
9988 entries: vec![
9989 BinEntry {
9990 data: None,
9991 known_deleted: false,
9992 dirty: false,
9993 expiration_time: 0,
9994 },
9995 BinEntry {
9996 data: None,
9997 known_deleted: false,
9998 dirty: false,
9999 expiration_time: 0,
10000 },
10001 BinEntry {
10002 data: None,
10003 known_deleted: false,
10004 dirty: false,
10005 expiration_time: 0,
10006 },
10007 ],
10008 key_prefix: Vec::new(),
10009 dirty: false,
10010 is_delta: false,
10011 last_full_lsn: NULL_LSN,
10012 last_delta_lsn: NULL_LSN,
10013 generation: 0,
10014 parent: None,
10015 expiration_in_hours: true,
10016 cursor_count: 0,
10017 prohibit_next_delta: false,
10018 lsn_rep: LsnRep::Empty,
10019 keys: KeyRep::from_keys(vec![
10020 b"pfx:alpha".to_vec(),
10021 b"pfx:beta".to_vec(),
10022 b"pfx:gamma".to_vec(),
10023 ]),
10024 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10025 };
10026 source.recompute_key_prefix();
10027 // Verify the source has the expected prefix before serializing.
10028 assert_eq!(source.key_prefix, b"pfx:");
10029
10030 let payload = source.serialize_full();
10031
10032 // Deserialize and verify prefix is re-established.
10033 let loaded = BinStub::deserialize_full(&payload)
10034 .expect("deserialization must succeed");
10035
10036 assert_eq!(
10037 loaded.key_prefix, b"pfx:",
10038 "key prefix must be recomputed after deserialize_full"
10039 );
10040
10041 // All full keys must be reconstructable.
10042 for i in 0..loaded.entries.len() {
10043 let fk = loaded.get_full_key(i).unwrap();
10044 assert!(
10045 fk.starts_with(b"pfx:"),
10046 "full key {i} must start with prefix"
10047 );
10048 }
10049 }
10050
10051 /// deserialize_full with a single entry leaves key_prefix empty.
10052 ///
10053 /// A BIN with fewer than 2 entries cannot have a meaningful common prefix.
10054 #[test]
10055 fn test_deserialize_full_single_entry_no_prefix() {
10056 let source = BinStub {
10057 node_id: 7,
10058 level: BIN_LEVEL,
10059 entries: vec![BinEntry {
10060 data: None,
10061 known_deleted: false,
10062 dirty: false,
10063 expiration_time: 0,
10064 }],
10065 key_prefix: Vec::new(),
10066 dirty: false,
10067 is_delta: false,
10068 last_full_lsn: NULL_LSN,
10069 last_delta_lsn: NULL_LSN,
10070 generation: 0,
10071 parent: None,
10072 expiration_in_hours: true,
10073 cursor_count: 0,
10074 prohibit_next_delta: false,
10075 lsn_rep: LsnRep::Empty,
10076 keys: KeyRep::from_keys(vec![b"solo".to_vec()]),
10077 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10078 };
10079
10080 let payload = source.serialize_full();
10081 let loaded = BinStub::deserialize_full(&payload)
10082 .expect("deserialization must succeed");
10083
10084 assert!(
10085 loaded.key_prefix.is_empty(),
10086 "single-entry BIN must have empty prefix"
10087 );
10088 assert_eq!(loaded.get_full_key(0).unwrap(), b"solo");
10089 }
10090
10091 // ========================================================================
10092 // Tests: get_next_bin / get_prev_bin
10093 // ========================================================================
10094
10095 /// get_next_bin returns the entries of the next BIN to the right.
10096 ///
10097 /// Tree.getNextBin() / getNextIN(forward=true).
10098 #[test]
10099 fn test_get_next_bin_basic() {
10100 let tree = Tree::new(1, 4);
10101
10102 // Insert 8 sorted keys — creates multiple BINs.
10103 for i in 0u32..8 {
10104 let key = format!("n{:04}", i).into_bytes();
10105 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10106 }
10107
10108 let stats = tree.collect_stats();
10109 if stats.n_bins < 2 {
10110 // If the tree only has one BIN, skip the sibling test.
10111 return;
10112 }
10113
10114 // A key from the first BIN (e.g. "n0000") should have a next BIN.
10115 let next = tree.get_next_bin(b"n0000");
10116 assert!(
10117 next.is_some(),
10118 "must return a next BIN for a key in the leftmost BIN"
10119 );
10120
10121 let entries = next.unwrap();
10122 assert!(!entries.is_empty(), "next BIN must not be empty");
10123 // All returned keys must be strictly greater than "n0000" because they
10124 // are in a different (rightward) BIN.
10125 for (_, _, k) in &entries {
10126 assert!(
10127 k.as_slice() > b"n0000" as &[u8],
10128 "next BIN entries must all be > the search key"
10129 );
10130 }
10131 }
10132
10133 /// get_next_bin returns None for a key in the rightmost BIN.
10134 #[test]
10135 fn test_get_next_bin_at_rightmost_returns_none() {
10136 let tree = Tree::new(1, 4);
10137 for i in 0u32..8 {
10138 let key = format!("r{:04}", i).into_bytes();
10139 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10140 }
10141 // A key from the rightmost BIN (e.g. "r0007") has no next BIN.
10142 let next = tree.get_next_bin(b"r0007");
10143 assert!(
10144 next.is_none(),
10145 "must return None for a key in the rightmost BIN"
10146 );
10147 }
10148
10149 /// get_prev_bin returns the entries of the next BIN to the left.
10150 ///
10151 /// Tree.getPrevBin() / getNextIN(forward=false).
10152 #[test]
10153 fn test_get_prev_bin_basic() {
10154 let tree = Tree::new(1, 4);
10155 for i in 0u32..8 {
10156 let key = format!("p{:04}", i).into_bytes();
10157 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10158 }
10159
10160 // A key from the second BIN ("p0004") should have a previous BIN.
10161 let prev = tree.get_prev_bin(b"p0004");
10162 assert!(
10163 prev.is_some(),
10164 "must return a prev BIN for a key in the second BIN"
10165 );
10166
10167 let entries = prev.unwrap();
10168 assert!(!entries.is_empty(), "prev BIN must not be empty");
10169 // All returned keys must be < b"p0004".
10170 for (_, _, k) in &entries {
10171 assert!(
10172 k.as_slice() < b"p0004" as &[u8],
10173 "prev BIN entries must all be < the current BIN"
10174 );
10175 }
10176 }
10177
10178 /// get_prev_bin returns None for a key in the leftmost BIN.
10179 #[test]
10180 fn test_get_prev_bin_at_leftmost_returns_none() {
10181 let tree = Tree::new(1, 4);
10182 for i in 0u32..8 {
10183 let key = format!("q{:04}", i).into_bytes();
10184 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10185 }
10186 // A key from the leftmost BIN ("q0000") has no prev BIN.
10187 let prev = tree.get_prev_bin(b"q0000");
10188 assert!(
10189 prev.is_none(),
10190 "must return None for a key in the leftmost BIN"
10191 );
10192 }
10193
10194 /// get_next_bin and get_prev_bin are inverse operations across the
10195 /// BIN boundary.
10196 #[test]
10197 fn test_next_prev_bin_are_symmetric() {
10198 let tree = Tree::new(1, 4);
10199 for i in 0u32..8 {
10200 let key = format!("s{:04}", i).into_bytes();
10201 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10202 }
10203
10204 // From first BIN (s0000): next → second BIN entries.
10205 let next_from_first = tree.get_next_bin(b"s0000").unwrap();
10206 // The smallest key of the next BIN.
10207 let next_first_key =
10208 next_from_first.iter().map(|(_, _, k)| k.clone()).min().unwrap();
10209
10210 // From that key in the second BIN: prev → should overlap with first BIN.
10211 let prev_from_second = tree.get_prev_bin(&next_first_key).unwrap();
10212 let prev_first_key =
10213 prev_from_second.iter().map(|(_, _, k)| k.clone()).max().unwrap();
10214
10215 // The max key of the "prev" result must be in the first BIN (< next boundary).
10216 assert!(
10217 prev_first_key < next_first_key,
10218 "prev BIN entries must be smaller than the boundary key"
10219 );
10220 }
10221
    /// get_next_bin on an empty tree returns None.
    ///
    /// The tree is created with `Tree::new(1, 8)` and nothing is inserted
    /// before the call.
    #[test]
    fn test_get_next_bin_empty_tree() {
        let tree = Tree::new(1, 8);
        // The probe key is arbitrary: no key has been inserted.
        assert!(tree.get_next_bin(b"any").is_none());
    }
10228
    /// get_prev_bin on an empty tree returns None.
    ///
    /// The tree is created with `Tree::new(1, 8)` and nothing is inserted
    /// before the call.
    #[test]
    fn test_get_prev_bin_empty_tree() {
        let tree = Tree::new(1, 8);
        // The probe key is arbitrary: no key has been inserted.
        assert!(tree.get_prev_bin(b"any").is_none());
    }
10235
10236 // =========================================================================
10237 // R3 fix: get_next_bin / get_prev_bin honour the custom comparator
10238 // =========================================================================
10239
10240 /// R3 regression test: with a custom comparator that reverses byte order
10241 /// (descending), `get_next_bin` and `get_prev_bin` must use comparator
10242 /// order when routing through internal nodes.
10243 ///
10244 /// Pre-fix: the static `get_adjacent_bin_attempt` used raw `<=` byte order
10245 /// for IN routing, causing it to descend to the wrong child when comparator
10246 /// order ≠ byte order.
10247 ///
10248 /// The tree is forced to split (max_entries = 4) so there IS an internal
10249 /// node (IN) to route through. Under a reverse comparator the insertion
10250 /// order and stored key order are reversed relative to byte order, so any
10251 /// descent that uses raw byte comparison will pick the wrong slot.
10252 ///
10253 /// Pass-post invariant: iterating forward via repeated `get_next_bin` from
10254 /// the leftmost BIN yields keys in COMPARATOR order (descending byte order
10255 /// here), not in raw ascending byte order.
10256 #[test]
10257 fn test_get_next_prev_bin_custom_comparator_order() {
10258 // Reverse-order comparator: larger bytes sort first.
10259 let reverse_cmp: KeyComparatorFn =
10260 Arc::new(|a: &[u8], b: &[u8]| b.cmp(a));
10261 // Small max_entries so the tree splits and has internal nodes.
10262 let mut tree = Tree::new(1, 4);
10263 tree.set_comparator(reverse_cmp);
10264
10265 // Insert keys that are ascending in byte order ("a" < "b" < … < "i")
10266 // but descending in comparator order (i > h > … > a).
10267 let keys: &[&[u8]] =
10268 &[b"a", b"b", b"c", b"d", b"e", b"f", b"g", b"h", b"i"];
10269 for (i, k) in keys.iter().enumerate() {
10270 tree.insert(
10271 k.to_vec(),
10272 vec![i as u8],
10273 Lsn::from_u64((i + 1) as u64),
10274 )
10275 .unwrap();
10276 }
10277
10278 // Collect all BINs by walking from the comparator-smallest key ("i"
10279 // in reverse order) using get_next_bin. The anchor must be a key that
10280 // is smaller than everything in comparator order, i.e. the largest
10281 // byte-value key. We use the tree's search to find the actual leftmost
10282 // key under the comparator by starting from "i" (comparator-min).
10283 //
10284 // Strategy: start at byte key b"\xff" (larger than any inserted key in
10285 // byte order, so it lands in the last BIN in byte order, which under
10286 // a reverse comparator is the leftmost BIN in comparator order). Then
10287 // walk via get_next_bin.
10288 let start_anchor = b"\xff".as_ref();
10289 let mut bin_first_keys: Vec<Vec<u8>> = Vec::new();
10290
10291 // The first BIN in comparator order contains "i" (largest byte key).
10292 // get_next_bin from a virtual start in that BIN gives the next one.
10293 // Collect by walking from the comparator-last key leftward instead:
10294 // use get_next_bin with anchor = b"\xff" to hop to the next BIN
10295 // (comparator order: next = smaller byte value).
10296 let mut anchor = start_anchor.to_vec();
10297 loop {
10298 match tree.get_next_bin(&anchor) {
10299 None => break,
10300 Some(entries) => {
10301 if let Some((_, _, fk0)) = entries.first() {
10302 let fk = fk0.clone();
10303 bin_first_keys.push(fk.clone());
10304 anchor = fk;
10305 } else {
10306 break;
10307 }
10308 }
10309 }
10310 }
10311
10312 // We must have visited at least 2 BINs (tree was forced to split).
10313 assert!(
10314 bin_first_keys.len() >= 2,
10315 "R3: expected multiple BINs after split, got {}",
10316 bin_first_keys.len()
10317 );
10318
10319 // With a reverse comparator, bin_first_keys must be in descending byte
10320 // order (each successive BIN starts at a smaller byte key).
10321 for window in bin_first_keys.windows(2) {
10322 assert!(
10323 window[0] > window[1],
10324 "R3: BIN boundary keys must be descending (comparator order); \
10325 got {:?} then {:?}",
10326 window[0],
10327 window[1]
10328 );
10329 }
10330 }
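    // ========================================================================
    // Tests: key prefix compression
    // ========================================================================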
10332
10333 /// Inserting keys with a common prefix causes the BIN to establish that
10334 /// prefix. Stored suffixes are shorter than the full keys.
10335 #[test]
10336 fn test_binstub_prefix_established_on_insert() {
10337 let mut bin = BinStub {
10338 node_id: 1,
10339 level: BIN_LEVEL,
10340 entries: Vec::new(),
10341 key_prefix: Vec::new(),
10342 dirty: false,
10343 is_delta: false,
10344 last_full_lsn: NULL_LSN,
10345 last_delta_lsn: NULL_LSN,
10346 generation: 0,
10347 parent: None,
10348 expiration_in_hours: true,
10349 cursor_count: 0,
10350 prohibit_next_delta: false,
10351 lsn_rep: LsnRep::Empty,
10352 keys: KeyRep::new(),
10353 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10354 };
10355
10356 bin.insert_with_prefix(b"record:aaa".to_vec(), Lsn::new(1, 1), None);
10357 assert!(bin.key_prefix.is_empty(), "single entry: no prefix yet");
10358
10359 bin.insert_with_prefix(b"record:bbb".to_vec(), Lsn::new(1, 2), None);
10360 assert_eq!(
10361 &bin.key_prefix, b"record:",
10362 "common prefix 'record:' must be extracted"
10363 );
10364 }
10365
10366 /// `get_full_key` on a BinStub returns the full key regardless of whether
10367 /// the stored key is a raw full key or a suffix.
10368 #[test]
10369 fn test_binstub_get_full_key_roundtrip() {
10370 let mut bin = BinStub {
10371 node_id: 1,
10372 level: BIN_LEVEL,
10373 entries: Vec::new(),
10374 key_prefix: Vec::new(),
10375 dirty: false,
10376 is_delta: false,
10377 last_full_lsn: NULL_LSN,
10378 last_delta_lsn: NULL_LSN,
10379 generation: 0,
10380 parent: None,
10381 expiration_in_hours: true,
10382 cursor_count: 0,
10383 prohibit_next_delta: false,
10384 lsn_rep: LsnRep::Empty,
10385 keys: KeyRep::new(),
10386 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10387 };
10388
10389 let keys = [
10390 b"pfx:first".as_ref(),
10391 b"pfx:second".as_ref(),
10392 b"pfx:third".as_ref(),
10393 ];
10394 for k in keys {
10395 bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10396 }
10397
10398 assert!(!bin.key_prefix.is_empty(), "prefix must be set");
10399
10400 for (i, expected) in keys.iter().enumerate() {
10401 let full = bin.get_full_key(i).expect("must return full key");
10402 assert_eq!(
10403 full.as_slice(),
10404 *expected,
10405 "get_full_key({}) must return full key",
10406 i
10407 );
10408 }
10409 }
10410
10411 /// `find_entry_compressed` on a BinStub with active prefix returns the
10412 /// correct slot index.
10413 #[test]
10414 fn test_binstub_find_entry_compressed() {
10415 let mut bin = BinStub {
10416 node_id: 1,
10417 level: BIN_LEVEL,
10418 entries: Vec::new(),
10419 key_prefix: Vec::new(),
10420 dirty: false,
10421 is_delta: false,
10422 last_full_lsn: NULL_LSN,
10423 last_delta_lsn: NULL_LSN,
10424 generation: 0,
10425 parent: None,
10426 expiration_in_hours: true,
10427 cursor_count: 0,
10428 prohibit_next_delta: false,
10429 lsn_rep: LsnRep::Empty,
10430 keys: KeyRep::new(),
10431 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10432 };
10433
10434 for k in
10435 [b"db:alpha".as_ref(), b"db:beta".as_ref(), b"db:gamma".as_ref()]
10436 {
10437 bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10438 }
10439
10440 let (idx, found) = bin.find_entry_compressed(b"db:beta");
10441 assert!(found, "db:beta must be found");
10442 assert_eq!(idx, 1, "db:beta must be at index 1");
10443
10444 let (_, not_found) = bin.find_entry_compressed(b"db:zzz");
10445 assert!(!not_found, "db:zzz must not be found");
10446 }
10447
10448 /// Tree insert/search works correctly when BINs accumulate a key prefix.
10449 #[test]
10450 fn test_tree_insert_search_with_prefix_compression() {
10451 let tree = Tree::new(1, 8);
10452 let n = 200u32;
10453
10454 // All keys share a long common prefix — good for prefix compression.
10455 for i in 0..n {
10456 let key = format!("namespace:entity:{:06}", i).into_bytes();
10457 let data = vec![i as u8];
10458 tree.insert(key, data, Lsn::new(1, i)).unwrap();
10459 }
10460
10461 // All keys must be findable.
10462 for i in 0..n {
10463 let key = format!("namespace:entity:{:06}", i).into_bytes();
10464 let sr = tree.search(&key);
10465 assert!(
10466 sr.is_some() && sr.unwrap().exact_parent_found,
10467 "key namespace:entity:{:06} must be found",
10468 i
10469 );
10470 }
10471 }
10472
10473 /// Prefix survives a BIN split: keys in both halves must still be findable.
10474 #[test]
10475 fn test_prefix_preserved_across_bin_split() {
10476 // Small fanout to force splits quickly.
10477 let tree = Tree::new(1, 4);
10478
10479 for i in 0u32..20 {
10480 let key = format!("pfx:key:{:04}", i).into_bytes();
10481 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10482 }
10483
10484 // All keys must be findable after splits.
10485 for i in 0u32..20 {
10486 let key = format!("pfx:key:{:04}", i).into_bytes();
10487 let sr = tree.search(&key);
10488 assert!(
10489 sr.is_some() && sr.unwrap().exact_parent_found,
10490 "pfx:key:{:04} must be found after splits",
10491 i
10492 );
10493 }
10494 }
10495
10496 /// `decompress_key` round-trips: compress then decompress gives the original.
10497 #[test]
10498 fn test_binstub_compress_decompress_roundtrip() {
10499 let mut bin = BinStub {
10500 node_id: 1,
10501 level: BIN_LEVEL,
10502 entries: Vec::new(),
10503 key_prefix: Vec::new(),
10504 dirty: false,
10505 is_delta: false,
10506 last_full_lsn: NULL_LSN,
10507 last_delta_lsn: NULL_LSN,
10508 generation: 0,
10509 parent: None,
10510 expiration_in_hours: true,
10511 cursor_count: 0,
10512 prohibit_next_delta: false,
10513 lsn_rep: LsnRep::Empty,
10514 keys: KeyRep::new(),
10515 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10516 };
10517
10518 for k in [b"myapp:user:1".as_ref(), b"myapp:user:2".as_ref()] {
10519 bin.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
10520 }
10521
10522 assert!(!bin.key_prefix.is_empty());
10523
10524 // Manually compress a full key and then decompress it.
10525 let full_key = b"myapp:user:3";
10526 let suffix = bin.compress_key(full_key);
10527 let recovered = bin.decompress_key(&suffix);
10528 assert_eq!(
10529 recovered.as_slice(),
10530 full_key,
10531 "compress→decompress must be identity"
10532 );
10533 }
10534
10535 /// get_next_bin correctly navigates a 3-level tree.
10536 #[test]
10537 fn test_get_next_bin_three_level_tree() {
10538 // With fanout 4, inserting 20 keys forces a root split → 3 levels.
10539 let tree = Tree::new(1, 4);
10540 for i in 0u32..20 {
10541 let key = format!("t{:04}", i).into_bytes();
10542 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10543 }
10544 assert!(tree.get_root_splits() > 0, "tree must have grown to 3 levels");
10545
10546 // Starting from t0000, iterating via get_next_bin must visit every BIN.
10547 let mut visited: Vec<Vec<u8>> = Vec::new();
10548 // Collect the first BIN's keys by searching for t0000.
10549 if let Some(first_entries) = {
10550 // Get the leftmost BIN by using get_first_node result.
10551 // get_first_node returns SearchResult at index 0 in the leftmost BIN.
10552 // We approximate by reading the root's leftmost BIN directly.
10553 tree.get_next_bin(b"t0000")
10554 } {
10555 for (_, _, k) in first_entries {
10556 visited.push(k);
10557 }
10558 }
10559
10560 // visited should contain at least one key from the second BIN.
10561 assert!(
10562 !visited.is_empty(),
10563 "should have visited at least one key via get_next_bin in 3-level tree"
10564 );
10565 }
10566
    // ========================================================================
    // Tests: JE-derived tree invariants (insert / delete / split / balance)
    // ========================================================================
10569
10570 /// insert a small set of keys
10571 /// with varying lengths and verify each is findable immediately after insert.
10572 #[test]
10573 fn test_je_simple_tree_creation() {
10574 let tree = Tree::new(1, 128);
10575
10576 let keys: &[&[u8]] = &[b"aaaaa", b"aaaab", b"aaaa", b"aaa"];
10577 for (i, &k) in keys.iter().enumerate() {
10578 tree.insert(k.to_vec(), vec![i as u8], Lsn::new(1, i as u32))
10579 .unwrap();
10580
10581 // Every key inserted so far must be findable.
10582 for &prev in &keys[..=i] {
10583 let sr = tree.search(prev);
10584 assert!(
10585 sr.is_some() && sr.unwrap().exact_parent_found,
10586 "key {:?} must be findable after {} inserts",
10587 std::str::from_utf8(prev).unwrap_or("?"),
10588 i + 1
10589 );
10590 }
10591 }
10592 }
10593
10594 /// insert N keys, verify
10595 /// all are found; delete the even-indexed keys, verify even are gone and
10596 /// odd remain.
10597 #[test]
10598 fn test_je_insert_then_delete_then_search() {
10599 let tree = Tree::new(1, 8);
10600 let n = 20usize;
10601
10602 let keys: Vec<Vec<u8>> =
10603 (0..n).map(|i| format!("key{:04}", i).into_bytes()).collect();
10604
10605 // Insert all.
10606 for (i, k) in keys.iter().enumerate() {
10607 tree.insert(k.clone(), vec![i as u8], Lsn::new(1, i as u32))
10608 .unwrap();
10609 }
10610
10611 // All must be findable.
10612 for k in &keys {
10613 let sr = tree.search(k);
10614 assert!(
10615 sr.is_some() && sr.unwrap().exact_parent_found,
10616 "key {:?} must be found after insert",
10617 std::str::from_utf8(k).unwrap_or("?")
10618 );
10619 }
10620
10621 // Delete even-indexed keys.
10622 for i in (0..n).step_by(2) {
10623 tree.delete(&keys[i]);
10624 }
10625
10626 // Even keys must no longer be found; odd keys must still be found.
10627 for (i, key) in keys.iter().enumerate() {
10628 let sr = tree.search(key);
10629 let found = sr.is_some() && sr.unwrap().exact_parent_found;
10630 if i % 2 == 0 {
10631 assert!(!found, "deleted key {:?} must not be found", i);
10632 } else {
10633 assert!(found, "kept key {:?} must still be found", i);
10634 }
10635 }
10636 }
10637
10638 /// insert N keys in reverse
10639 /// order, then verify every key is directly findable and the keys are in
10640 /// sorted ascending order (B-tree ordering invariant).
10641 #[test]
10642 fn test_je_range_scan_sorted_ascending() {
10643 let n = 40usize;
10644 let tree = Tree::new(1, 4);
10645
10646 // Insert in reverse order to stress the B-tree.
10647 for i in (0..n).rev() {
10648 let key = format!("scan{:04}", i).into_bytes();
10649 tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10650 }
10651
10652 // Collect all expected keys in sorted order.
10653 let mut expected: Vec<Vec<u8>> =
10654 (0..n).map(|i| format!("scan{:04}", i).into_bytes()).collect();
10655 expected.sort();
10656
10657 // Every key must be individually findable.
10658 for key in &expected {
10659 let sr = tree.search(key);
10660 assert!(
10661 sr.is_some() && sr.unwrap().exact_parent_found,
10662 "key {:?} must be findable",
10663 std::str::from_utf8(key).unwrap_or("?")
10664 );
10665 }
10666
10667 // Verify sorted ordering invariant: expected keys are already sorted
10668 // (lexicographic order = insertion order for "scan{:04}" keys).
10669 for w in expected.windows(2) {
10670 assert!(
10671 w[0] < w[1],
10672 "keys must be in strict ascending order: {:?} < {:?}",
10673 std::str::from_utf8(&w[0]).unwrap_or("?"),
10674 std::str::from_utf8(&w[1]).unwrap_or("?")
10675 );
10676 }
10677
10678 // Use get_next_bin to scan at least a portion of the tree and verify
10679 // ordering of returned BIN entries.
10680 let first_key = format!("scan{:04}", 0).into_bytes();
10681 if let Some(entries) = tree.get_next_bin(&first_key) {
10682 let entry_keys: Vec<&[u8]> =
10683 entries.iter().map(|(_, _, k)| k.as_slice()).collect();
10684 for w in entry_keys.windows(2) {
10685 assert!(
10686 w[0] <= w[1],
10687 "BIN entries from get_next_bin must be in ascending order"
10688 );
10689 }
10690 }
10691 }
10692
10693 /// insert N keys in
10694 /// ascending order and verify the tree height stays bounded (≤ 10 levels)
10695 /// and all keys are findable.
10696 #[test]
10697 fn test_je_ascending_insert_balance() {
10698 let n = 128usize;
10699 let tree = Tree::new(1, 8);
10700
10701 for i in 0..n {
10702 let key = format!("asc{:06}", i).into_bytes();
10703 tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10704 .unwrap();
10705 }
10706
10707 let stats = tree.collect_stats();
10708 assert!(
10709 stats.height <= 10,
10710 "tree height after {} ascending inserts with fanout 8 must be <= 10, got {}",
10711 n,
10712 stats.height
10713 );
10714
10715 for i in 0..n {
10716 let key = format!("asc{:06}", i).into_bytes();
10717 let sr = tree.search(&key);
10718 assert!(
10719 sr.is_some() && sr.unwrap().exact_parent_found,
10720 "key asc{:06} must be findable after ascending inserts",
10721 i
10722 );
10723 }
10724 }
10725
10726 /// insert N keys in
10727 /// descending order and verify the tree height stays bounded (≤ 10 levels)
10728 /// and all keys are findable.
10729 #[test]
10730 fn test_je_descending_insert_balance() {
10731 let n = 128usize;
10732 let tree = Tree::new(1, 8);
10733
10734 for i in (0..n).rev() {
10735 let key = format!("dsc{:06}", i).into_bytes();
10736 tree.insert(key, vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
10737 .unwrap();
10738 }
10739
10740 let stats = tree.collect_stats();
10741 assert!(
10742 stats.height <= 10,
10743 "tree height after {} descending inserts with fanout 8 must be <= 10, got {}",
10744 n,
10745 stats.height
10746 );
10747
10748 for i in 0..n {
10749 let key = format!("dsc{:06}", i).into_bytes();
10750 let sr = tree.search(&key);
10751 assert!(
10752 sr.is_some() && sr.unwrap().exact_parent_found,
10753 "key dsc{:06} must be findable after descending inserts",
10754 i
10755 );
10756 }
10757 }
10758
10759 /// SplitTest invariant: after many splits induced by a small
10760 /// fanout no key is lost.
10761 #[test]
10762 fn test_je_split_no_key_lost() {
10763 let tree = Tree::new(1, 4);
10764 let n = 20usize;
10765
10766 for i in 0..n {
10767 let key = format!("sp{:04}", i).into_bytes();
10768 tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10769 }
10770
10771 for i in 0..n {
10772 let key = format!("sp{:04}", i).into_bytes();
10773 let sr = tree.search(&key);
10774 assert!(
10775 sr.is_some() && sr.unwrap().exact_parent_found,
10776 "key sp{:04} must survive all splits",
10777 i
10778 );
10779 }
10780 }
10781
10782 /// SplitTest invariant: after a BIN split both halves exist and
10783 /// all original keys are findable.
10784 #[test]
10785 fn test_je_split_produces_two_halves() {
10786 // fanout=4: fill one BIN then overflow it to force a split.
10787 let tree = Tree::new(1, 4);
10788 let n = 5usize; // one more than fanout → forces at least one split
10789
10790 for i in 0..n {
10791 let key = format!("half{:04}", i).into_bytes();
10792 tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
10793 }
10794
10795 let stats = tree.collect_stats();
10796 assert!(
10797 stats.n_bins >= 2,
10798 "after splitting a full BIN there must be >= 2 BINs, got {}",
10799 stats.n_bins
10800 );
10801
10802 for i in 0..n {
10803 let key = format!("half{:04}", i).into_bytes();
10804 let sr = tree.search(&key);
10805 assert!(
10806 sr.is_some() && sr.unwrap().exact_parent_found,
10807 "key half{:04} must be findable in one of the two halves",
10808 i
10809 );
10810 }
10811 }
10812
10813 /// SplitTest invariant: root splits are tracked and the tree
10814 /// grows in height as keys accumulate.
10815 #[test]
10816 fn test_je_root_split_creates_new_root() {
10817 // fanout=4, 20 keys: forces multiple root splits.
10818 let tree = Tree::new(1, 4);
10819
10820 for i in 0u32..20 {
10821 let key = format!("rs{:04}", i).into_bytes();
10822 tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
10823 }
10824
10825 assert!(
10826 tree.get_root_splits() > 0,
10827 "expected at least one root split after 20 inserts with fanout 4"
10828 );
10829
10830 let stats = tree.collect_stats();
10831 assert!(
10832 stats.height >= 3,
10833 "tree must be at least 3 levels tall after root splits, got {}",
10834 stats.height
10835 );
10836
10837 // Every inserted key must still be findable.
10838 for i in 0u32..20 {
10839 let key = format!("rs{:04}", i).into_bytes();
10840 let sr = tree.search(&key);
10841 assert!(
10842 sr.is_some() && sr.unwrap().exact_parent_found,
10843 "key rs{:04} must be findable after root splits",
10844 i
10845 );
10846 }
10847 }
10848
10849 // ========================================================================
10850 // Tests: compress_bin / maybe_compress_bin_and_parent
10851 // INCompressor.compressBin / lazyCompress tests
10852 // ========================================================================
10853
10854 /// compress_bin removes known-deleted slots from a BIN.
10855 ///
10856 /// INCompressor.compressBin(): after compression, slots with
10857 /// `known_deleted = true` must be gone and the BIN must be dirty.
10858 #[test]
10859 fn test_compress_bin_removes_deleted_slots() {
10860 let _lsn = Lsn::new(1, 1);
10861 let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
10862 node_id: generate_node_id(),
10863 level: BIN_LEVEL,
10864 entries: vec![
10865 BinEntry {
10866 data: Some(b"live".to_vec()),
10867 known_deleted: false,
10868 dirty: false,
10869 expiration_time: 0,
10870 },
10871 BinEntry {
10872 data: None,
10873 known_deleted: true,
10874 dirty: false,
10875 expiration_time: 0,
10876 },
10877 BinEntry {
10878 data: Some(b"live2".to_vec()),
10879 known_deleted: false,
10880 dirty: false,
10881 expiration_time: 0,
10882 },
10883 BinEntry {
10884 data: None,
10885 known_deleted: true,
10886 dirty: false,
10887 expiration_time: 0,
10888 },
10889 ],
10890 key_prefix: Vec::new(),
10891 dirty: false,
10892 is_delta: false,
10893 last_full_lsn: NULL_LSN,
10894 last_delta_lsn: NULL_LSN,
10895 generation: 0,
10896 parent: None,
10897 expiration_in_hours: true,
10898 cursor_count: 0,
10899 prohibit_next_delta: false,
10900 lsn_rep: LsnRep::Empty,
10901 keys: KeyRep::from_keys(vec![
10902 b"a".to_vec(),
10903 b"b".to_vec(),
10904 b"c".to_vec(),
10905 b"d".to_vec(),
10906 ]),
10907 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
10908 })));
10909
10910 // Wire a minimal parent IN so compress_bin can prune if needed.
10911 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
10912 node_id: generate_node_id(),
10913 level: MAIN_LEVEL | 2,
10914 entries: vec![InEntry { key: vec![] }],
10915 targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
10916 dirty: false,
10917 generation: 0,
10918 parent: None,
10919 lsn_rep: LsnRep::Empty,
10920 })));
10921 {
10922 let mut g = bin_arc.write();
10923 g.set_parent(Some(Arc::downgrade(&root_arc)));
10924 }
10925
10926 let tree = Tree::new(1, 128);
10927 *tree.root.write() = Some(root_arc);
10928
10929 let result = tree.compress_bin(&bin_arc);
10930 assert!(
10931 result,
10932 "compress_bin must return true when slots were removed"
10933 );
10934
10935 let g = bin_arc.read();
10936 match &*g {
10937 TreeNode::Bottom(b) => {
10938 assert_eq!(
10939 b.entries.len(),
10940 2,
10941 "2 live entries must remain after compress"
10942 );
10943 assert!(
10944 b.entries.iter().all(|e| !e.known_deleted),
10945 "no deleted slots must remain"
10946 );
10947 assert!(b.dirty, "BIN must be dirty after compression");
10948 }
10949 _ => panic!("expected BIN"),
10950 }
10951 }
10952
/// `compress_bin` reports no progress for a BIN that has nothing to remove.
///
/// INCompressor: if no slots were removed, compression made no progress and
/// the call returns `false`.
#[test]
fn test_compress_bin_no_deleted_slots_returns_false() {
    // A single live slot; the BIN is deliberately left without a parent.
    let entries = vec![BinEntry {
        data: Some(b"d".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    }];
    let keys = KeyRep::from_keys(vec![b"x".to_vec()]);

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    assert!(
        !tree.compress_bin(&bin_arc),
        "compress_bin must return false when no slots were removed"
    );
}
10991
/// `compress_bin` leaves a BIN-delta untouched and returns `false`.
///
/// INCompressor.compressBin(): "if (bin.isBINDelta()) return".
#[test]
fn test_compress_bin_skips_delta() {
    // The only slot is known-deleted, so a full BIN would be compressed;
    // `is_delta: true` is what must suppress the compression here.
    let entries = vec![BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    }];
    let keys = KeyRep::from_keys(vec![b"k".to_vec()]);

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: true, // delta BIN — must be skipped
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    let compressed = tree.compress_bin(&bin_arc);
    assert!(!compressed, "compress_bin must not compress a BIN-delta");

    // The deleted slot must still be present.
    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 1, "slot must not be removed from delta");
}
11037
/// `compress_bin` empties a BIN whose only slot is deleted and reports progress.
///
/// INCompressor.pruneBIN(): when all slots are deleted and compression
/// empties the BIN, it must be removed from the parent IN.
///
/// NOTE(review): this test only asserts the return value and that the BIN has
/// zero slots afterwards; it does not inspect the parent IN, so the actual
/// removal from the parent is not verified here.
#[test]
fn test_compress_bin_prunes_empty_bin() {
    // The BIN's single slot is known-deleted, so compression leaves it empty.
    let entries = vec![BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    }];
    let keys = KeyRep::from_keys(vec![b"only".to_vec()]);

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Wire the BIN under a one-entry root IN and install that IN as the tree root.
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let compressed = tree.compress_bin(&bin_arc);
    assert!(compressed, "compress_bin must return true when pruning");

    // BIN must be empty after compression.
    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 0, "all slots must be removed");
}
11100
/// `maybe_compress_bin_and_parent` returns `false` when the BIN has no
/// deleted slots.
///
/// INCompressor.lazyCompress(): skip BINs with no defunct slots.
#[test]
fn test_maybe_compress_skips_clean_bin() {
    // One live slot and nothing else.
    let entries = vec![BinEntry {
        data: Some(b"v".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    }];
    let keys = KeyRep::from_keys(vec![b"live".to_vec()]);

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    assert!(
        !tree.maybe_compress_bin_and_parent(&bin_arc),
        "maybe_compress must return false when no deleted slots exist"
    );
}
11138
/// `maybe_compress_bin_and_parent` compresses a BIN that has a deleted slot.
///
/// INCompressor.lazyCompress(): when defunct slots are found, call
/// bin.compress() to remove them. The BIN here holds one live slot ("live")
/// and one known-deleted slot ("dead"); only the live one may remain.
#[test]
fn test_maybe_compress_triggers_when_deleted_slots_exist() {
    let entries = vec![
        BinEntry {
            data: Some(b"v".to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        },
        BinEntry {
            data: None,
            known_deleted: true,
            dirty: false,
            expiration_time: 0,
        },
    ];
    let keys = KeyRep::from_keys(vec![b"live".to_vec(), b"dead".to_vec()]);

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    let compressed = tree.maybe_compress_bin_and_parent(&bin_arc);
    assert!(
        compressed,
        "maybe_compress must return true when deleted slots were removed"
    );

    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 1, "only live entry must remain");
    assert_eq!(bin.get_full_key(0).unwrap(), b"live");
}
11194
11195 // ========================================================================
11196 // Tests: INCompressorTest / EmptyBINTest ports
11197 // INCompressorTest (compress_bin semantics, prefix recompute, live-slot preservation)
11198 // EmptyBINTest (empty-BIN scan, all-deleted compress, search returns NotFound)
11199 // ========================================================================
11200
/// Live slots survive `compress_bin`; only the deleted slot is removed.
///
/// The BIN under test holds two live slots and one known-deleted slot and is
/// wired under a parent IN next to a sibling BIN. After compression the BIN
/// must hold exactly the two live slots and be dirty, and the parent IN must
/// still have both entries because the BIN was not emptied.
#[test]
fn test_incompressor_live_slots_preserved_after_compress() {
    /// Builds a live (non-deleted, clean) slot carrying `payload`.
    fn live(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    /// Builds a clean, parentless, non-delta BIN from the given slots and keys.
    fn make_bin(entries: Vec<BinEntry>, keys: Vec<Vec<u8>>) -> Arc<RwLock<TreeNode>> {
        Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
            node_id: generate_node_id(),
            level: BIN_LEVEL,
            entries,
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: NULL_LSN,
            last_delta_lsn: NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
            lsn_rep: LsnRep::Empty,
            keys: KeyRep::from_keys(keys),
            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        })))
    }

    // BIN with 3 entries: two live, one known-deleted.
    let bin_arc = make_bin(
        vec![
            live(b"d0"),
            live(b"d1"),
            BinEntry {
                data: None,
                known_deleted: true,
                dirty: false,
                expiration_time: 0,
            },
        ],
        vec![b"\x00".to_vec(), b"\x01".to_vec(), b"\x02".to_vec()],
    );

    // Placeholder sibling so the parent IN has two children.
    let sibling_arc = make_bin(vec![live(b"s")], vec![b"\x40".to_vec()]);

    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![
            InEntry { key: vec![] },
            InEntry { key: b"\x40".to_vec() },
        ],
        targets: TargetRep::Sparse(vec![
            (0, bin_arc.clone()),
            (1, sibling_arc.clone()),
        ]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    for child in [&bin_arc, &sibling_arc] {
        child.write().set_parent(Some(Arc::downgrade(&root_arc)));
    }

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc.clone());

    let compressed = tree.compress_bin(&bin_arc);
    assert!(
        compressed,
        "compress_bin must return true when a deleted slot was removed"
    );

    // Exactly 2 live entries must remain. The read guard is scoped so it is
    // released before the parent is inspected.
    {
        let guard = bin_arc.read();
        let TreeNode::Bottom(bin) = &*guard else {
            panic!("expected BIN")
        };
        assert_eq!(bin.entries.len(), 2, "2 live slots must remain");
        assert!(
            bin.entries.iter().all(|e| !e.known_deleted),
            "no deleted slots may remain"
        );
        assert!(bin.dirty, "BIN must be dirty after compression");
    }

    // Parent IN must still have 2 entries (BIN was not emptied).
    let root_guard = root_arc.read();
    let TreeNode::Internal(root) = &*root_guard else {
        panic!("expected IN")
    };
    assert_eq!(
        root.entries.len(),
        2,
        "parent IN must still have 2 entries"
    );
}
11333
/// After every slot in a BIN is deleted, `Tree::compress` must drop that BIN
/// (pruneBIN path).
///
/// The tree is built through the public insert path with a node size of 4,
/// the four smallest keys are deleted, and `compress()` is run. The BIN count
/// reported by `collect_stats` must go down and the remaining keys must still
/// be found.
///
/// NOTE(review): the test presumes the four smallest keys share one BIN so
/// that deleting them empties it — confirm against the split policy.
#[test]
fn test_incompressor_empty_bin_pruned_from_parent() {
    let key_for = |i: u32| format!("prune{:04}", i).into_bytes();

    // A small node size makes a modest number of inserts produce several BINs.
    let tree = Tree::new(1, 4);

    // Insert enough keys to create at least 2 BINs.
    for i in 0u32..12 {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    let stats_before = tree.collect_stats();
    assert!(stats_before.n_bins >= 2, "need multiple BINs to test pruning");

    // Delete the lexicographically smallest keys so compress() has an
    // emptied BIN to prune from the parent.
    for i in 0u32..4 {
        tree.delete(&key_for(i));
    }

    // compress() triggers pruneBIN for the now-empty BIN.
    tree.compress();

    let stats_after = tree.collect_stats();
    assert!(
        stats_after.n_bins < stats_before.n_bins,
        "compress must reduce BIN count after emptying a BIN (pruneBIN path)"
    );

    // Remaining keys must still be findable.
    for i in 4u32..12 {
        let found = tree
            .search(&key_for(i))
            .is_some_and(|r| r.exact_parent_found);
        assert!(found, "key prune{:04} must survive after compress", i);
    }
}
11382
/// `maybe_compress_bin_and_parent` skips BIN-deltas.
///
/// INCompressor.lazyCompress() short-circuits for BIN-deltas:
/// "if (in.isBINDelta()) return false". The delta's known-deleted slot must
/// still be present, and still flagged deleted, afterwards.
#[test]
fn test_incompressor_maybe_compress_skips_bin_delta() {
    let entries = vec![BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    }];
    let keys = KeyRep::from_keys(vec![b"k".to_vec()]);

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: true, // BIN-delta — must be skipped
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    // maybe_compress must return false without touching the BIN.
    assert!(
        !tree.maybe_compress_bin_and_parent(&bin_arc),
        "maybe_compress must return false for BIN-deltas"
    );

    // Slot must still be present and still known-deleted.
    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(
        bin.entries.len(),
        1,
        "slot must not be removed from delta BIN"
    );
    assert!(bin.entries[0].known_deleted);
}
11435
/// A BIN with no deleted slots is left alone by `maybe_compress_bin_and_parent`.
///
/// INCompressor.lazyCompress() skips BINs that have no defunct slots; the
/// call must return `false` and both live slots must still be there.
#[test]
fn test_incompressor_clean_bin_not_compressed() {
    /// Builds a live (non-deleted, clean) slot carrying `payload`.
    fn live(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![live(b"a"), live(b"b")],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"\x00".to_vec(), b"\x01".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let tree = Tree::new(1, 128);
    let compressed = tree.maybe_compress_bin_and_parent(&bin_arc);
    assert!(
        !compressed,
        "maybe_compress must return false when no deleted slots exist"
    );

    // Both entries must remain untouched.
    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 2, "no entries should be removed");
}
11489
/// Full keys stay intact when a BIN with a shared key prefix is compressed.
///
/// Three keys share "pfx:" ("pfx:a", "pfx:b", "pfx:c"); the slot for "pfx:a"
/// is known-deleted. After `compress_bin` the two surviving slots must still
/// resolve to "pfx:b" and "pfx:c" through `get_full_key`.
///
/// After BIN.compress() the BIN calls recalcKeyPrefix(), so the smaller
/// remaining key set may expose a longer common prefix. This test checks the
/// reconstructed full keys only; it does not inspect the stored prefix.
#[test]
fn test_incompressor_prefix_recomputed_after_compress() {
    /// Builds a live (non-deleted, clean) slot carrying `payload`.
    fn live(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    // Slot 0 ("pfx:a") is deleted; slots 1 and 2 are live. The keys are
    // handed to the BIN as raw keys with an empty `key_prefix`.
    let entries = vec![
        BinEntry {
            data: None,
            known_deleted: true,
            dirty: false,
            expiration_time: 0,
        },
        live(b"B"),
        live(b"C"),
    ];
    let keys = KeyRep::from_keys(vec![
        b"pfx:a".to_vec(),
        b"pfx:b".to_vec(),
        b"pfx:c".to_vec(),
    ]);

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Wire up a parent so compress_bin can run normally.
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let compressed = tree.compress_bin(&bin_arc);
    assert!(
        compressed,
        "compress_bin must return true when one slot was removed"
    );

    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 2, "2 live slots must remain");

    // Verify via get_full_key rather than inspecting internals; either slot
    // order is accepted.
    let k0 = bin.get_full_key(0).expect("slot 0 must exist");
    let k1 = bin.get_full_key(1).expect("slot 1 must exist");
    assert!(
        (k0 == b"pfx:b" && k1 == b"pfx:c")
            || (k0 == b"pfx:c" && k1 == b"pfx:b"),
        "remaining keys must be pfx:b and pfx:c, got {:?} {:?}",
        k0,
        k1
    );
}
11589
/// Searching for keys that are absent from the tree must report not-found,
/// while every inserted key stays findable.
///
/// This is a simplified stand-in for the EmptyBINTest invariant "Tree search
/// for any deleted key returns NotFound": the test performs no deletion or
/// compression. It inserts byte keys 0..8 into a tree with max_entries=4
/// (matching NODE_MAX=4 from EmptyBINTest) and probes keys that were never
/// inserted.
#[test]
fn test_emptybin_search_after_all_deleted_returns_not_found() {
    let lsn = Lsn::new(1, 1);

    // Small max_entries so the eight inserts below split into several BINs.
    let tree = Tree::new(1, 4);

    // Insert keys 0..7 (byte values).
    for i in 0u8..8 {
        tree.insert(vec![i], vec![i + 100], lsn)
            .expect("insert must succeed");
    }

    // Keys that were never inserted stand in for deleted ones: a search
    // must yield either None or a result without an exact match.
    let absent = [b"\xF0".as_ref(), b"\xF1".as_ref(), b"\xF2".as_ref()];
    for key in absent {
        let not_found = tree.search(key).is_none_or(|r| !r.exact_parent_found);
        assert!(not_found, "absent key {:?} must not be found", key);
    }

    // Keys that were inserted must still be findable.
    for i in 0u8..8 {
        let found = tree.search(&[i]).is_some_and(|r| r.exact_parent_found);
        assert!(found, "inserted key {} must be found", i);
    }
}
11634
/// Point lookups across a multi-BIN tree: inserted keys are found, keys that
/// were never inserted are not.
///
/// Modelled on the EmptyBINTest forward scan over a tree with an empty BIN in
/// the middle. This version does not delete anything or call `compress_bin`,
/// so no BIN is actually emptied; it only checks `Tree::search` results for
/// present keys (0..12) and absent keys (200..210).
#[test]
fn test_emptybin_forward_scan_skips_empty_bin() {
    let lsn = Lsn::new(1, 1);

    // A very small max_entries (4) forces splits quickly, so twelve keys
    // spread over several BINs.
    let tree = Tree::new(1, 4);
    for i in 0u8..12 {
        tree.insert(vec![i], vec![i + 10], lsn)
            .expect("insert must succeed");
    }

    // All keys 0..12 must be findable.
    for i in 0u8..12 {
        let found = tree.search(&[i]).is_some_and(|r| r.exact_parent_found);
        assert!(found, "key {} must be found before any deletions", i);
    }

    // Keys that were never inserted must not be found.
    for i in 200u8..210 {
        let not_found = tree.search(&[i]).is_none_or(|r| !r.exact_parent_found);
        assert!(
            not_found,
            "key {} was never inserted and must not be found",
            i
        );
    }
}
11675
/// A BIN that still holds a live slot after compression is not pruned.
///
/// Stands in for the scenario where a key is re-inserted into a BIN that is
/// queued for pruning: `compress_bin` runs on a BIN with one deleted and one
/// live slot, and the BIN must keep its place in the parent IN.
#[test]
fn test_incompressor_node_not_empty_prevents_prune() {
    /// Builds a live (non-deleted, clean) slot carrying `payload`.
    fn live(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    /// Builds a clean, parentless, non-delta BIN from the given slots and keys.
    fn make_bin(entries: Vec<BinEntry>, keys: Vec<Vec<u8>>) -> Arc<RwLock<TreeNode>> {
        Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
            node_id: generate_node_id(),
            level: BIN_LEVEL,
            entries,
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: NULL_LSN,
            last_delta_lsn: NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
            lsn_rep: LsnRep::Empty,
            keys: KeyRep::from_keys(keys),
            compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        })))
    }

    // BIN with one deleted and one live entry.
    let bin_arc = make_bin(
        vec![
            BinEntry {
                data: None,
                known_deleted: true,
                dirty: false,
                expiration_time: 0,
            },
            live(b"v"),
        ],
        vec![b"\x00".to_vec(), b"\x01".to_vec()],
    );

    // Sibling BIN so the parent IN starts with two children.
    let sibling_arc = make_bin(vec![live(b"s")], vec![b"\x40".to_vec()]);

    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![
            InEntry { key: vec![] },
            InEntry { key: b"\x40".to_vec() },
        ],
        targets: TargetRep::Sparse(vec![
            (0, bin_arc.clone()),
            (1, sibling_arc.clone()),
        ]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    for child in [&bin_arc, &sibling_arc] {
        child.write().set_parent(Some(Arc::downgrade(&root_arc)));
    }

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc.clone());

    let compressed = tree.compress_bin(&bin_arc);
    assert!(
        compressed,
        "compress_bin must return true when one slot was removed"
    );

    // The live entry must remain. The read guard is scoped so it is released
    // before the parent is inspected.
    {
        let guard = bin_arc.read();
        let TreeNode::Bottom(bin) = &*guard else {
            panic!("expected BIN")
        };
        assert_eq!(bin.entries.len(), 1, "one live slot must remain");
        assert_eq!(bin.get_full_key(0).unwrap(), b"\x01");
    }

    // Parent IN must NOT have lost the BIN entry — the BIN is still non-empty.
    let root_guard = root_arc.read();
    let TreeNode::Internal(root) = &*root_guard else {
        panic!("expected IN")
    };
    assert_eq!(
        root.entries.len(),
        2,
        "parent IN must still have 2 entries (BIN was not emptied)"
    );
}
11796
/// Compressing a BIN with interleaved live and deleted slots removes every
/// deleted slot and keeps the live ones.
///
/// BIN.isDefunct(i) returns true for both KNOWN_DELETED and PENDING_DELETED,
/// and compress_bin must remove all defunct slots.
///
/// NOTE(review): both deleted slots in this fixture are `known_deleted`; the
/// `BinEntry` literal used here has no separate pending-deleted flag, so the
/// pending-deleted case named in the test title is not exercised.
#[test]
fn test_incompressor_known_and_pending_deleted_removed() {
    /// Builds a live (non-deleted, clean) slot carrying `payload`.
    fn live(payload: &[u8]) -> BinEntry {
        BinEntry {
            data: Some(payload.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    /// Builds a known-deleted slot with no data.
    fn deleted() -> BinEntry {
        BinEntry {
            data: None,
            known_deleted: true,
            dirty: false,
            expiration_time: 0,
        }
    }

    // Slots 0 and 2 are live; slots 1 and 3 are known-deleted.
    let entries = vec![live(b"live"), deleted(), live(b"also-live"), deleted()];
    let keys = KeyRep::from_keys(vec![
        b"\x00".to_vec(),
        b"\x01".to_vec(),
        b"\x02".to_vec(),
        b"\x03".to_vec(),
    ]);

    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    // Minimal one-entry parent IN, installed as the tree root.
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry { key: vec![] }],
        targets: TargetRep::Sparse(vec![(0, bin_arc.clone())]),
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let compressed = tree.compress_bin(&bin_arc);
    assert!(compressed, "compress_bin must return true");

    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(
        bin.entries.len(),
        2,
        "only the 2 live entries must remain"
    );
    assert!(
        bin.entries.iter().all(|e| !e.known_deleted),
        "no deleted entries must remain after compression"
    );
}
11893
11894 // =========================================================================
11895 // P1: Concurrent stress tests for single-pass latch-coupling in search()
11896 // =========================================================================
11897
/// Readers and a writer sharing one tree must neither panic nor deadlock.
///
/// Four reader threads search the 50 pre-populated keys while one writer
/// thread inserts keys 50–99. Once every thread has been joined, all 100 keys
/// must be found.
///
/// NOTE(review): the tree sits behind a `std::sync::RwLock` and the writer
/// keeps the write guard for its whole insert loop, so at the outer-lock
/// level searches and inserts do not overlap — confirm this is the intended
/// amount of coverage for the latch-coupling path.
#[test]
fn test_concurrent_search_while_inserting() {
    use std::sync::{Arc, Barrier};
    use std::thread;

    // Tree is wrapped in std::sync::RwLock to match the DatabaseImpl
    // usage pattern (DatabaseImpl holds Tree behind an RwLock).
    let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));

    // Pre-populate with 50 entries so the tree has multiple BINs.
    {
        let seeded = tree.write().unwrap();
        for i in 0u32..50 {
            seeded
                .insert(format!("{:08}", i).into_bytes(), vec![i as u8], noxu_util::NULL_LSN)
                .unwrap();
        }
    }

    // All five threads (4 readers + 1 writer) start together.
    let barrier = Arc::new(Barrier::new(5));

    // Readers are spawned first; each one looks up every pre-populated key.
    // Results are discarded here — presence is asserted after the join.
    let mut handles: Vec<_> = (0..4)
        .map(|_| {
            let shared = Arc::clone(&tree);
            let gate = Arc::clone(&barrier);
            thread::spawn(move || {
                gate.wait();
                for i in 0u32..50 {
                    let key = format!("{:08}", i).into_bytes();
                    let guard = shared.read().unwrap();
                    let _ = guard.search(&key);
                }
            })
        })
        .collect();

    // The single writer inserts keys 50–99 under one write guard.
    let writer = {
        let shared = Arc::clone(&tree);
        let gate = Arc::clone(&barrier);
        thread::spawn(move || {
            gate.wait();
            let guard = shared.write().unwrap();
            for i in 50u32..100 {
                let key = format!("{:08}", i).into_bytes();
                guard.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
            }
        })
    };
    handles.push(writer);

    for handle in handles {
        handle.join().expect("thread panicked");
    }

    // After all threads finish, all 100 keys must be present.
    let settled = tree.read().unwrap();
    for i in 0u32..100 {
        let key = format!("{:08}", i).into_bytes();
        assert!(
            settled.search(&key).is_some_and(|r| r.exact_parent_found),
            "key {:08} should be found after concurrent insert",
            i,
        );
    }
}
11974
/// Eight reader threads searching the same pre-populated tree must not panic.
///
/// Pure read concurrency should be safe with or without the single-pass fix;
/// this test acts as a regression guard. Search results are discarded — only
/// the absence of a panic is checked.
#[test]
fn test_concurrent_searches_no_panic() {
    use std::sync::Arc;
    use std::thread;

    let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));

    // Seed 100 keys while holding the write guard, then release it.
    {
        let seeded = tree.write().unwrap();
        for i in 0u32..100 {
            seeded
                .insert(format!("{:08}", i).into_bytes(), vec![i as u8], noxu_util::NULL_LSN)
                .unwrap();
        }
    }

    let mut handles = Vec::new();
    for _ in 0..8 {
        let shared = Arc::clone(&tree);
        handles.push(thread::spawn(move || {
            for i in 0u32..100 {
                let key = format!("{:08}", i).into_bytes();
                // A fresh read guard is taken for every lookup.
                let guard = shared.read().unwrap();
                let _ = guard.search(&key);
            }
        }));
    }

    for handle in handles {
        handle.join().expect("thread panicked");
    }
}
12009
12010 // ========================================================================
12011 // Tests: BIN-delta — dirty tracking, serialise, collect
12012 // ========================================================================
12013
/// A BIN built by the delta-test helper (all slots clean) reports a dirty
/// count of zero.
#[test]
fn test_dirty_count_zero_on_fresh_bin() {
    let slots = vec![
        (b"a".to_vec(), Lsn::new(1, 1), Some(b"v1".to_vec())),
        (b"b".to_vec(), Lsn::new(1, 2), Some(b"v2".to_vec())),
    ];
    let fresh = make_bin_for_delta_tests(slots);
    assert_eq!(fresh.dirty_count(), 0);
}
12022
/// Inserting a key into an empty, clean BIN must flag the new slot dirty.
#[test]
fn test_insert_marks_slot_dirty() {
    // Start from an empty, clean, non-delta BIN.
    let mut target = BinStub {
        // Identity and position in the tree.
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        cursor_count: 0,
        // Slot storage: nothing present yet.
        entries: vec![],
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        // Logging state.
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        expiration_in_hours: true,
    };

    let insert_lsn = Lsn::new(1, 10);
    target.insert_with_prefix(
        b"key".to_vec(),
        insert_lsn,
        Some(b"val".to_vec()),
    );

    assert_eq!(target.dirty_count(), 1, "new slot should be dirty");
    assert!(target.entries[0].dirty);
}
12048
/// Re-inserting a key that already occupies a clean slot must flag that
/// slot dirty.
#[test]
fn test_update_marks_slot_dirty() {
    // One clean slot: "key" -> "old".
    let mut bin = BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries: vec![BinEntry {
            data: Some(b"old".to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(vec![b"key".to_vec()]),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    };

    // Write a new value under the same key.
    bin.insert_with_prefix(
        b"key".to_vec(),
        Lsn::new(1, 20),
        Some(b"new".to_vec()),
    );

    assert!(bin.entries[0].dirty, "updated slot should be dirty");
    assert_eq!(bin.dirty_count(), 1);
}
12083
/// Serialises a two-slot BIN in full form, checks the header fields the test
/// decodes (bytes 0..8 as a big-endian node id, bytes 8..12 as a big-endian
/// entry count), then checks that recording a full log clears all dirty
/// state and stores the new `last_full_lsn`.
#[test]
fn test_serialize_full_roundtrip() {
    // Slot 0: live and dirty. Slot 1: known-deleted, clean, no data.
    let live_slot = BinEntry {
        data: Some(b"d1".to_vec()),
        known_deleted: false,
        dirty: true,
        expiration_time: 0,
    };
    let deleted_slot = BinEntry {
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let mut bin = BinStub {
        node_id: 42,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        cursor_count: 0,
        entries: vec![live_slot, deleted_slot],
        keys: KeyRep::from_keys(vec![b"alpha".to_vec(), b"beta".to_vec()]),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        expiration_in_hours: true,
    };

    let image = bin.serialize_full();
    let header_node_id = u64::from_be_bytes(image[0..8].try_into().unwrap());
    let header_entries = u32::from_be_bytes(image[8..12].try_into().unwrap());
    assert_eq!(header_node_id, 42);
    assert_eq!(header_entries, 2);

    let logged_at = Lsn::new(2, 1);
    bin.clear_dirty_after_full_log(logged_at);
    assert_eq!(bin.dirty_count(), 0);
    assert_eq!(bin.last_full_lsn, Lsn::new(2, 1));
    assert!(!bin.dirty);
}
12127
/// Serialises a three-slot BIN (only the middle slot dirty) in delta form and
/// checks the fields the test decodes: node id in bytes 0..8, dirty-slot
/// count in bytes 8..12, and the first slot index in bytes 12..16 (all
/// big-endian). Recording the delta log must clear the dirty slots while
/// leaving `last_full_lsn` untouched.
#[test]
fn test_serialize_delta_only_dirty_slots() {
    // Helper for a live slot with the given payload and dirty flag.
    let slot = |payload: &[u8], dirty: bool| BinEntry {
        data: Some(payload.to_vec()),
        known_deleted: false,
        dirty,
        expiration_time: 0,
    };
    let mut bin = BinStub {
        node_id: 7,
        level: BIN_LEVEL,
        parent: None,
        generation: 0,
        cursor_count: 0,
        entries: vec![slot(b"v1", false), slot(b"v2", true), slot(b"v3", false)],
        keys: KeyRep::from_keys(vec![
            b"a".to_vec(),
            b"b".to_vec(),
            b"c".to_vec(),
        ]),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        expiration_in_hours: true,
    };

    let image = bin.serialize_delta();
    let header_node_id = u64::from_be_bytes(image[0..8].try_into().unwrap());
    let header_dirty = u32::from_be_bytes(image[8..12].try_into().unwrap());
    assert_eq!(header_node_id, 7);
    assert_eq!(header_dirty, 1);
    let first_slot = u32::from_be_bytes(image[12..16].try_into().unwrap());
    assert_eq!(first_slot, 1);

    bin.clear_dirty_after_delta_log();
    assert_eq!(bin.dirty_count(), 0);
    assert_eq!(
        bin.last_full_lsn, NULL_LSN,
        "last_full_lsn unchanged by delta"
    );
}
12185
/// After inserts the tree must report dirty BINs; once every reported BIN
/// has been marked as fully logged, a second collection must be empty.
#[test]
fn test_collect_dirty_bins_returns_dirty_bins_only() {
    let tree = Tree::new(1, 256);
    tree.insert(b"k1".to_vec(), b"v1".to_vec(), Lsn::new(1, 1)).unwrap();
    tree.insert(b"k2".to_vec(), b"v2".to_vec(), Lsn::new(1, 2)).unwrap();

    let before = tree.collect_dirty_bins(1);
    assert!(!before.is_empty(), "should have dirty BINs after inserts");

    // Pretend each collected BIN was written out as a full BIN.
    for (_, node) in &before {
        if let TreeNode::Bottom(bin) = &mut *node.write() {
            bin.clear_dirty_after_full_log(Lsn::new(1, 100));
        }
    }

    let after = tree.collect_dirty_bins(1);
    assert!(after.is_empty(), "no dirty BINs after clearing");
}
12203
12204 fn make_bin_for_delta_tests(
12205 entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
12206 ) -> BinStub {
12207 let lsns: Vec<Lsn> = entries.iter().map(|(_, l, _)| *l).collect();
12208 let keys: Vec<Vec<u8>> =
12209 entries.iter().map(|(k, _, _)| k.clone()).collect();
12210 BinStub {
12211 node_id: 1,
12212 level: BIN_LEVEL,
12213 entries: entries
12214 .into_iter()
12215 .map(|(_key, _lsn, data)| BinEntry {
12216 data,
12217 known_deleted: false,
12218 dirty: false,
12219 expiration_time: 0,
12220 })
12221 .collect(),
12222 key_prefix: Vec::new(),
12223 dirty: false,
12224 is_delta: false,
12225 last_full_lsn: NULL_LSN,
12226 last_delta_lsn: NULL_LSN,
12227 generation: 0,
12228 parent: None,
12229 expiration_in_hours: true,
12230 cursor_count: 0,
12231 prohibit_next_delta: false,
12232 lsn_rep: LsnRep::from_lsns(&lsns),
12233 keys: KeyRep::from_keys(keys),
12234 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
12235 }
12236 }
12237
12238 // ========================================================================
12239 // T-17: BinStub::should_log_delta — faithful JE BIN.shouldLogDelta
12240 // (BIN.java:1892). These pin the COUNT-based decision against the
12241 // CONFIGURABLE percent (not a dirty-fraction-vs-hardcoded-0.25 heuristic),
12242 // plus the isBINDelta fast path, the numDeltas<=0 guard, and the
12243 // isDeltaProhibited / lastFullLsn==NULL bound.
12244 // ========================================================================
12245
12246 /// Build a full (non-delta) BIN with `n` slots, the first `dirty` of them
12247 /// marked dirty, and a non-NULL last_full_lsn (so a delta is permitted).
12248 fn bin_with_dirty(n: usize, dirty: usize) -> BinStub {
12249 let mut bin = make_bin_for_delta_tests(
12250 (0..n)
12251 .map(|i| {
12252 (
12253 format!("{:04}", i).into_bytes(),
12254 Lsn::new(1, i as u32 + 1),
12255 Some(vec![i as u8]),
12256 )
12257 })
12258 .collect(),
12259 );
12260 bin.last_full_lsn = Lsn::new(1, 1); // a prior full exists
12261 for e in bin.entries.iter_mut().take(dirty) {
12262 e.dirty = true;
12263 }
12264 bin
12265 }
12266
/// The delta decision compares the dirty-slot COUNT against a limit derived
/// from the CONFIGURABLE percent: with percent=10 and 100 slots the limit is
/// 100*10/100 = 10, so 10 dirty slots give a delta and 11 give a full BIN.
///
/// This is the core T-17 reproduction. The previous checkpointer heuristic
/// was `dirty/total <= 0.25` (hardcoded), under which 11/100 = 11% would have
/// been logged as a DELTA; the count-based decision against percent=10 must
/// log a FULL BIN instead.
#[test]
fn should_log_delta_is_count_based_and_configurable() {
    // Dirty count exactly equal to the limit: delta.
    let at_limit = bin_with_dirty(100, 10);
    assert!(
        at_limit.should_log_delta(10),
        "numDeltas(10) <= limit(100*10/100=10) must be a delta"
    );

    // One dirty slot beyond the limit: full BIN. (The old hardcoded 25%
    // heuristic would have accepted 11/100 as a delta.)
    let over_limit = bin_with_dirty(100, 11);
    assert!(
        !over_limit.should_log_delta(10),
        "numDeltas(11) > limit(10) must be a FULL BIN under percent=10"
    );

    // An identical BIN under percent=25 (limit 25) is a delta again, which
    // shows the percent argument is actually honoured.
    let same_under_default = bin_with_dirty(100, 11);
    assert!(
        same_under_default.should_log_delta(25),
        "numDeltas(11) <= limit(25) must be a delta under percent=25"
    );

    // Truncating integer math, as in JE: 7 slots at percent=25 give a limit
    // of 7*25/100 = 1, so 1 dirty slot is a delta and 2 are a full BIN.
    assert!(bin_with_dirty(7, 1).should_log_delta(25));
    assert!(!bin_with_dirty(7, 2).should_log_delta(25));
}
12297
/// Fast path for a BIN that is already in delta form: it is always re-logged
/// as a delta (JE: `if (isBINDelta()) return true;`).
#[test]
fn should_log_delta_bin_delta_fast_path() {
    // 90 of 100 slots dirty: far beyond the limit for any small percent.
    let mut already_delta = bin_with_dirty(100, 90);
    already_delta.is_delta = true;

    // percent=1 would normally force a full BIN here; the delta flag wins.
    let logs_delta = already_delta.should_log_delta(1);
    assert!(
        logs_delta,
        "isBINDelta() must short-circuit to true regardless of percent"
    );
}
12311
/// Guard for `numDeltas <= 0`: with no dirty slots at all the BIN is logged
/// in full, because an empty delta is invalid.
#[test]
fn should_log_delta_zero_dirty_is_full() {
    let clean = bin_with_dirty(100, 0);
    assert!(!clean.should_log_delta(25));
}
12318
/// The `isDeltaProhibited` bound: a BIN that was never logged in full
/// (`lastFullLsn == NULL`) and a BIN with `prohibit_next_delta` set must both
/// be logged as full BINs, even though 5 dirty slots out of 100 would
/// otherwise qualify for a delta.
#[test]
fn should_log_delta_prohibited_forces_full() {
    // Case 1: there is no prior full BIN to apply a delta to.
    let mut never_logged = bin_with_dirty(100, 5);
    never_logged.last_full_lsn = NULL_LSN;
    assert!(
        !never_logged.should_log_delta(25),
        "lastFullLsn==NULL must force a full BIN"
    );

    // Case 2: the prohibit flag is set (e.g. compress removed a dirty slot).
    let mut prohibited = bin_with_dirty(100, 5);
    prohibited.prohibit_next_delta = true;
    assert!(
        !prohibited.should_log_delta(25),
        "prohibit_next_delta must force a full BIN"
    );
}
12339
/// Logging a full BIN resets the prohibit flag (JE `IN.afterLog`:
/// `setProhibitNextDelta(false)`), so the following log may be a delta again.
/// This is the periodic-full chain bound.
#[test]
fn full_log_clears_prohibit_next_delta() {
    let mut bin = bin_with_dirty(100, 5);
    bin.prohibit_next_delta = true;
    assert!(!bin.should_log_delta(25), "prohibited → full");

    // Record the full log that the prohibition demanded.
    bin.clear_dirty_after_full_log(Lsn::new(2, 5));
    assert!(
        !bin.prohibit_next_delta,
        "full log must clear prohibit_next_delta"
    );

    // Dirty five slots again; a small delta must now be permitted.
    bin.entries
        .iter_mut()
        .take(5)
        .for_each(|slot| slot.dirty = true);
    assert!(
        bin.should_log_delta(25),
        "after a full log, a small delta is allowed again"
    );
}
12362
12363 // ========================================================================
12364 // Tests: Task #82 — 8 new Tree methods
12365 // ========================================================================
12366
12367 // --- is_root_resident ---
12368
/// A freshly constructed tree reports no resident root.
#[test]
fn test_is_root_resident_empty_tree() {
    let empty = Tree::new(1, 128);
    let resident = empty.is_root_resident();
    assert!(!resident, "empty tree has no resident root");
}
12374
/// After a single insert the tree reports a resident root.
#[test]
fn test_is_root_resident_after_insert() {
    let populated = Tree::new(1, 128);
    populated
        .insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
        .unwrap();
    let resident = populated.is_root_resident();
    assert!(resident, "root must be resident after insert");
}
12381
12382 // --- get_resident_root_in ---
12383
/// An empty tree has no resident root IN to hand out.
#[test]
fn test_get_resident_root_in_empty() {
    let empty = Tree::new(1, 128);
    let root = empty.get_resident_root_in();
    assert!(root.is_none());
}
12389
/// With one entry in the tree, `get_resident_root_in` must hand back the
/// very same `Arc` that `get_root` returns.
#[test]
fn test_get_resident_root_in_single_entry() {
    let tree = Tree::new(1, 128);
    tree.insert(b"hello".to_vec(), b"world".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let resident = tree.get_resident_root_in();
    assert!(resident.is_some(), "root must be Some after insert");

    let via_get_root = tree.get_root().unwrap();
    let resident = resident.unwrap();
    // Pointer identity, not structural equality.
    assert!(
        Arc::ptr_eq(&via_get_root, &resident),
        "get_resident_root_in must return the same Arc as get_root"
    );
}
12403
/// A tree grown to 20 entries with `Tree::new(1, 4)` still exposes a
/// resident root IN.
#[test]
fn test_get_resident_root_in_multi_entry() {
    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        let key = format!("rr{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });
    let root = tree.get_resident_root_in();
    assert!(root.is_some());
}
12413
12414 // --- get_parent_bin_for_child_ln ---
12415
/// Looking up the parent BIN of any key in an empty tree yields `None`.
#[test]
fn test_get_parent_bin_for_child_ln_empty_tree() {
    let empty = Tree::new(1, 128);
    let parent = empty.get_parent_bin_for_child_ln(b"key");
    assert!(parent.is_none());
}
12421
/// For a key that is present, the lookup returns a node and that node is a
/// BIN.
#[test]
fn test_get_parent_bin_for_child_ln_single_entry() {
    let tree = Tree::new(1, 128);
    tree.insert(b"alpha".to_vec(), b"val".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let parent = tree.get_parent_bin_for_child_ln(b"alpha");
    assert!(parent.is_some(), "must return Some for a present key");

    let node = parent.unwrap();
    assert!(node.read().is_bin(), "returned node must be a BIN");
}
12431
/// Every one of several inserted keys resolves to a parent node that is a
/// BIN.
#[test]
fn test_get_parent_bin_for_child_ln_multi_key() {
    let tree = Tree::new(1, 8);
    let keys: &[&[u8]] = &[b"aa", b"bb", b"cc", b"dd", b"ee"];

    // Insert everything first, then look each key up.
    keys.iter().copied().for_each(|k| {
        tree.insert(k.to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
    });

    for k in keys.iter().copied() {
        let parent = tree.get_parent_bin_for_child_ln(k);
        assert!(parent.is_some(), "must return Some for {:?}", k);
        let node = parent.unwrap();
        assert!(node.read().is_bin());
    }
}
12445
12446 // --- find_bin_for_insert ---
12447
/// An empty tree has no BIN to offer as an insertion target.
#[test]
fn test_find_bin_for_insert_empty_tree() {
    let empty = Tree::new(1, 128);
    let target = empty.find_bin_for_insert(b"newkey");
    assert!(target.is_none());
}
12453
/// In a non-empty tree, the insertion target for a not-yet-present key is a
/// BIN.
#[test]
fn test_find_bin_for_insert_returns_bin() {
    let tree = Tree::new(1, 128);
    tree.insert(b"existing".to_vec(), b"data".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let target = tree.find_bin_for_insert(b"newkey");
    assert!(target.is_some());

    let node = target.unwrap();
    assert!(node.read().is_bin());
}
12463
/// For a key that already exists, the insertion-target lookup and the
/// parent-BIN lookup must resolve to the same `Arc`.
#[test]
fn test_find_bin_for_insert_same_as_parent_bin() {
    let tree = Tree::new(1, 128);
    tree.insert(b"foo".to_vec(), b"bar".to_vec(), Lsn::new(1, 1)).unwrap();

    let parent_bin = tree.get_parent_bin_for_child_ln(b"foo").unwrap();
    let insert_bin = tree.find_bin_for_insert(b"foo").unwrap();

    let same_node = Arc::ptr_eq(&parent_bin, &insert_bin);
    assert!(
        same_node,
        "find_bin_for_insert must return the same BIN as get_parent_bin_for_child_ln"
    );
}
12475
12476 // --- search_splits_allowed ---
12477
/// Searching an empty tree with splits allowed yields `None`.
#[test]
fn test_search_splits_allowed_empty_tree() {
    let empty = Tree::new(1, 128);
    let outcome = empty.search_splits_allowed(b"k");
    assert!(outcome.is_none());
}
12483
/// Every key that was inserted must be reported as an exact match by
/// `search_splits_allowed`.
#[test]
fn test_search_splits_allowed_finds_existing_key() {
    let tree = Tree::new(1, 8);
    for i in 0u32..10 {
        let k = format!("sa{:04}", i).into_bytes();
        tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    for i in 0u32..10 {
        let k = format!("sa{:04}", i).into_bytes();
        let sr = tree.search_splits_allowed(&k);
        // `is_some_and` replaces the `is_some() && unwrap()` pair and matches
        // how the neighbouring tests inspect search results.
        assert!(
            sr.is_some_and(|r| r.exact_parent_found),
            "search_splits_allowed must find sa{:04}",
            i
        );
    }
}
12501
/// A key that was never inserted must not be reported as an exact match:
/// the search either returns `None` or a result without
/// `exact_parent_found`.
#[test]
fn test_search_splits_allowed_missing_key() {
    let tree = Tree::new(1, 8);
    tree.insert(b"present".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let outcome = tree.search_splits_allowed(b"absent");
    let found_exact = outcome.is_some_and(|r| r.exact_parent_found);
    assert!(
        !found_exact,
        "search_splits_allowed must not find absent key"
    );
}
12513
12514 // --- rebuild_in_list ---
12515
/// Rebuilding the IN list of an empty tree produces an empty list.
#[test]
fn test_rebuild_in_list_empty_tree() {
    let empty = Tree::new(1, 128);
    let nodes = empty.rebuild_in_list();
    assert!(nodes.is_empty());
}
12521
/// A tree holding one entry is expected to consist of a root IN plus one
/// BIN, so the rebuilt list must have exactly two nodes, one of each kind.
#[test]
fn test_rebuild_in_list_single_entry() {
    let tree = Tree::new(1, 128);
    tree.insert(b"one".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();

    let nodes = tree.rebuild_in_list();
    assert_eq!(
        nodes.len(),
        2,
        "single-entry tree must have exactly 2 nodes"
    );

    // Count the BINs once; every node that is not a BIN is an upper IN.
    let bin_count = nodes.iter().filter(|n| n.read().is_bin()).count();
    let has_bin = bin_count > 0;
    let has_in = bin_count < nodes.len();
    assert!(has_bin, "list must contain at least one BIN");
    assert!(has_in, "list must contain at least one upper IN");
}
12538
/// For a 20-entry tree, the rebuilt IN list must contain as many nodes as
/// the tree statistics report (upper INs plus BINs).
#[test]
fn test_rebuild_in_list_multi_entry() {
    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        let key = format!("ri{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    let nodes = tree.rebuild_in_list();
    let stats = tree.collect_stats();
    let expected_nodes = (stats.n_ins + stats.n_bins) as usize;

    assert_eq!(
        nodes.len(),
        expected_nodes,
        "rebuild_in_list must return all {} nodes",
        expected_nodes
    );
}
12556
12557 // --- validate_in_list ---
12558
/// An empty tree passes IN-list validation.
#[test]
fn test_validate_in_list_empty_tree() {
    let empty = Tree::new(1, 128);
    let valid = empty.validate_in_list();
    assert!(valid, "empty tree must be valid");
}
12564
/// A tree with a single entry passes IN-list validation.
#[test]
fn test_validate_in_list_single_entry() {
    let tree = Tree::new(1, 128);
    tree.insert(b"v".to_vec(), b"data".to_vec(), Lsn::new(1, 1)).unwrap();
    let valid = tree.validate_in_list();
    assert!(valid, "single-entry tree must be valid");
}
12571
/// A tree grown to 20 entries with `Tree::new(1, 4)` passes IN-list
/// validation.
#[test]
fn test_validate_in_list_multi_entry() {
    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        let key = format!("vl{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    });
    let valid = tree.validate_in_list();
    assert!(valid, "multi-entry tree must be valid");
}
12581
/// Validation must reject a tree whose root is an Internal node with no
/// entries. The root is installed by hand because normal inserts never
/// produce that shape.
#[test]
fn test_validate_in_list_empty_in_fails() {
    // An Internal node with zero slots: structurally invalid.
    let hollow_root = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![],
        targets: TargetRep::None,
        dirty: false,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(hollow_root)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let valid = tree.validate_in_list();
    assert!(
        !valid,
        "a tree with an empty Internal node must fail validation"
    );
}
12602
12603 // --- get_parent_in_for_child_in ---
12604
/// No node id can have a parent in an empty tree.
#[test]
fn test_get_parent_in_for_child_in_empty_tree() {
    let empty = Tree::new(1, 128);
    let parent = empty.get_parent_in_for_child_in(999);
    assert!(parent.is_none());
}
12610
/// In a single-insert tree (root IN → BIN), looking up the BIN's node id
/// must return the root IN itself together with slot 0.
#[test]
fn test_get_parent_in_for_child_in_single_entry() {
    let tree = Tree::new(1, 128);
    tree.insert(b"p".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();

    let root_arc = tree.get_root().unwrap();

    // Walk root → child 0 by hand to learn the BIN's node id.
    let bin_node_id = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root");
        };
        let child = root_in.child_ref(0).unwrap();
        let child_guard = child.read();
        let TreeNode::Bottom(bin) = &*child_guard else {
            panic!("expected BIN");
        };
        bin.node_id
    };

    let lookup = tree.get_parent_in_for_child_in(bin_node_id);
    assert!(lookup.is_some(), "must find parent of BIN");

    let (parent_arc, slot) = lookup.unwrap();
    assert!(Arc::ptr_eq(&parent_arc, &root_arc));
    assert_eq!(slot, 0);
}
12640
/// A node id that does not occur in the tree (`u64::MAX`) has no parent.
#[test]
fn test_get_parent_in_for_child_in_not_found() {
    let tree = Tree::new(1, 128);
    tree.insert(b"x".to_vec(), b"y".to_vec(), Lsn::new(1, 1)).unwrap();
    let parent = tree.get_parent_in_for_child_in(u64::MAX);
    assert!(parent.is_none());
}
12647
/// Every BIN in a multi-level tree must resolve to a parent, and that parent
/// must be an Internal node rather than another BIN.
#[test]
fn test_get_parent_in_for_child_in_multi_level() {
    // `Tree::new(1, 4)` with 20 keys is intended to produce a tree deep
    // enough to exercise the recursive descent.
    let tree = Tree::new(1, 4);
    for i in 0u32..20 {
        let k = format!("ml{:04}", i).into_bytes();
        tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    // Collect all BIN node_ids via rebuild_in_list.
    let nodes = tree.rebuild_in_list();
    let bin_ids: Vec<u64> = nodes
        .iter()
        .filter_map(|a| {
            let g = a.read();
            match &*g {
                TreeNode::Bottom(b) if g.is_bin() => Some(b.node_id),
                _ => None,
            }
        })
        .collect();

    // Without this guard the loop below would pass vacuously if no BIN were
    // collected at all.
    assert!(
        !bin_ids.is_empty(),
        "a populated tree must contain at least one BIN"
    );

    for bin_id in bin_ids {
        let result = tree.get_parent_in_for_child_in(bin_id);
        assert!(
            result.is_some(),
            "every BIN (id={}) must have a parent IN",
            bin_id
        );
        let (parent_arc, _slot) = result.unwrap();
        assert!(
            !parent_arc.read().is_bin(),
            "parent of a BIN must be an Internal node"
        );
    }
}
12686
/// H-9 regression: `BinStub::strip_lns` must really drop the slot data and
/// report the bytes freed, not merely adjust statistics.
///
/// Three phases are checked: slots with valid LSNs are stripped whether or
/// not they are dirty; a NULL-LSN slot keeps its data; and nothing is
/// stripped while `cursor_count` is non-zero.
#[test]
fn test_h9_strip_lns_actually_frees_data() {
    use crate::tree::{BinEntry, BinStub};
    use noxu_util::lsn::Lsn;

    // A live slot carrying `len` zero bytes of embedded data.
    let slot = |len: usize, dirty: bool| BinEntry {
        data: Some(vec![0u8; len]),
        known_deleted: false,
        dirty,
        expiration_time: 0,
    };

    let mut bin = BinStub {
        node_id: 1,
        level: 1,
        parent: None,
        generation: 0,
        cursor_count: 0,
        entries: Vec::new(),
        keys: KeyRep::new(),
        key_prefix: Vec::new(),
        lsn_rep: LsnRep::Empty,
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        last_full_lsn: Lsn::from_u64(0),
        last_delta_lsn: Lsn::from_u64(0),
        expiration_in_hours: true,
    };

    // Phase 1 setup: three slots with embedded data, the last one dirty.
    // JE-faithful rule: a slot with a valid LSN can be stripped regardless
    // of its dirty bit because its value is recoverable from the log; only
    // a NULL-LSN (never-logged / deferred-write) slot is kept
    // (EVICTOR-RECLAIM-1).
    bin.entries
        .extend([slot(64, false), slot(32, false), slot(16, true)]);
    // T-2: the key rep must stay aligned with the slots just added.
    bin.keys = KeyRep::from_keys(vec![
        b"a".to_vec(),
        b"b".to_vec(),
        b"c".to_vec(),
    ]);
    // Valid (non-NULL) LSNs make all three slots recoverable from the log.
    bin.set_lsn(0, Lsn::new(1, 100));
    bin.set_lsn(1, Lsn::new(1, 200));
    bin.set_lsn(2, Lsn::new(1, 300));

    let freed = bin.strip_lns();
    assert_eq!(
        freed,
        64 + 32 + 16,
        "all logged slots stripped regardless of dirty (JE evictLNs)"
    );
    assert!(bin.entries[0].data.is_none(), "logged slot data dropped");
    assert!(bin.entries[1].data.is_none(), "logged slot data dropped");
    assert!(
        bin.entries[2].data.is_none(),
        "dirty-but-logged slot data dropped (recoverable from log)"
    );

    // Phase 2: a never-logged slot holds the only copy of its value and
    // must survive stripping.
    bin.entries[0].data = Some(vec![0u8; 64]);
    bin.set_lsn(0, noxu_util::NULL_LSN);
    let freed_null = bin.strip_lns();
    assert_eq!(freed_null, 0, "NULL-LSN (unlogged) slot must NOT be stripped");
    assert!(
        bin.entries[0].data.is_some(),
        "unlogged slot data preserved"
    );

    // Phase 3: the slot is strippable again by LSN, but a cursor pins the
    // BIN, so stripping must be skipped.
    bin.set_lsn(0, Lsn::new(1, 100));
    bin.cursor_count = 1;
    let freed_with_cursor = bin.strip_lns();
    assert_eq!(
        freed_with_cursor, 0,
        "strip_lns must skip when cursor pinned"
    );
    assert!(
        bin.entries[0].data.is_some(),
        "data preserved while cursor pinned"
    );
}
12782
// St-H4: `upper_in_floor_index` (the binary search) must agree with a
// reference linear floor scan for every probe key: before all keys, after
// all keys, between keys, and on exact matches.
#[test]
fn test_upper_in_floor_index_matches_linear_scan() {
    // Reference floor (the pre-St-H4 algorithm). Slot 0 is the virtual −∞
    // key and always qualifies; from slot 1 onwards, count how many
    // consecutive entries have key ≤ probe. That count is the floor index.
    fn linear_floor(entries: &[InEntry], key: &[u8]) -> usize {
        entries
            .iter()
            .skip(1)
            .take_while(|entry| entry.key.as_slice() <= key)
            .count()
    }

    let tree = Tree::new(1, 256);

    // Slot sets of growing size: slot 0 is the empty key (sorts first), the
    // others are strictly ascending two-byte big-endian values 4, 8, 12, …
    // so that probes land before, on, between, and after them.
    for n_slots in 1usize..40 {
        let entries: Vec<InEntry> = std::iter::once(Vec::new())
            .chain(
                (1..n_slots).map(|i| ((i as u16) * 4).to_be_bytes().to_vec()),
            )
            .map(|key| InEntry { key })
            .collect();

        let last_probe = n_slots as u16 * 4 + 4;
        for probe in 0u16..=last_probe {
            let key = probe.to_be_bytes().to_vec();
            assert_eq!(
                tree.upper_in_floor_index(&entries, &key),
                linear_floor(&entries, &key),
                "floor mismatch: n_slots={n_slots}, key={key:?}"
            );
        }
    }
}
12828}
12829
12830// ─────────────────────────────────────────────────────────────────────────
12831// St-H6: BIN split inherits expiration_in_hours from the splitting BIN.
12832// ─────────────────────────────────────────────────────────────────────────
12833
/// Unit test for the St-H6 fix: the right-half sibling created by
/// `split_child` inherits `expiration_in_hours` from the splitting BIN.
///
/// Before the fix, the sibling was always created with
/// `expiration_in_hours = false`, causing hours-granularity TTL entries
/// (expiration_time ~495k) to be compared against `current_time_secs()`
/// (~1.78B) and treated as expired.
///
/// This test:
/// 1. Builds a root IN with a single BIN child by hand (bypassing
///    `update_key_expiration`); the BIN holds 4 entries with a non-zero
///    `expiration_time` and has `expiration_in_hours = true`.
/// 2. Triggers a split.
/// 3. Asserts that the right-half sibling has `expiration_in_hours = true`
///    (inherited, not hardcoded false) and that its entries keep their
///    expiration and do not appear expired.
///
/// The expiration is derived from the wall clock (now + 1 000 h) rather than
/// hard-coded, so the "not expired" assertion cannot start failing once a
/// fixed date has passed.
#[test]
fn test_split_child_sibling_inherits_expiration_in_hours() {
    use crate::tree::{BIN_LEVEL, BinEntry, BinStub, MAIN_LEVEL, TreeNode};
    use noxu_util::{Lsn, NULL_LSN};
    use parking_lot::RwLock;
    use std::sync::Arc;

    // Hours-since-epoch expiration that is always 1 000 h in the future.
    let now_hours = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("system clock is before the Unix epoch")
        .as_secs()
        / 3_600;
    let exp_hours = u32::try_from(now_hours)
        .expect("hours since epoch must fit in u32")
        + 1_000;

    // Manually build a tree with one BIN (4 entries, expiration_in_hours=true).
    let tree = Tree::new(99, 4);

    // Pre-populate the tree root for the test.
    let entries: Vec<BinEntry> = (0u8..4u8)
        .map(|k| BinEntry {
            data: Some(vec![k, k]),
            known_deleted: false,
            dirty: true,
            expiration_time: exp_hours, // hours-since-epoch value
        })
        .collect();
    let bin_keys: Vec<Vec<u8>> = (0u8..4u8).map(|k| vec![k]).collect();
    let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true, // hours-granularity entries
        cursor_count: 0,
        prohibit_next_delta: false,
        lsn_rep: LsnRep::Empty,
        keys: KeyRep::from_keys(bin_keys),
        compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
    })));

    let root = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![], // virtual key for slot 0 (-infinity)
        }],
        targets: TargetRep::Sparse(vec![(0, Arc::clone(&bin))]),
        dirty: true,
        generation: 0,
        parent: None,
        lsn_rep: LsnRep::Empty,
    })));
    {
        let mut b = bin.write();
        b.set_parent(Some(Arc::downgrade(&root)));
    }
    *tree.root.write() = Some(Arc::clone(&root));

    // Trigger split_child on the root.
    Tree::split_child(
        &root,
        0,
        4,
        Lsn::new(1, 500),
        SplitHint::Normal,
        &[],
        None,
        false,
        None,
    )
    .expect("split_child should succeed");

    // After the split: root has two children — left BIN and right sibling.
    let root_guard = root.read();
    let TreeNode::Internal(ref in_node) = *root_guard else {
        panic!("root should be Internal after split");
    };
    assert_eq!(
        in_node.entries.len(),
        2,
        "root should have 2 entries (children) after split"
    );

    // Right-half sibling is at slot 1.
    let sibling_arc = in_node
        .get_child(1)
        .expect("right-half sibling should exist at slot 1");
    let sibling_guard = sibling_arc.read();
    let TreeNode::Bottom(ref sibling) = *sibling_guard else {
        panic!("right sibling should be a BIN");
    };

    assert!(
        sibling.expiration_in_hours,
        "St-H6: right-half sibling expiration_in_hours must be true \
         (inherited from splitting BIN); got false"
    );

    // Verify the sibling's entries have the expected expiration_time.
    for e in &sibling.entries {
        assert_eq!(
            e.expiration_time, exp_hours,
            "sibling entry expiration_time should be preserved: got {}",
            e.expiration_time
        );
        // With in_hours=true, is_expired should return false (future).
        assert!(
            !noxu_util::ttl::is_expired(
                e.expiration_time,
                sibling.expiration_in_hours
            ),
            "St-H6: sibling TTL entry ({}) should NOT appear expired \
             with expiration_in_hours={}",
            e.expiration_time,
            sibling.expiration_in_hours
        );
    }
}
12966
/// Regression confirmation: `is_expired` with wrong `in_hours = false`
/// would falsely expire hours-granularity values.
///
/// The expiration value is derived from the current system clock
/// (now + 1 000 h) instead of a fixed hours-since-epoch literal. A literal
/// such as `495_630` (about mid-July 2026) turns the "not expired" assertion
/// into a time bomb that starts failing once real time passes it.
///
/// NOTE(review): this assumes `noxu_util::ttl::is_expired` compares against
/// the system wall clock — confirm if a test clock hook exists.
#[test]
fn test_hours_value_is_expired_only_with_false_flag() {
    // Hours since the Unix epoch for "now", plus a 1 000 h TTL.
    let now_hours = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .expect("system clock must not be before the Unix epoch")
        .as_secs()
        / 3_600;
    let exp_hours = u32::try_from(now_hours + 1_000)
        .expect("hours since epoch must fit in u32");

    // Correctly treated as hours: not expired.
    assert!(
        !noxu_util::ttl::is_expired(exp_hours, true),
        "exp_hours={exp_hours} should NOT be expired when in_hours=true"
    );
    // Interpreted with the wrong flag (pre-fix right sibling): expired.
    assert!(
        noxu_util::ttl::is_expired(exp_hours, false),
        "exp_hours={exp_hours} should be expired when in_hours=false \
         (St-H6 demonstrates the wrong-flag scenario)"
    );
}
12985
12986// =============================================================================
12987// IN-redo unit tests (DRIFT-1 / Stage 1)
12988// =============================================================================
12989
12990#[cfg(test)]
12991mod in_redo_tests {
12992 use super::*;
12993
12994 /// Build a BinStub with `n` entries (key = [i as u8], lsn = lsn(1, i))
12995 /// and serialise it. Returns (node_id, node_data_bytes).
12996 fn make_bin_bytes(node_id: u64, n: usize) -> Vec<u8> {
12997 let mut bin = BinStub {
12998 node_id,
12999 level: BIN_LEVEL,
13000 entries: Vec::new(),
13001 key_prefix: Vec::new(),
13002 dirty: false,
13003 is_delta: false,
13004 last_full_lsn: noxu_util::NULL_LSN,
13005 last_delta_lsn: noxu_util::NULL_LSN,
13006 generation: 0,
13007 parent: None,
13008 expiration_in_hours: true,
13009 cursor_count: 0,
13010 prohibit_next_delta: false,
13011 lsn_rep: LsnRep::Empty,
13012 keys: KeyRep::new(),
13013 compact_max_key_length: INKeyRep_DEFAULT_MAX_KEY_LENGTH,
13014 };
13015 for i in 0..n {
13016 // T-2/T-3: route through insert so entries/keys/lsn_rep stay
13017 // aligned; the serialized bytes are identical.
13018 bin.insert_with_prefix(
13019 vec![i as u8],
13020 Lsn::new(1, (i + 1) as u32),
13021 Some(vec![i as u8]),
13022 );
13023 }
13024 bin.serialize_full()
13025 }
13026
13027 /// Verify that recover_in_redo inserts a BIN as root when the tree is empty.
13028 ///
13029 /// JE RecoveryManager.recoverRootIN: `root == null` path.
13030 #[test]
13031 fn test_recover_in_redo_root_bin_inserted_into_empty_tree() {
13032 let tree = Tree::new(42, 128);
13033 assert!(tree.is_empty());
13034 let bytes = make_bin_bytes(1, 3);
13035 let log_lsn = Lsn::new(1, 100);
13036 let result = tree.recover_in_redo(
13037 log_lsn, /*is_root=*/ true, /*is_bin=*/ true, &bytes,
13038 );
13039 assert_eq!(result, InRedoResult::Inserted, "expected Inserted");
13040 // Tree should now have 3 entries.
13041 assert_eq!(tree.count_entries(), 3);
13042 }
13043
13044 /// Verify that recover_in_redo replaces a root BIN when the logged version is newer.
13045 ///
13046 /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) < 0` path.
13047 #[test]
13048 fn test_recover_in_redo_root_bin_replaced_when_log_newer() {
13049 let tree = Tree::new(42, 128);
13050 // Install an old root (2 entries, older LSN).
13051 let old_bytes = make_bin_bytes(1, 2);
13052 let old_lsn = Lsn::new(1, 50);
13053 tree.recover_in_redo(old_lsn, true, true, &old_bytes);
13054 assert_eq!(tree.count_entries(), 2);
13055 // Replay with newer LSN and 4 entries.
13056 let new_bytes = make_bin_bytes(1, 4);
13057 let new_lsn = Lsn::new(1, 100);
13058 let result = tree.recover_in_redo(new_lsn, true, true, &new_bytes);
13059 assert_eq!(result, InRedoResult::Replaced);
13060 assert_eq!(tree.count_entries(), 4);
13061 }
13062
13063 /// Verify that an older logged BIN does NOT replace a newer in-memory root.
13064 ///
13065 /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) >= 0` skip path.
13066 #[test]
13067 fn test_recover_in_redo_root_bin_skipped_when_tree_newer() {
13068 let tree = Tree::new(42, 128);
13069 // Install a newer root.
13070 let new_bytes = make_bin_bytes(1, 4);
13071 let new_lsn = Lsn::new(1, 200);
13072 tree.recover_in_redo(new_lsn, true, true, &new_bytes);
13073 // Attempt to replay an older version.
13074 let old_bytes = make_bin_bytes(1, 2);
13075 let old_lsn = Lsn::new(1, 100);
13076 let result = tree.recover_in_redo(old_lsn, true, true, &old_bytes);
13077 assert_eq!(result, InRedoResult::Skipped);
13078 // Tree still holds the newer 4-entry version.
13079 assert_eq!(tree.count_entries(), 4);
13080 }
13081
13082 /// deserialize_bin round-trips through serialize_full.
13083 #[test]
13084 fn test_deserialize_bin_round_trip() {
13085 let bytes = make_bin_bytes(99, 5);
13086 let bin = Tree::deserialize_bin(&bytes).expect("must deserialize");
13087 assert_eq!(bin.node_id, 99);
13088 assert_eq!(bin.entries.len(), 5);
13089 for i in 0..bin.entries.len() {
13090 assert_eq!(bin.get_full_key(i).unwrap(), vec![i as u8]);
13091 }
13092 }
13093
13094 /// deserialize_upper_in round-trips through write_to_bytes (Internal).
13095 #[test]
13096 fn test_deserialize_upper_in_round_trip() {
13097 // Build an InNodeStub and serialize via write_to_bytes.
13098 let node = TreeNode::Internal(InNodeStub {
13099 node_id: 77,
13100 level: 0x10002,
13101 entries: vec![
13102 InEntry { key: vec![1, 2, 3] },
13103 InEntry { key: vec![4, 5, 6] },
13104 ],
13105 targets: TargetRep::None,
13106 dirty: false,
13107 generation: 0,
13108 parent: None,
13109 lsn_rep: LsnRep::Empty,
13110 });
13111 let bytes = node.write_to_bytes();
13112 let restored =
13113 Tree::deserialize_upper_in(&bytes).expect("must deserialize");
13114 assert_eq!(restored.node_id, 77);
13115 assert_eq!(restored.level, 0x10002);
13116 assert_eq!(restored.entries.len(), 2);
13117 assert_eq!(restored.entries[0].key, vec![1, 2, 3]);
13118 assert_eq!(restored.entries[1].key, vec![4, 5, 6]);
13119 }
13120}
13121
13122// --- Part 2 acceptance tests: key_prefixing flag (DRIFT-3) ---
13123//
13124// JE `IN.computeKeyPrefix` returns null when `databaseImpl.getKeyPrefixing()`
13125// is false, so no prefix compression is ever applied to those BINs. Noxu was
13126// always applying prefix compression. This checks that the flag is honoured.
13127//
13128// Ref: `IN.java computeKeyPrefix` ~line 2456,
13129// `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
13130#[cfg(test)]
13131mod key_prefixing_tests {
13132 use super::*;
13133
13134 /// Helper: find the first (leftmost) BIN in the tree.
13135 fn find_first_bin(node: &Arc<RwLock<TreeNode>>) -> Arc<RwLock<TreeNode>> {
13136 let child_opt = {
13137 let g = node.read();
13138 match &*g {
13139 TreeNode::Bottom(_) => None,
13140 TreeNode::Internal(n) => {
13141 Some(Arc::clone(n.child_ref(0).expect("child")))
13142 }
13143 }
13144 };
13145 match child_opt {
13146 None => Arc::clone(node),
13147 Some(child) => find_first_bin(&child),
13148 }
13149 }
13150
13151 /// With `key_prefixing = false` (the default), keys must be stored without
13152 /// any prefix: the BIN's `key_prefix` must remain empty after inserts.
13153 #[test]
13154 fn test_key_prefixing_false_stores_full_keys() {
13155 // Default is key_prefixing = false.
13156 let tree = Tree::new(1, 16);
13157 assert!(!tree.key_prefixing, "default must be false");
13158
13159 let lsn = noxu_util::Lsn::new(1, 10);
13160 // Insert keys with a long common prefix.
13161 for i in 0u8..8 {
13162 let key = vec![b'r', b'e', b'c', b'o', b'r', b'd', b':', i];
13163 tree.insert(key, vec![i], lsn).expect("insert");
13164 }
13165
13166 let root = tree.get_root().expect("root");
13167 let bin_arc = find_first_bin(&root);
13168 let guard = bin_arc.read();
13169 let TreeNode::Bottom(ref bin) = *guard else {
13170 panic!("must be a BIN");
13171 };
13172 assert!(
13173 bin.key_prefix.is_empty(),
13174 "key_prefix must be empty when key_prefixing=false, got {:?}",
13175 bin.key_prefix
13176 );
13177 assert_eq!(bin.entries.len(), 8);
13178 // Keys must be stored as full keys.
13179 assert_eq!(
13180 bin.get_full_key(0).unwrap(),
13181 vec![b'r', b'e', b'c', b'o', b'r', b'd', b':', 0]
13182 );
13183 }
13184
13185 /// With `key_prefixing = true`, keys with a common prefix are compressed:
13186 /// the BIN's `key_prefix` must be non-empty.
13187 #[test]
13188 fn test_key_prefixing_true_compresses_keys() {
13189 let mut tree = Tree::new(1, 16);
13190 tree.set_key_prefixing(true);
13191
13192 let lsn = noxu_util::Lsn::new(1, 10);
13193 for i in 0u8..8 {
13194 let key = vec![b'r', b'e', b'c', b'o', b'r', b'd', b':', i];
13195 tree.insert(key, vec![i], lsn).expect("insert");
13196 }
13197
13198 let root = tree.get_root().expect("root");
13199 let bin_arc = find_first_bin(&root);
13200 let guard = bin_arc.read();
13201 let TreeNode::Bottom(ref bin) = *guard else {
13202 panic!("must be a BIN");
13203 };
13204 // Prefix compression must kick in: all keys share "record:".
13205 assert!(
13206 !bin.key_prefix.is_empty(),
13207 "key_prefix must be non-empty when key_prefixing=true"
13208 );
13209 assert_eq!(
13210 bin.key_prefix,
13211 b"record:".to_vec(),
13212 "prefix must be the common prefix of all inserted keys"
13213 );
13214 }
13215
13216 /// Custom-comparator databases (sorted-dup) always bypass prefix
13217 /// regardless of key_prefixing: `insert_cmp` does not touch key_prefix.
13218 #[test]
13219 fn test_key_prefixing_custom_comparator_no_prefix() {
13220 let cmp: KeyComparatorFn = Arc::new(|a: &[u8], b: &[u8]| a.cmp(b));
13221 let mut tree = Tree::new_with_comparator(1, 16, cmp);
13222 // Enable key_prefixing — should have no effect via insert_cmp path.
13223 tree.set_key_prefixing(true);
13224
13225 let lsn = noxu_util::Lsn::new(1, 10);
13226 for i in 0u8..8 {
13227 let key = vec![b'r', b'e', b'c', b'o', b'r', b'd', b':', i];
13228 tree.insert(key, vec![i], lsn).expect("insert");
13229 }
13230
13231 let root = tree.get_root().expect("root");
13232 let bin_arc = find_first_bin(&root);
13233 let guard = bin_arc.read();
13234 let TreeNode::Bottom(ref bin) = *guard else {
13235 panic!("must be a BIN");
13236 };
13237 // Custom-comparator path (insert_cmp) does not set key_prefix.
13238 assert!(
13239 bin.key_prefix.is_empty(),
13240 "custom-comparator path must not set key_prefix"
13241 );
13242 }
13243}
13244
13245// --- Part 1 acceptance tests: splitSpecial heuristic (DRIFT-1) ---
13246//
13247// JE `IN.splitSpecial` / `Tree.forceSplit`: when all routing decisions during
13248// descent are leftmost (`AllLeft`) or rightmost (`AllRight`), the split index
13249// is forced to 1 or `n-1` respectively instead of `n/2`. This halves the
13250// number of splits for monotonically increasing / decreasing key workloads
13251// (sequential append / prepend) because each split leaves the BIN near-full.
13252//
13253// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
13254#[cfg(test)]
13255mod split_special_tests {
13256 use super::*;
13257
13258 /// Test helper: descend the tree to the BIN that holds (or would hold)
13259 /// `key`, returning its arc. Mirrors the read-path descent used by
13260 /// `Tree::search`; sufficient for unit tests that need to mutate a slot.
13261 fn find_bin_arc_for_key(
13262 node_arc: &Arc<RwLock<TreeNode>>,
13263 key: &[u8],
13264 ) -> Option<Arc<RwLock<TreeNode>>> {
13265 let mut current = node_arc.clone();
13266 loop {
13267 let next = {
13268 let g = current.read();
13269 match &*g {
13270 TreeNode::Bottom(_) => return Some(current.clone()),
13271 TreeNode::Internal(n) => {
13272 if n.entries.is_empty() {
13273 return None;
13274 }
13275 let mut idx = 0usize;
13276 for (i, e) in n.entries.iter().enumerate() {
13277 if i == 0 || e.key.as_slice() <= key {
13278 idx = i;
13279 } else {
13280 break;
13281 }
13282 }
13283 n.get_child(idx)?
13284 }
13285 }
13286 };
13287 current = next;
13288 }
13289 }
13290
13291 /// Count total leaf (BIN) nodes in the tree by DFS.
13292 fn count_bins(node: &Arc<RwLock<TreeNode>>) -> usize {
13293 let g = node.read();
13294 match &*g {
13295 TreeNode::Bottom(_) => 1,
13296 TreeNode::Internal(n) => {
13297 n.resident_children().iter().map(count_bins).sum()
13298 }
13299 }
13300 }
13301
13302 /// Return total key count across all BINs.
13303 fn count_keys(node: &Arc<RwLock<TreeNode>>) -> usize {
13304 let g = node.read();
13305 match &*g {
13306 TreeNode::Bottom(b) => b.entries.len(),
13307 TreeNode::Internal(n) => {
13308 n.resident_children().iter().map(count_keys).sum()
13309 }
13310 }
13311 }
13312
13313 /// Returns the number of entries in the leftmost BIN.
13314 fn leftmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
13315 let g = node.read();
13316 match &*g {
13317 TreeNode::Bottom(b) => b.entries.len(),
13318 TreeNode::Internal(n) => {
13319 let first_child = n.child_ref(0).expect("child");
13320 leftmost_bin_size(first_child)
13321 }
13322 }
13323 }
13324
13325 /// Returns the number of entries in the rightmost BIN.
13326 fn rightmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
13327 let g = node.read();
13328 match &*g {
13329 TreeNode::Bottom(b) => b.entries.len(),
13330 TreeNode::Internal(n) => {
13331 let last_child = n
13332 .child_ref(n.entries.len().saturating_sub(1))
13333 .expect("child");
13334 rightmost_bin_size(last_child)
13335 }
13336 }
13337 }
13338
13339 /// `splitSpecial` ascending: each right-side split leaves the left BIN
13340 /// near-full (all but one entry stays). Compared to midpoint split
13341 /// the number of BINs created should be significantly fewer relative to
13342 /// keys inserted (more keys per BIN on average).
13343 ///
13344 /// JE criterion: `allRightSideDescent` → `splitIndex = nEntries - 1`.
13345 /// The penultimate entry stays in the left BIN; only one entry goes to
13346 /// the new right sibling, which then absorbs the next insert and fills
13347 /// normally.
13348 #[test]
13349 fn test_split_special_ascending_fewer_bins_than_midpoint() {
13350 let max_entries = 8usize;
13351 let n_keys = 200usize;
13352
13353 // Build tree with splitSpecial (ascending keys trigger AllRight).
13354 let tree_special = Tree::new(1, max_entries);
13355 let lsn = noxu_util::Lsn::new(1, 100);
13356 for i in 0u32..n_keys as u32 {
13357 let key = i.to_be_bytes().to_vec();
13358 tree_special.insert(key, vec![0u8], lsn).expect("insert");
13359 }
13360
13361 let root_special = tree_special.get_root().expect("root must exist");
13362 let bins_special = count_bins(&root_special);
13363 let keys_special = count_keys(&root_special);
13364
13365 // All keys must be present.
13366 assert_eq!(keys_special, n_keys, "all keys must be stored");
13367
13368 // With splitSpecial, each right-side split keeps n-1 entries in the
13369 // left BIN. Ideal: ceil(n_keys / (max_entries - 1)) BINs.
13370 // Without splitSpecial (midpoint): ceil(n_keys / (max_entries / 2)).
13371 // We assert the actual count is below the midpoint-split upper bound.
13372 let midpoint_upper_bound = n_keys.div_ceil(max_entries / 2);
13373 assert!(
13374 bins_special < midpoint_upper_bound,
13375 "splitSpecial should produce fewer BINs than midpoint split: \
13376 got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
13377 );
13378
13379 // The rightmost BIN must have fewer entries than max_entries
13380 // (the last insert only half-fills it at most), which is expected.
13381 // The IMPORTANT property: rightmost BIN started with exactly 1 entry
13382 // (its first entry was the split-off singleton) then filled up.
13383 // We just verify overall key density > midpoint baseline.
13384 let avg_fill = keys_special as f64 / bins_special as f64;
13385 let midpoint_fill = (max_entries / 2) as f64;
13386 assert!(
13387 avg_fill > midpoint_fill,
13388 "average fill per BIN with splitSpecial ({avg_fill:.1}) should \
13389 exceed midpoint baseline ({midpoint_fill})"
13390 );
13391 }
13392
13393 /// `splitSpecial` descending: all routing decisions are at slot 0
13394 /// (`AllLeft`). Split forces `split_index = 1` so the right sibling
13395 /// gets almost all entries and the left node keeps just one.
13396 ///
13397 /// JE criterion: `allLeftSideDescent` → `splitIndex = 1`.
13398 #[test]
13399 fn test_split_special_descending_fewer_bins_than_midpoint() {
13400 let max_entries = 8usize;
13401 let n_keys = 200usize;
13402
13403 let tree_special = Tree::new(1, max_entries);
13404 let lsn = noxu_util::Lsn::new(1, 100);
13405 for i in (0u32..n_keys as u32).rev() {
13406 let key = i.to_be_bytes().to_vec();
13407 tree_special.insert(key, vec![0u8], lsn).expect("insert");
13408 }
13409
13410 let root_special = tree_special.get_root().expect("root must exist");
13411 let bins_special = count_bins(&root_special);
13412 let keys_special = count_keys(&root_special);
13413
13414 assert_eq!(keys_special, n_keys, "all keys must be stored");
13415
13416 let midpoint_upper_bound = n_keys.div_ceil(max_entries / 2);
13417 assert!(
13418 bins_special < midpoint_upper_bound,
13419 "splitSpecial descending should produce fewer BINs: \
13420 got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
13421 );
13422 }
13423
13424 /// Random-key inserts must NOT be affected by splitSpecial: with random
13425 /// keys descent will rarely be all-left or all-right, so the split index
13426 /// defaults to midpoint and tree balance is maintained.
13427 #[test]
13428 fn test_split_special_random_inserts_stay_balanced() {
13429 use std::collections::BTreeSet;
13430
13431 let max_entries = 8usize;
13432 // Use a fixed permutation so the test is deterministic.
13433 let mut keys: Vec<u32> = (0u32..200).collect();
13434 // Knuth shuffle with a fixed seed.
13435 let mut rng: u64 = 0xdeadbeef_cafebabe;
13436 for i in (1..keys.len()).rev() {
13437 rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1);
13438 let j = (rng >> 33) as usize % (i + 1);
13439 keys.swap(i, j);
13440 }
13441
13442 let tree = Tree::new(1, max_entries);
13443 let lsn = noxu_util::Lsn::new(1, 100);
13444 let mut inserted = BTreeSet::new();
13445 for k in &keys {
13446 let key = k.to_be_bytes().to_vec();
13447 tree.insert(key, vec![0u8], lsn).expect("insert");
13448 inserted.insert(*k);
13449 }
13450
13451 let root = tree.get_root().expect("root");
13452 let total_keys = count_keys(&root);
13453 assert_eq!(
13454 total_keys,
13455 inserted.len(),
13456 "all random keys must be stored"
13457 );
13458
13459 // Verify every key is findable.
13460 for k in &inserted {
13461 let key = k.to_be_bytes().to_vec();
13462 let found = tree.search(&key);
13463 assert!(
13464 found.map(|r| r.is_exact_match()).unwrap_or(false),
13465 "random key {k} must be findable after insert"
13466 );
13467 }
13468 }
13469
    /// TREE-F1: a `known_deleted` BIN slot must read as ABSENT on an exact
    /// lookup and must be SKIPPED by scans, matching JE.
    ///
    /// JE contract:
    ///   * `IN.findEntry` (IN.java:3197): an exact match that lands on a
    ///     known-deleted slot returns -1 (ABSENT).
    ///   * `CursorImpl.lockAndGetCurrent` (CursorImpl.java:2062-2064): a
    ///     step that lands on `isEntryKnownDeleted(index)` returns null, so
    ///     the `getNext` loop advances past it (the slot is skipped).
    ///
    /// KD slots legitimately exist in live BINs during BIN-delta
    /// reconstitution (`mutate_to_full_bin` applies delta KD slots) until
    /// the compressor reclaims them. We reach that state directly here by
    /// marking a slot known_deleted in the BIN arc, then assert the
    /// user-facing read/scan paths do not surface it.
    ///
    /// Checks performed, in order: `Tree::search`, `Tree::search_with_data`,
    /// live neighbours, `Tree::descend_to_edge_bin`, `get_next_bin` /
    /// `get_prev_bin`, `first_entry_at_or_after`, and finally
    /// `collect_bins_with_known_deleted`.
    #[test]
    fn test_tree_f1_known_deleted_slot_is_absent_and_skipped() {
        let tree = Tree::new(1, 8);
        // Insert enough keys to populate a BIN with several live slots.
        // Six keys ("kd0000".."kd0005") stay below the max-entries value of
        // 8 passed to Tree::new above.
        for i in 0..6u32 {
            let key = format!("kd{i:04}").into_bytes();
            tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
        }

        // Pick a middle key and mark its slot known_deleted directly in the
        // BIN, modelling a delta-applied tombstone the compressor has not yet
        // reclaimed.
        let kd_key = b"kd0003".to_vec();
        {
            // Scoped so the write guard is dropped before the reads below.
            let root = tree.get_root().expect("root");
            let bin_arc = find_bin_arc_for_key(&root, &kd_key).expect("bin");
            let mut g = bin_arc.write();
            if let TreeNode::Bottom(b) = &mut *g {
                // Locate the slot by comparing full keys.
                let idx = (0..b.entries.len())
                    .find(|&i| {
                        b.get_full_key(i).as_deref() == Some(kd_key.as_slice())
                    })
                    .expect("kd key slot");
                b.entries[idx].known_deleted = true;
            } else {
                panic!("expected BIN");
            }
        }

        // (a) exact lookup via Tree::search must report NOT found.
        let sr = tree.search(&kd_key);
        assert!(
            !sr.map(|r| r.is_exact_match()).unwrap_or(false),
            "TREE-F1: Tree::search must report a known_deleted slot as absent \
             (IN.findEntry IN.java:3197)"
        );

        // (a) exact lookup via Tree::search_with_data must report NOT found.
        let sf = tree.search_with_data(&kd_key).expect("slot fetch");
        assert!(
            !sf.found,
            "TREE-F1: Tree::search_with_data must report a known_deleted slot \
             as absent (IN.findEntry IN.java:3197)"
        );

        // Live neighbours must still be found.
        for live in [b"kd0002".to_vec(), b"kd0004".to_vec()] {
            assert!(
                tree.search(&live).map(|r| r.is_exact_match()).unwrap_or(false),
                "live neighbour must remain findable"
            );
        }

        // (b) a scan-facing BIN dump (descend_to_edge_bin / get_next_bin /
        // get_prev_bin) returns slots verbatim WITH the known_deleted flag
        // set, so the cursor can skip them (CursorImpl.java:2062-2064). The
        // contract here is: the KD slot is never reported as a LIVE entry.
        let root = tree.get_root().expect("root");
        let edge = Tree::descend_to_edge_bin(&root, true).expect("edge bin");
        assert!(
            !edge.iter().any(|(e, _, k)| k == &kd_key && !e.known_deleted),
            "TREE-F1: scan must not surface a known_deleted slot as live \
             (CursorImpl.java:2062-2064)"
        );
        // Probe from the first and last inserted keys; `flatten` drops any
        // neighbour lookup that yields nothing.
        for anchor in [b"kd0000".to_vec(), b"kd0005".to_vec()] {
            for entries in
                [tree.get_next_bin(&anchor), tree.get_prev_bin(&anchor)]
                    .into_iter()
                    .flatten()
            {
                assert!(
                    !entries
                        .iter()
                        .any(|(e, _, k)| k == &kd_key && !e.known_deleted),
                    "TREE-F1: get_next_bin/get_prev_bin must not surface a \
                     known_deleted slot as live"
                );
            }
        }

        // first_entry_at_or_after must skip a KD slot at the boundary.
        // NOTE(review): this passes vacuously when the call returns None.
        if let Some((k, _, _)) = tree.first_entry_at_or_after(&kd_key) {
            assert_ne!(
                k, kd_key,
                "TREE-F1: first_entry_at_or_after must skip a known_deleted \
                 slot (CursorImpl.java:2062-2064)"
            );
        }

        // The compressor KD-iteration path must STILL see the slot — the fix
        // only changes the user-facing read predicate, not the maintenance
        // iteration that exists to reclaim KD slots.
        let kd_bins = tree.collect_bins_with_known_deleted();
        assert!(
            !kd_bins.is_empty(),
            "TREE-F1: collect_bins_with_known_deleted must still observe the \
             KD slot so the compressor can reclaim it"
        );
    }
13584}