noxu_tree/tree.rs
1//! B+tree implementation.
2//!
3//!
4//! Tree implements the B+tree. It provides search, insert, and delete
5//! operations on the tree structure. The tree uses latch-coupling for
6//! concurrent access: when traversing down the tree, the parent latch
7//! is released after the child latch is acquired.
8//!
9//! # Architecture
10//!
11//! The tree has a hierarchical structure:
12//! - Internal Nodes (IN) at levels 2 and above
13//! - Bottom Internal Nodes (BIN) at level 1
14//! - Leaf Nodes (LN) containing actual data
15//!
16//! # Locking Strategy
17//!
18//! - Root latch protects the root pointer itself
19//! - Each node has its own latch for concurrent access
20//! - Search uses latch-coupling: acquire child, release parent
21//! - Modifications may require exclusive latches
22
23use crate::error::TreeError;
24use crate::key::{create_key_prefix, get_key_prefix_length};
25use crate::search_result::SearchResult;
26use noxu_latch::{LatchContext, SharedLatch};
27use noxu_util::{Lsn, NULL_LSN};
28use parking_lot::RwLock;
29use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
30use std::sync::{Arc, Weak};
31
/// Observer that mirrors JE's `INList` feeding the evictor's `LRUList`s.
///
/// The tree owns no eviction policy of its own; instead it notifies a
/// registered listener whenever an IN/BIN node enters the resident cache, is
/// accessed, or is removed. The `Evictor` (in `noxu-evictor`) implements this
/// trait, but the dependency is one-way (`noxu-evictor` → `noxu-tree`), so the
/// tree refers to the listener only through this trait object — avoiding a
/// circular crate dependency.
///
/// Implementors must be `Send + Sync`: the tree holds the listener as an
/// `Arc<dyn InListListener>`. Every callback receives only the `node_id` of
/// the affected node, never the node itself.
///
/// JE reference: `IN.fetchTarget` / split / `rebuildINList` call
/// `Evictor.addBack`; node access calls `Evictor.moveBack`; node removal
/// calls `Evictor.remove`.
pub trait InListListener: Send + Sync {
    /// A node has just become resident in the cache (JE `Evictor.addBack`).
    ///
    /// `node_id` identifies the node that was added.
    fn note_ins_added(&self, node_id: u64);
    /// A resident node was accessed (JE `Evictor.moveBack` — LRU touch).
    ///
    /// `node_id` identifies the node that was accessed.
    fn note_ins_accessed(&self, node_id: u64);
    /// A node was removed from the cache (JE `Evictor.remove`).
    ///
    /// `node_id` identifies the node that was removed.
    fn note_ins_removed(&self, node_id: u64);
}
52
// Level and flag constants re-exported here for tree-internal use.

/// Level flag with bit 17 set (`0x20000`).
/// NOTE(review): presumably tags levels that belong to the DB-mapping tree,
/// as JE `IN.DBMAP_LEVEL` does — confirm against callers.
pub const DBMAP_LEVEL: i32 = 0x20000;
/// Level flag with bit 16 set (`0x10000`); `BIN_LEVEL` is built on top of it.
/// NOTE(review): presumably tags levels of a main (non-mapping) tree — confirm.
pub const MAIN_LEVEL: i32 = 0x10000;
/// Mask selecting the low 16 bits of a level value, i.e. the bits below the
/// `MAIN_LEVEL` / `DBMAP_LEVEL` flag bits.
pub const LEVEL_MASK: i32 = 0x0ffff;
/// NOTE(review): looks like a sentinel that sorts below every real level —
/// confirm usage.
pub const MIN_LEVEL: i32 = -1;
/// Level value of a BIN: the `MAIN_LEVEL` flag combined with level number 1.
pub const BIN_LEVEL: i32 = MAIN_LEVEL | 1;
/// Flag with bit 16 set; numerically identical to `MAIN_LEVEL`.
/// NOTE(review): presumably OR-ed into a find-entry result to signal an exact
/// key match (JE `IN.EXACT_MATCH`) — confirm.
pub const EXACT_MATCH: i32 = 1 << 16;
/// Flag with bit 17 set; numerically identical to `DBMAP_LEVEL`.
/// NOTE(review): presumably OR-ed into an insert result to signal that a new
/// slot was created (JE `IN.INSERT_SUCCESS`) — confirm.
pub const INSERT_SUCCESS: i32 = 1 << 17;
61
/// Per-slot fixed memory overhead for a BIN entry, in bytes (DBI-23).
///
/// This is the footprint of one `BinEntry` *struct* as it lives inside the
/// BIN's `Vec<BinEntry>` buffer — NOT counting the variable-length key and
/// data bytes, which are separate heap allocations counted on top of this.
///
/// Faithful to JE `IN.getEntryInMemorySize` + the per-slot `entryStates` /
/// LSN-array overhead folded into `IN.computeMemorySize` (IN.java ~4632):
/// JE measures the slot's fixed cost with `Sizeof` on the JVM; here the value
/// is taken directly from `size_of::<BinEntry>()`. The previous magic
/// constant `48` *undercounted* every BIN slot (a `BinEntry` is 64 bytes), so
/// the live budget read below real heap and the evictor under-fired.
///
/// Derived (not hard-coded) so a layout change to `BinEntry` is tracked
/// automatically — see `bin_stub_conformance` for the drift guard.
pub const BIN_ENTRY_OVERHEAD: usize = std::mem::size_of::<BinEntry>();
78
/// Per-slot fixed memory overhead for an IN entry, in bytes (DBI-23).
///
/// Footprint of one `InEntry` struct inside the IN's `Vec<InEntry>` buffer,
/// computed as `size_of::<InEntry>()`; the variable-length key bytes are a
/// separate allocation and are counted separately.
///
/// JE reference: `IN.getEntryInMemorySize` for an upper IN plus the per-slot
/// state/LSN/target overhead from `IN.computeMemorySize`.
pub const IN_ENTRY_OVERHEAD: usize = std::mem::size_of::<InEntry>();
86
/// Type alias for the key comparator used by sorted-duplicate databases.
///
/// The comparator takes two full (uncompressed) keys and returns their
/// relative ordering. For sorted-dup databases this is `DupKeyData::compare`,
/// which splits each key into primary + data parts and applies separate
/// comparators to each. For normal databases the comparator field is `None`
/// and lexicographic byte comparison is used.
///
/// The closure is reference-counted (`Arc`) and bounded by `Send + Sync`, so
/// one comparator instance can be shared between owners and threads.
///
/// JE reference: `DatabaseImpl.btreeComparator` /
/// `DatabaseImpl.dupComparator`.
pub type KeyComparatorFn =
    Arc<dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering + Send + Sync>;
98
/// Combined search result carrying slot data and the BIN arc, returned by
/// [`Tree::search_with_data`].
///
/// Avoids the double-descent pattern where `Tree::search` checked key
/// existence and a second call re-descended to fetch the actual slot bytes.
/// One descent now serves both purposes (Wave-11-I optimisation).
pub struct SlotFetch {
    /// `true` if an exact key match was found and is not expired.
    pub found: bool,
    /// Data bytes for the slot (`None` when `found` is `false`).
    pub data: Option<Vec<u8>>,
    /// Raw slot LSN as `u64`; zero when `found` is `false`.
    pub lsn: u64,
    /// Slot index within the BIN. Set to the actual BIN slot index when
    /// `found` is `true`; `0` otherwise.
    ///
    /// Used by `CursorImpl` to set `current_index` correctly so that
    /// `retrieve_next` advances to the right slot after a search.
    pub slot_index: usize,
    /// Arc to the BIN that the descent reached.
    ///
    /// This field is a plain `Arc`, not an `Option`: a `SlotFetch` always
    /// carries a BIN, regardless of whether `found` is `true`.
    pub bin_arc: Arc<RwLock<TreeNode>>,
}
122
/// The B+tree.
///
/// This is the main tree structure that manages the B+tree nodes and
/// provides operations for search, insert, delete, and tree maintenance.
pub struct Tree {
    /// Database ID this tree belongs to.
    database_id: u64,

    /// Maximum entries per node (from config).
    max_entries_per_node: usize,

    /// Root of the tree. None if tree is empty.
    ///
    /// Wrapped in `RwLock` so that `insert`, `delete`, and other mutating
    /// operations can take `&self` (interior mutability), enabling concurrent
    /// access to different BIN nodes without requiring a global `&mut Tree`
    /// borrow. The root pointer itself is only written during root splits
    /// and initial creation; all other access is read-only.
    ///
    /// JE reference: `Tree.root`, protected by the root latch.
    root: RwLock<Option<Arc<RwLock<TreeNode>>>>,

    /// Latch protecting the root reference itself.
    /// Must be held when changing the root pointer.
    root_latch: SharedLatch,

    /// LSN at which the current root IN/BIN was last logged.
    ///
    /// Used by the IN-redo currency check (`recover_root_bin` /
    /// `recover_root_upper_in`) to decide whether a logged root replaces the
    /// in-memory one. Updated whenever a new root is installed via
    /// `set_root_with_lsn` or the IN-redo recover-root path.
    ///
    /// JE `RootUpdater.originalLsn` / `ChildReference.getLsn()` for the root.
    root_log_lsn: RwLock<noxu_util::Lsn>,

    /// Statistics: number of times the root has been split.
    root_splits: AtomicU64,

    /// Statistics: number of latch upgrades from shared to exclusive.
    relatches_required: AtomicU64,

    /// Optional custom key comparator for sorted-duplicate databases.
    ///
    /// When `Some`, all key comparisons in tree traversal (upper IN routing
    /// and BIN entry search/insert/delete) use this comparator instead of
    /// lexicographic byte comparison.
    ///
    /// JE reference: `DatabaseImpl.btreeComparator` / `dupComparator`, stored
    /// on the database and consulted at every `IN.findEntry()` call.
    pub key_comparator: Option<KeyComparatorFn>,

    /// Shared memory counter for the evictor / MemoryBudget.
    ///
    /// Updated on every BIN entry insert (+key+data+overhead) and delete
    /// (-key+overhead) so the evictor sees real cache pressure.
    ///
    /// Mirrors the JE `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
    /// call in the equivalent `IN.updateMemorySize()`. In Noxu the counter is
    /// an `Arc<AtomicI64>` shared with the `Arbiter` (and later
    /// `MemoryBudget`) to avoid a circular crate dependency
    /// (`noxu-tree` → `noxu-dbi`).
    pub memory_counter: Option<Arc<AtomicI64>>,

    /// Optional listener fed on node add/access/remove, mirroring JE's
    /// `INList` feeding the evictor's `LRUList`s.
    ///
    /// When `None` (the default — used by unit tests with no environment),
    /// the notifications are no-ops. `EnvironmentImpl` installs the
    /// `Evictor` here so production inserts/accesses populate the LRU lists
    /// the evictor drains.
    ///
    /// JE reference: `IN.fetchTarget`/split/`rebuildINList` → `addBack`,
    /// access → `moveBack`, removal → `remove`.
    pub in_list_listener: Option<Arc<dyn InListListener>>,

    /// Capacity hint for the recovery redo path.
    ///
    /// When non-zero, the first BIN created by `redo_insert` (the first-key
    /// path) pre-allocates its `entries` Vec with this capacity so that
    /// redo insertions proceed without Vec-resize doublings. The value is
    /// clamped to `max_entries_per_node` at use.
    ///
    /// Set by `hint_redo_capacity` before the redo loop.
    /// Wave 11-K optimisation (Fix 3).
    redo_capacity_hint: usize,

    /// Whether key-prefix compression is enabled for this tree's BINs.
    ///
    /// JE `DatabaseImpl.getKeyPrefixing()` / `DatabaseConfig.setKeyPrefixing()`.
    /// When `false`, `IN.computeKeyPrefix` returns `null` in JE — no prefix
    /// is ever set. Noxu mirrors this: `insert_with_prefix` is skipped in
    /// favour of `insert_raw`, and `recompute_key_prefix` is not called on
    /// BIN halves after a split.
    ///
    /// Default: `false` (matches JE's `DatabaseConfig.KEY_PREFIXING_DEFAULT`).
    ///
    /// Ref: `IN.java computeKeyPrefix` ~line 2456.
    pub key_prefixing: bool,
}
224
/// A node in the tree.
///
/// `TreeNode` wraps either an upper IN or a BIN. Each variant carries a
/// lightweight stub whose fields mirror the persistent IN/BIN structure. The
/// stubs will be replaced with full InNode/Bin types as the implementation
/// matures; the API surface here is intentionally minimal.
#[derive(Debug)]
pub enum TreeNode {
    /// Internal Node (IN) - non-leaf node in the tree.
    Internal(InNodeStub),

    /// Bottom Internal Node (BIN) - leaf-level internal node.
    Bottom(BinStub),
}
239
/// Lightweight upper-IN representation used by the tree traversal layer.
///
/// Mirrors JE `IN`: carries the dirty flag (IN_DIRTY_BIT), the LRU
/// generation counter, and a weak back-pointer to the parent so that
/// dirty state can be propagated upward.
#[derive(Debug)]
pub struct InNodeStub {
    /// Node ID.
    pub node_id: u64,
    /// Level in tree.
    pub level: i32,
    /// Child entries (key, lsn, optional child).
    pub entries: Vec<InEntry>,
    /// Dirty flag — set whenever this node is modified.
    /// JE `IN.dirty` (IN_DIRTY_BIT).
    pub dirty: bool,
    /// LRU generation counter for the evictor.
    /// JE `IN.generation`.
    pub generation: u64,
    /// Weak back-pointer to parent IN (`None` when no parent is recorded).
    /// Enables dirty-propagation and latch-coupling validation.
    /// JE `IN.parent` reference used during splits and logging.
    pub parent: Option<Weak<RwLock<TreeNode>>>,
}
264
/// Entry in an IN node: one routing slot pointing at a child node.
#[derive(Debug, Clone)]
pub struct InEntry {
    /// Key for this entry.
    pub key: Vec<u8>,
    /// LSN where child is stored.
    pub lsn: Lsn,
    /// Cached child node (if resident); `None` when the child is not held
    /// in memory by this slot.
    pub child: Option<Arc<RwLock<TreeNode>>>,
}
275
/// Lightweight BIN representation used by the tree traversal layer.
///
/// Mirrors JE `BIN` (which extends `IN`): carries the dirty flag, LRU
/// generation counter, and a weak back-pointer to the parent IN.
///
/// # Key Prefix Compression
///
/// BINs support key prefix compression. When
/// `key_prefix` is non-empty the `key` field of every `BinEntry` stores only
/// the *suffix* — the bytes after stripping the common leading bytes. The
/// full key is reconstructed by prepending `key_prefix` to the stored suffix.
///
/// This is transparent to callers through the `get_full_key` /
/// `find_entry_compressed` helpers on `BinStub`. `insert_with_prefix`
/// narrows the prefix when an incoming key does not share it and tries to
/// establish one when none is active; `recompute_key_prefix` rebuilds it
/// from scratch (e.g. after a split).
#[derive(Debug)]
pub struct BinStub {
    /// Node ID.
    pub node_id: u64,
    /// Level (always BIN_LEVEL).
    pub level: i32,
    /// Entries. When `key_prefix` is non-empty the `key` field in each entry
    /// is the *suffix* of the full key (leading `key_prefix` bytes stripped).
    /// JE `IN.entryKeys` (suffix-only storage when prefixing is on).
    pub entries: Vec<BinEntry>,
    /// Common prefix shared by every key in this BIN.
    /// Empty means no prefix compression is active.
    /// JE `IN.keyPrefix`.
    pub key_prefix: Vec<u8>,
    /// Dirty flag — set whenever this BIN is modified.
    /// JE `IN.dirty` (IN_DIRTY_BIT).
    pub dirty: bool,
    /// BIN-delta flag — true when this BIN contains only dirty (delta) slots
    /// rather than a complete set of entries.
    /// JE `IN.IN_DELTA_BIT` (the IN_DELTA_BIT flag inside `flags`).
    pub is_delta: bool,
    /// LSN at which this BIN was last logged as a full (non-delta) BIN.
    ///
    /// Used by the checkpoint path to construct `BINDeltaLogEntry.prev_full_lsn`
    /// and to compare against `prev_delta_lsn` when deciding whether to write
    /// a delta or a full BIN. `should_log_delta` refuses a delta while this
    /// is `NULL_LSN`.
    ///
    /// JE `BIN.lastFullLsn`.
    pub last_full_lsn: Lsn,
    /// LSN at which this BIN was last logged as a BIN-delta.
    ///
    /// Written as `prev_delta_lsn` into the next `BINDeltaLogEntry` so the
    /// cleaner's utilization tracker can mark the superseded delta obsolete.
    /// Reset to `NULL_LSN` whenever a full BIN is written.
    ///
    /// JE `BIN.lastDeltaVersion` / `BIN.getLastDeltaLsn()`.
    pub last_delta_lsn: Lsn,
    /// LRU generation counter for the evictor.
    /// JE `IN.generation`.
    pub generation: u64,
    /// Weak back-pointer to parent IN.
    /// Enables dirty-propagation and latch-coupling validation.
    pub parent: Option<Weak<RwLock<TreeNode>>>,
    /// If true, `BinEntry.expiration_time` values in this BIN are packed hours
    /// since epoch; if false, they are packed seconds since epoch.
    ///
    /// Default: `true` (hours, matching TTL resolution).
    ///
    /// JE `BIN.expirationInHours`.
    pub expiration_in_hours: bool,
    /// Number of cursors currently positioned on this BIN.
    ///
    /// The evictor skips BINs with a non-zero cursor count to avoid evicting
    /// a node that a cursor is actively traversing. CursorImpl increments
    /// this when positioning on a BIN and decrements it on reposition/close.
    /// `strip_lns` also declines to drop slot data while this is positive.
    ///
    /// JE `IN.cursorSet.size()` used by `Evictor.selectIN()`.
    pub cursor_count: i32,
    /// When true, the NEXT log of this BIN must be a full BIN, not a delta.
    ///
    /// Set after a dirty slot is removed (a delta would silently lose that
    /// removal) and cleared after a full BIN is written. This is the
    /// delta-chain bound: it forces a periodic full BIN so a delta never
    /// references stale state.
    ///
    /// JE `IN.prohibitNextDelta` / `IN.setProhibitNextDelta` (IN.java:5013) /
    /// `IN.getProhibitNextDelta`.
    pub prohibit_next_delta: bool,
}
360
/// Entry in a BIN node: one slot holding a key, the LSN of its LN, and
/// optionally the record bytes plus per-slot state flags.
#[derive(Debug, Clone)]
pub struct BinEntry {
    /// Key for this entry. When the owning `BinStub.key_prefix` is non-empty
    /// this stores only the suffix (bytes after the prefix is stripped).
    pub key: Vec<u8>,
    /// LSN where LN is stored.
    pub lsn: Lsn,
    /// Optional embedded data (for small records) or cached LN.
    pub data: Option<Vec<u8>>,
    /// True when this slot has been marked known-deleted (analogous to the
    /// KNOWN_DELETED_BIT in `IN.entryStates`). The slot is eligible for
    /// removal by `compress_bin()`.
    pub known_deleted: bool,
    /// True when this slot has been modified since the last full BIN log write.
    ///
    /// JE `IN.entryStates[i] & IN_DIRTY_BIT`. Used by the checkpoint
    /// path to decide whether to write a BIN-delta (few dirty slots) or a
    /// full BIN (many dirty slots).
    pub dirty: bool,
    /// Packed expiration time (0 = no expiration).
    ///
    /// When the owning `BinStub.expiration_in_hours` is true, this value is
    /// hours since Unix epoch; otherwise it is seconds since Unix epoch.
    ///
    /// JE `IN.entryExpiration`.
    pub expiration_time: u32,
}
389
390impl BinStub {
391 /// TREE-F1: the single user-facing liveness predicate for a BIN slot.
392 ///
393 /// A slot is LIVE for reads/scans iff it is neither `known_deleted` nor
394 /// TTL-expired. This mirrors the two ways JE makes a slot read as ABSENT:
395 /// * `IN.findEntry` (IN.java:3197) returns -1 for a `known_deleted`
396 /// exact match;
397 /// * `CursorImpl.isProbablyExpired` / `lockAndGetCurrent`
398 /// (CursorImpl.java:2062-2064) skip `isEntryKnownDeleted` (and
399 /// expired) slots while stepping.
400 ///
401 /// KD slots legitimately exist in live BINs during BIN-delta
402 /// reconstitution until the compressor reclaims them; the maintenance
403 /// paths (compressor / recovery undo) iterate them on purpose and do NOT
404 /// use this predicate.
405 #[inline]
406 pub fn slot_is_live(&self, idx: usize) -> bool {
407 match self.entries.get(idx) {
408 Some(e) => {
409 !(e.known_deleted
410 || (e.expiration_time != 0
411 && noxu_util::ttl::is_expired(
412 e.expiration_time,
413 self.expiration_in_hours,
414 )))
415 }
416 None => false,
417 }
418 }
419
420 // ========================================================================
421 // Key prefix compression helpers
422 // IN.computeKeyPrefix / IN.recalcSuffixes / IN.getKey
423 // ========================================================================
424
425 /// Strips embedded LN data from non-dirty slots, freeing the heap
426 /// allocations of the per-slot value bytes while keeping the slot keys
427 /// and LSNs addressable. Used by the evictor's PartialEvict path: a
428 /// hot BIN is kept in cache so its descent path stays warm, but the LN
429 /// data is dropped to make room for hotter content. Subsequent reads
430 /// re-fetch the data from the log via the slot LSN.
431 ///
432 /// Skips slots that are still dirty (their data has not been written
433 /// to the log yet, so dropping the in-memory copy would lose the
434 /// update). Returns the number of bytes freed (sum of the lengths
435 /// of the dropped `Vec<u8>` data fields).
436 ///
437 /// Returns 0 if the BIN has any open cursors (the cursor may be
438 /// reading the data right now).
439 pub fn strip_lns(&mut self) -> usize {
440 if self.cursor_count > 0 {
441 return 0;
442 }
443 let mut freed = 0usize;
444 for entry in &mut self.entries {
445 if entry.dirty {
446 continue;
447 }
448 if let Some(data) = entry.data.take() {
449 freed = freed.saturating_add(data.len());
450 }
451 }
452 freed
453 }
454
455 /// Reconstruct the full key for slot `idx` by prepending the BIN's
456 /// current prefix to the stored suffix.
457 ///
458 /// `IN.getKey(int idx)`.
459 pub fn get_full_key(&self, idx: usize) -> Option<Vec<u8>> {
460 let suffix = self.entries.get(idx)?.key.as_slice();
461 if self.key_prefix.is_empty() {
462 Some(suffix.to_vec())
463 } else {
464 let mut full =
465 Vec::with_capacity(self.key_prefix.len() + suffix.len());
466 full.extend_from_slice(&self.key_prefix);
467 full.extend_from_slice(suffix);
468 Some(full)
469 }
470 }
471
472 /// Decompress a stored suffix back to a full key.
473 ///
474 /// `IN.getKey` used from outside: prepend `key_prefix` to
475 /// `suffix`. If `key_prefix` is empty the suffix *is* the full key.
476 pub fn decompress_key(&self, suffix: &[u8]) -> Vec<u8> {
477 if self.key_prefix.is_empty() {
478 suffix.to_vec()
479 } else {
480 let mut full =
481 Vec::with_capacity(self.key_prefix.len() + suffix.len());
482 full.extend_from_slice(&self.key_prefix);
483 full.extend_from_slice(suffix);
484 full
485 }
486 }
487
488 /// Strip the current prefix from a full key to obtain the stored suffix.
489 ///
490 /// `IN.computeKeySuffix(byte[] prefix, byte[] key)`.
491 ///
492 /// # Panics
493 /// Panics (debug only) if `full_key` does not start with `key_prefix`.
494 pub fn compress_key(&self, full_key: &[u8]) -> Vec<u8> {
495 let plen = self.key_prefix.len();
496 if plen == 0 {
497 full_key.to_vec()
498 } else {
499 debug_assert!(
500 full_key.starts_with(&self.key_prefix),
501 "compress_key: key does not start with current prefix"
502 );
503 full_key[plen..].to_vec()
504 }
505 }
506
507 /// Compute the longest common prefix of all full keys currently in this
508 /// BIN, optionally excluding the entry at `exclude_idx` (used during
509 /// insertions to ignore the slot that is about to be replaced).
510 ///
511 /// Returns an empty `Vec` if the BIN has fewer than 2 entries or if the
512 /// keys share no common leading bytes.
513 ///
514 /// `IN.computeKeyPrefix(int excludeIdx)`.
515 pub fn compute_key_prefix(&self, exclude_idx: Option<usize>) -> Vec<u8> {
516 // Need at least 2 entries to find a common prefix.
517 let n = self.entries.len();
518 if n < 2 {
519 return Vec::new();
520 }
521
522 // Pick the first non-excluded index as the seed.
523 let first_idx = match exclude_idx {
524 Some(0) => 1,
525 _ => 0,
526 };
527
528 // The current prefix_len is taken from the seed full key.
529 let seed_full = match self.get_full_key(first_idx) {
530 Some(k) => k,
531 None => return Vec::new(),
532 };
533 let mut prefix_len = seed_full.len();
534
535 // Compare every other non-excluded entry against the running prefix.
536 // Iterate all entries (byteOrdered disabled in too).
537 for i in (first_idx + 1)..n {
538 if let Some(ex) = exclude_idx
539 && i == ex
540 {
541 continue;
542 }
543 let full_key = match self.get_full_key(i) {
544 Some(k) => k,
545 None => continue,
546 };
547 let new_len =
548 get_key_prefix_length(&seed_full[..prefix_len], &full_key);
549 if new_len < prefix_len {
550 prefix_len = new_len;
551 }
552 if prefix_len == 0 {
553 return Vec::new();
554 }
555 }
556
557 seed_full[..prefix_len].to_vec()
558 }
559
560 /// Recompute the key prefix from scratch and re-encode every stored suffix.
561 ///
562 /// Call this after bulk inserts, splits, or merges.
563 ///
564 /// `IN.recalcKeyPrefix()` → `IN.recalcSuffixes(newPrefix, …)`.
565 pub fn recompute_key_prefix(&mut self) {
566 let new_prefix = self.compute_key_prefix(None);
567 self.apply_new_prefix(new_prefix);
568 }
569
570 /// Apply `new_prefix` as the BIN's key prefix, re-encoding all stored
571 /// suffixes from the old prefix into the new one.
572 ///
573 /// This is the Rust.
574 fn apply_new_prefix(&mut self, new_prefix: Vec<u8>) {
575 // Reconstruct all full keys (using old prefix), then re-encode with
576 // the new prefix.
577 let full_keys: Vec<Vec<u8>> = (0..self.entries.len())
578 .map(|i| self.get_full_key(i).unwrap_or_default())
579 .collect();
580
581 self.key_prefix = new_prefix;
582
583 for (i, full_key) in full_keys.into_iter().enumerate() {
584 self.entries[i].key = self.compress_key(&full_key);
585 }
586 }
587
588 /// Binary-search this BIN for `full_key` (a full, uncompressed key).
589 ///
590 /// The stored suffixes are compared after stripping the current prefix
591 /// from `full_key`, so the search is done entirely in suffix-space — no
592 /// heap allocation needed in the happy path.
593 ///
594 /// Returns `(idx, exact)` where:
595 /// - `idx` is the slot index (or insertion point when `exact == false`).
596 /// - `exact` is `true` when an exact match was found.
597 ///
598 /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
599 pub fn find_entry_compressed(&self, full_key: &[u8]) -> (usize, bool) {
600 let plen = self.key_prefix.len();
601 // Check that the key shares the current prefix; if not it cannot be
602 // present and we return the appropriate insertion point.
603 if plen > 0
604 && (full_key.len() < plen
605 || &full_key[..plen] != self.key_prefix.as_slice())
606 {
607 // The key does not share the current prefix.
608 // Determine insertion point using full-key comparison.
609 let pos = self.entries.partition_point(|e| {
610 self.decompress_key(&e.key).as_slice() < full_key
611 });
612 return (pos, false);
613 }
614 let suffix = &full_key[plen..];
615 match self.entries.binary_search_by(|e| e.key.as_slice().cmp(suffix)) {
616 Ok(idx) => (idx, true),
617 Err(idx) => (idx, false),
618 }
619 }
620
621 /// Insert or update a full (uncompressed) key in this BIN.
622 ///
623 /// After insertion the key prefix is recomputed; if the prefix changes all
624 /// stored suffixes are re-encoded.
625 ///
626 /// Returns `(slot_index, is_new_insert)`.
627 ///
628 /// `IN.setKey` / BIN insert path.
629 pub fn insert_with_prefix(
630 &mut self,
631 full_key: Vec<u8>,
632 lsn: Lsn,
633 data: Option<Vec<u8>>,
634 ) -> (usize, bool) {
635 // Is the current prefix still compatible with this key?
636 let plen = self.key_prefix.len();
637 let new_len = if plen > 0 {
638 get_key_prefix_length(&self.key_prefix, &full_key)
639 } else {
640 0
641 };
642
643 // If the new key shrinks the prefix we must re-encode everything first.
644 if plen > 0 && new_len < plen {
645 // Compute new prefix considering the incoming key and
646 // all existing full keys. We pass `None` for exclude_idx because
647 // the slot for this key does not yet exist.
648 let mut candidate = self.compute_key_prefix(None);
649 // Also constrain by the new key itself.
650 if !candidate.is_empty() {
651 let cl = get_key_prefix_length(&candidate, &full_key);
652 candidate.truncate(cl);
653 } else {
654 // No existing prefix; try to build one from the new key
655 // against the existing full keys.
656 if !self.entries.is_empty()
657 && let Some(first_full) = self.get_full_key(0)
658 {
659 candidate = create_key_prefix(&first_full, &full_key)
660 .unwrap_or_default();
661 for i in 1..self.entries.len() {
662 if candidate.is_empty() {
663 break;
664 }
665 if let Some(fk) = self.get_full_key(i) {
666 let l = get_key_prefix_length(&candidate, &fk);
667 candidate.truncate(l);
668 }
669 }
670 }
671 }
672 self.apply_new_prefix(candidate);
673 }
674
675 // Compress the new key under the (possibly updated) prefix.
676 let suffix = self.compress_key(&full_key);
677
678 match self.entries.binary_search_by(|e| e.key.as_slice().cmp(&suffix)) {
679 Ok(idx) => {
680 // Key exists — update in place.
681 self.entries[idx].lsn = lsn;
682 self.entries[idx].data = data;
683 // Mark slot dirty: this slot changed since the last full BIN log.
684 // `IN.setDirtyEntry(idx)`.
685 self.entries[idx].dirty = true;
686 (idx, false)
687 }
688 Err(idx) => {
689 // New key — insert in sorted position.
690 // New slots start dirty: they have never been logged in any BIN.
691 // `IN.setDirtyEntry(idx)` called after `insertEntry`.
692 self.entries.insert(
693 idx,
694 BinEntry {
695 key: suffix,
696 lsn,
697 data,
698 known_deleted: false,
699 dirty: true,
700 expiration_time: 0,
701 },
702 );
703 // After insertion, if there is no prefix yet, try to establish one.
704 if self.key_prefix.is_empty() && self.entries.len() >= 2 {
705 self.recompute_key_prefix();
706 }
707 (idx, true)
708 }
709 }
710 }
711
712 /// Slice-based variant of [`BinStub::insert_with_prefix`] for the recovery redo path.
713 ///
714 /// Accepts `key` and `data` as `&[u8]` slices instead of owned `Vec<u8>`,
715 /// eliminating the intermediate `Vec<u8>` that `redo_ln` would otherwise
716 /// allocate before crossing the BIN boundary. The compressed suffix and
717 /// the data bytes are each copied into the `BinEntry` exactly once.
718 ///
719 /// Semantics are identical to `insert_with_prefix`:
720 /// - Updates the slot in place when the key already exists.
721 /// - Inserts a new sorted entry when absent, recomputing the key prefix.
722 ///
723 /// Wave 11-K optimisation (Fix 1).
724 pub fn insert_with_prefix_slice(
725 &mut self,
726 full_key: &[u8],
727 lsn: Lsn,
728 data: Option<&[u8]>,
729 ) -> (usize, bool) {
730 let plen = self.key_prefix.len();
731 let new_len = if plen > 0 {
732 get_key_prefix_length(&self.key_prefix, full_key)
733 } else {
734 0
735 };
736
737 if plen > 0 && new_len < plen {
738 let mut candidate = self.compute_key_prefix(None);
739 if !candidate.is_empty() {
740 let cl = get_key_prefix_length(&candidate, full_key);
741 candidate.truncate(cl);
742 } else {
743 if !self.entries.is_empty()
744 && let Some(first_full) = self.get_full_key(0)
745 {
746 candidate = create_key_prefix(&first_full, full_key)
747 .unwrap_or_default();
748 for i in 1..self.entries.len() {
749 if candidate.is_empty() {
750 break;
751 }
752 if let Some(fk) = self.get_full_key(i) {
753 let l = get_key_prefix_length(&candidate, &fk);
754 candidate.truncate(l);
755 }
756 }
757 }
758 }
759 self.apply_new_prefix(candidate);
760 }
761
762 let suffix = self.compress_key(full_key);
763
764 match self.entries.binary_search_by(|e| e.key.as_slice().cmp(&suffix)) {
765 Ok(idx) => {
766 self.entries[idx].lsn = lsn;
767 self.entries[idx].data = data.map(|d| d.to_vec());
768 self.entries[idx].dirty = true;
769 (idx, false)
770 }
771 Err(idx) => {
772 self.entries.insert(
773 idx,
774 BinEntry {
775 key: suffix,
776 lsn,
777 data: data.map(|d| d.to_vec()),
778 known_deleted: false,
779 dirty: true,
780 expiration_time: 0,
781 },
782 );
783 if self.key_prefix.is_empty() && self.entries.len() >= 2 {
784 self.recompute_key_prefix();
785 }
786 (idx, true)
787 }
788 }
789 }
790
791 /// Returns the number of slots that are marked dirty.
792 ///
793 /// `BIN.getNumDirtyEntries()`.
794 pub fn dirty_count(&self) -> usize {
795 self.entries.iter().filter(|e| e.dirty).count()
796 }
797
798 /// Decide whether to log this BIN as a delta (true) or a full BIN (false).
799 ///
800 /// Faithful port of JE `BIN.shouldLogDelta()` (BIN.java:1892). The
801 /// decision is COUNT-based (number of would-be delta slots vs a percent of
802 /// `nEntries`), NOT a dirty-fraction-vs-hardcoded-0.25 heuristic:
803 ///
804 /// ```text
805 /// if (isBINDelta()) { return true; } // already a delta
806 /// if (isDeltaProhibited()) return false; // prohibit / no prior full
807 /// numDeltas = getNDeltas();
808 /// if (numDeltas <= 0) return false; // empty delta is invalid
809 /// deltaLimit = (getNEntries() * binDeltaPercent) / 100; // INTEGER math
810 /// return numDeltas <= deltaLimit;
811 /// ```
812 ///
813 /// `numDeltas` (JE `getNDeltas`) is the count of slots that would appear in
814 /// the delta — i.e. the dirty slots since the last full BIN — which here is
815 /// `dirty_count()`. `binDeltaPercent` is the CONFIGURABLE `TREE_BIN_DELTA`
816 /// param (JE `DatabaseImpl.getBinDeltaPercent()`, default 25), threaded in
817 /// by the checkpointer — NOT a hardcoded constant.
818 ///
819 /// `isDeltaProhibited()` (BIN.java:1867) is
820 /// `getProhibitNextDelta() || isDeferredWriteMode() || lastFullLsn == NULL`.
821 /// Deferred-write mode is not modelled in the runtime stub; the other two
822 /// terms are.
823 ///
824 /// JE ref: `BIN.shouldLogDelta` (BIN.java:1892), `BIN.isDeltaProhibited`
825 /// (BIN.java:1867).
826 pub fn should_log_delta(&self, bin_delta_percent: i32) -> bool {
827 // Already a delta: re-log as a delta. JE asserts !prohibitNextDelta
828 // and lastFullLsn != NULL here.
829 if self.is_delta {
830 return self.last_full_lsn != NULL_LSN && !self.prohibit_next_delta;
831 }
832
833 // isDeltaProhibited(): cheapest checks first.
834 if self.prohibit_next_delta || self.last_full_lsn == NULL_LSN {
835 return false;
836 }
837
838 // numDeltas = getNDeltas(): the dirty slots that would be in the delta.
839 let num_deltas = self.dirty_count() as i32;
840
841 // A delta with zero items is not valid.
842 if num_deltas <= 0 {
843 return false;
844 }
845
846 // Configured BinDeltaPercent limit — INTEGER math, exactly as JE.
847 let delta_limit = (self.entries.len() as i32 * bin_delta_percent) / 100;
848 num_deltas <= delta_limit
849 }
850
851 /// Comparator-aware binary search: finds `full_key` using `cmp`.
852 ///
853 /// Unlike `find_entry_compressed` (which uses suffix-based lexicographic
854 /// comparison), this decompresses each entry's key to its full form and
855 /// applies the provided comparator — required for sorted-dup databases
856 /// where lexicographic suffix comparison would give wrong results when
857 /// different-length primary keys are in the same BIN.
858 ///
859 /// Returns `(idx, exact)`. Does NOT do prefix compression.
860 ///
861 /// `IN.findEntry` with btreeComparator active.
862 pub fn find_entry_cmp(
863 &self,
864 full_key: &[u8],
865 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
866 ) -> (usize, bool) {
867 // Hot path: avoid per-comparison Vec<u8> allocation.
868 // When key_prefix is empty the stored suffix IS the full key, so we
869 // pass the suffix slice directly. When prefix is non-empty we build a
870 // temporary concatenation only once per comparison using a small
871 // stack-local Vec that is dropped immediately after the call — this
872 // still allocates but is limited to O(key_len) bytes per call and
873 // avoids retaining any heap state between comparisons.
874 if self.key_prefix.is_empty() {
875 match self
876 .entries
877 .binary_search_by(|e| cmp(e.key.as_slice(), full_key))
878 {
879 Ok(idx) => (idx, true),
880 Err(idx) => (idx, false),
881 }
882 } else {
883 let prefix = self.key_prefix.as_slice();
884 match self.entries.binary_search_by(|e| {
885 let mut fk = Vec::with_capacity(prefix.len() + e.key.len());
886 fk.extend_from_slice(prefix);
887 fk.extend_from_slice(&e.key);
888 cmp(&fk, full_key)
889 }) {
890 Ok(idx) => (idx, true),
891 Err(idx) => (idx, false),
892 }
893 }
894 }
895
896 /// Returns the LSN of the slot matching `full_key`, if one exists.
897 ///
898 /// Used by the recovery LN-redo apply to enforce JE's currency check
899 /// (`RecoveryManager.redo()` line ~2512): a logged LN is applied only
900 /// when `logrecLsn > treeLsn`. Returns `None` when the key is absent
901 /// (always apply). Uses the same lookup variant the matching insert
902 /// path uses so the comparison is over the right slot.
903 pub fn redo_slot_lsn(
904 &self,
905 full_key: &[u8],
906 cmp: Option<&dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering>,
907 key_prefixing: bool,
908 ) -> Option<Lsn> {
909 let (idx, found) = match cmp {
910 Some(c) => self.find_entry_cmp(full_key, c),
911 None if key_prefixing => self.find_entry_compressed(full_key),
912 None => {
913 // insert_raw path: full keys stored verbatim.
914 match self
915 .entries
916 .binary_search_by(|e| e.key.as_slice().cmp(full_key))
917 {
918 Ok(idx) => (idx, true),
919 Err(idx) => (idx, false),
920 }
921 }
922 };
923 if found { Some(self.entries[idx].lsn) } else { None }
924 }
925
926 /// Raw insert (no prefix compression) for databases with
927 /// `key_prefixing = false`.
928 ///
929 /// JE `IN.computeKeyPrefix` returns `null` when
930 /// `databaseImpl.getKeyPrefixing()` is `false`, so no prefix is ever
931 /// set on those BINs. Noxu was previously ignoring the flag and always
932 /// calling `insert_with_prefix`; this method provides the faithful path.
933 ///
934 /// The key is stored verbatim (no suffix stripping). An existing
935 /// `key_prefix` on the BIN is left untouched; callers must ensure it is
936 /// empty (split_child already guarantees this for new BINs when
937 /// `key_prefixing = false`).
938 ///
939 /// Returns `(slot_index, is_new_insert)`.
940 ///
941 /// Ref: `IN.java computeKeyPrefix` ~line 2456,
942 /// `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
943 pub fn insert_raw(
944 &mut self,
945 full_key: Vec<u8>,
946 lsn: Lsn,
947 data: Option<Vec<u8>>,
948 ) -> (usize, bool) {
949 // Binary search on the stored (full) keys.
950 match self.entries.binary_search_by(|e| {
951 // When key_prefix is empty entries store full keys directly.
952 // If somehow a prefix exists (shouldn't happen for key_prefixing
953 // DBs), reconstruct. ponytail: no prefix expected here — if we
954 // see one it is a configuration bug, not a data-path concern.
955 let stored: &[u8] = if self.key_prefix.is_empty() {
956 &e.key
957 } else {
958 // fallback: compare as if prefix is empty (best effort)
959 &e.key
960 };
961 stored.cmp(full_key.as_slice())
962 }) {
963 Ok(idx) => {
964 self.entries[idx].lsn = lsn;
965 self.entries[idx].data = data;
966 self.entries[idx].dirty = true;
967 (idx, false)
968 }
969 Err(idx) => {
970 self.entries.insert(
971 idx,
972 BinEntry {
973 key: full_key,
974 lsn,
975 data,
976 known_deleted: false,
977 dirty: true,
978 expiration_time: 0,
979 },
980 );
981 (idx, true)
982 }
983 }
984 }
985
986 /// Comparator-aware insert: inserts `full_key` into the BIN using `cmp`.
987 ///
988 /// Prefix compression is DISABLED: the key is stored as-is. This is
989 /// intentional for sorted-dup databases where the custom comparator
990 /// requires full-key access at every comparison.
991 ///
992 /// Returns `(slot_index, is_new_insert)`.
993 ///
994 pub fn insert_cmp(
995 &mut self,
996 full_key: Vec<u8>,
997 lsn: Lsn,
998 data: Option<Vec<u8>>,
999 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1000 ) -> (usize, bool) {
1001 if self.key_prefix.is_empty() {
1002 match self
1003 .entries
1004 .binary_search_by(|e| cmp(e.key.as_slice(), &full_key))
1005 {
1006 Ok(idx) => {
1007 self.entries[idx].lsn = lsn;
1008 self.entries[idx].data = data;
1009 self.entries[idx].dirty = true;
1010 (idx, false)
1011 }
1012 Err(idx) => {
1013 self.entries.insert(
1014 idx,
1015 BinEntry {
1016 key: full_key,
1017 lsn,
1018 data,
1019 known_deleted: false,
1020 dirty: true,
1021 expiration_time: 0,
1022 },
1023 );
1024 (idx, true)
1025 }
1026 }
1027 } else {
1028 let prefix = self.key_prefix.clone();
1029 match self.entries.binary_search_by(|e| {
1030 let mut fk = Vec::with_capacity(prefix.len() + e.key.len());
1031 fk.extend_from_slice(&prefix);
1032 fk.extend_from_slice(&e.key);
1033 cmp(&fk, &full_key)
1034 }) {
1035 Ok(idx) => {
1036 // Key exists — update in place.
1037 self.entries[idx].lsn = lsn;
1038 self.entries[idx].data = data;
1039 self.entries[idx].dirty = true;
1040 (idx, false)
1041 }
1042 Err(idx) => {
1043 // New key — insert at sorted position (no prefix compression).
1044 self.entries.insert(
1045 idx,
1046 BinEntry {
1047 key: full_key,
1048 lsn,
1049 data,
1050 known_deleted: false,
1051 dirty: true,
1052 expiration_time: 0,
1053 },
1054 );
1055 (idx, true)
1056 }
1057 }
1058 }
1059 }
1060
1061 /// Comparator-aware delete: removes `full_key` from the BIN using `cmp`.
1062 ///
1063 /// Returns `true` if the entry was found and removed.
1064 pub fn delete_cmp(
1065 &mut self,
1066 full_key: &[u8],
1067 cmp: &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering,
1068 ) -> bool {
1069 let result = if self.key_prefix.is_empty() {
1070 self.entries.binary_search_by(|e| cmp(e.key.as_slice(), full_key))
1071 } else {
1072 let prefix = self.key_prefix.clone();
1073 self.entries.binary_search_by(|e| {
1074 let mut fk = Vec::with_capacity(prefix.len() + e.key.len());
1075 fk.extend_from_slice(&prefix);
1076 fk.extend_from_slice(&e.key);
1077 cmp(&fk, full_key)
1078 })
1079 };
1080 match result {
1081 Ok(idx) => {
1082 self.entries.remove(idx);
1083 self.dirty = true;
1084 true
1085 }
1086 Err(_) => false,
1087 }
1088 }
1089
1090 /// Serialise ALL entries (full BIN write).
1091 ///
1092 /// Format (per slot): key_len(u32BE) | key | lsn(u64BE) |
1093 /// has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
1094 ///
1095 /// Prepended by: node_id(u64BE) | num_entries(u32BE).
1096 ///
1097 /// `BIN.writeToLog()` (non-delta path).
1098 pub fn serialize_full(&self) -> Vec<u8> {
1099 let mut buf = Vec::new();
1100 buf.extend_from_slice(&self.node_id.to_be_bytes());
1101 buf.extend_from_slice(&(self.entries.len() as u32).to_be_bytes());
1102 for i in 0..self.entries.len() {
1103 let full_key = self.get_full_key(i).unwrap_or_default();
1104 buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
1105 buf.extend_from_slice(&full_key);
1106 let e = &self.entries[i];
1107 buf.extend_from_slice(&e.lsn.as_u64().to_be_bytes());
1108 if let Some(d) = &e.data {
1109 buf.push(1u8);
1110 buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
1111 buf.extend_from_slice(d);
1112 } else {
1113 buf.push(0u8);
1114 }
1115 buf.push(e.known_deleted as u8);
1116 }
1117 buf
1118 }
1119
1120 /// Serialise only dirty slots (BIN-delta write).
1121 ///
1122 /// Format (per dirty slot): slot_idx(u32BE) | key_len(u32BE) | key |
1123 /// lsn(u64BE) | has_data(u8) | data_len(u32BE) | data | known_deleted(u8)
1124 ///
1125 /// Prepended by: node_id(u64BE) | num_dirty(u32BE).
1126 ///
1127 /// `BIN.writeToLog()` (delta path).
1128 pub fn serialize_delta(&self) -> Vec<u8> {
1129 let dirty: Vec<usize> = (0..self.entries.len())
1130 .filter(|&i| self.entries[i].dirty)
1131 .collect();
1132 let mut buf = Vec::new();
1133 buf.extend_from_slice(&self.node_id.to_be_bytes());
1134 buf.extend_from_slice(&(dirty.len() as u32).to_be_bytes());
1135 for idx in dirty {
1136 buf.extend_from_slice(&(idx as u32).to_be_bytes());
1137 let full_key = self.get_full_key(idx).unwrap_or_default();
1138 buf.extend_from_slice(&(full_key.len() as u32).to_be_bytes());
1139 buf.extend_from_slice(&full_key);
1140 let e = &self.entries[idx];
1141 buf.extend_from_slice(&e.lsn.as_u64().to_be_bytes());
1142 if let Some(d) = &e.data {
1143 buf.push(1u8);
1144 buf.extend_from_slice(&(d.len() as u32).to_be_bytes());
1145 buf.extend_from_slice(d);
1146 } else {
1147 buf.push(0u8);
1148 }
1149 buf.push(e.known_deleted as u8);
1150 }
1151 buf
1152 }
1153
1154 /// Deserialise a full BIN from the bytes produced by `serialize_full()`.
1155 ///
1156 /// Returns a `BinStub` with all entries populated and all slots marked
1157 /// clean (they are already on disk at `last_full_lsn`). Returns `None`
1158 /// if the byte slice is malformed.
1159 ///
1160 /// `INLogEntry.readEntry()` / `IN.readFromLog()` (non-delta).
1161 pub fn deserialize_full(bytes: &[u8]) -> Option<BinStub> {
1162 if bytes.len() < 12 {
1163 return None;
1164 }
1165 let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
1166 let num_entries =
1167 u32::from_be_bytes(bytes[8..12].try_into().ok()?) as usize;
1168 let mut pos = 12usize;
1169 let mut entries = Vec::with_capacity(num_entries);
1170 for _ in 0..num_entries {
1171 // key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len(u32BE) | data] | known_deleted(u8)
1172 if pos + 4 > bytes.len() {
1173 return None;
1174 }
1175 let key_len =
1176 u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
1177 as usize;
1178 pos += 4;
1179 if pos + key_len > bytes.len() {
1180 return None;
1181 }
1182 let key = bytes[pos..pos + key_len].to_vec();
1183 pos += key_len;
1184 if pos + 8 > bytes.len() {
1185 return None;
1186 }
1187 let lsn = Lsn::from_u64(u64::from_be_bytes(
1188 bytes[pos..pos + 8].try_into().ok()?,
1189 ));
1190 pos += 8;
1191 if pos + 1 > bytes.len() {
1192 return None;
1193 }
1194 let has_data = bytes[pos] != 0;
1195 pos += 1;
1196 let data = if has_data {
1197 if pos + 4 > bytes.len() {
1198 return None;
1199 }
1200 let data_len =
1201 u32::from_be_bytes(bytes[pos..pos + 4].try_into().ok()?)
1202 as usize;
1203 pos += 4;
1204 if pos + data_len > bytes.len() {
1205 return None;
1206 }
1207 let d = bytes[pos..pos + data_len].to_vec();
1208 pos += data_len;
1209 Some(d)
1210 } else {
1211 None
1212 };
1213 if pos + 1 > bytes.len() {
1214 return None;
1215 }
1216 let known_deleted = bytes[pos] != 0;
1217 pos += 1;
1218 entries.push(BinEntry {
1219 key,
1220 lsn,
1221 data,
1222 known_deleted,
1223 dirty: false, // freshly loaded from log — clean
1224 expiration_time: 0,
1225 });
1226 }
1227 // Keys stored in the serialized format are full (uncompressed) keys.
1228 // Re-establish the key prefix after loading so that memory use and
1229 // search performance match an in-memory BIN.
1230 // `IN.readFromLog()` → key prefix is part of the wire
1231 // format in the; in Noxu we store full keys and recompute on load.
1232 let mut bin = BinStub {
1233 node_id,
1234 level: BIN_LEVEL,
1235 entries,
1236 key_prefix: Vec::new(),
1237 dirty: false,
1238 is_delta: false,
1239 last_full_lsn: NULL_LSN, // caller sets this to the logged LSN
1240 last_delta_lsn: NULL_LSN,
1241 generation: 0,
1242 parent: None,
1243 expiration_in_hours: true,
1244 cursor_count: 0,
1245 prohibit_next_delta: false,
1246 };
1247 // Recompute key prefix from the full keys just loaded.
1248 // `IN.recalcKeyPrefix()` called after materializing from log.
1249 if bin.entries.len() >= 2 {
1250 bin.recompute_key_prefix();
1251 }
1252 Some(bin)
1253 }
1254
1255 /// Deserialise a BIN delta from the bytes produced by `serialize_delta()`.
1256 ///
1257 /// **DO NOT USE for BIN reconstruction.** This helper writes full
1258 /// (uncompressed) keys directly into slots without recomputing the BIN
1259 /// key prefix, so on a prefix-compressed BIN it corrupts the slot keys and
1260 /// breaks the sorted-suffix invariant. It is NOT wired into any live path.
1261 /// The correct delta-reconstruction path is
1262 /// `mutate_to_full_bin` → `apply_delta_to_bin` → `insert_with_prefix`,
1263 /// which recomputes the prefix. This function is retained only for the
1264 /// raw byte-format round-trip and must not be used to reconstitute a BIN.
1265 /// Tracked for removal — see the v3.x review synthesis (storage C-2).
1266 ///
1267 /// Returns `None` if `delta_bytes` is malformed.
1268 pub fn apply_delta(base: &mut BinStub, delta_bytes: &[u8]) -> Option<()> {
1269 if delta_bytes.len() < 12 {
1270 return None;
1271 }
1272 // node_id(u64BE) — must match base
1273 let _node_id = u64::from_be_bytes(delta_bytes[0..8].try_into().ok()?);
1274 let num_dirty =
1275 u32::from_be_bytes(delta_bytes[8..12].try_into().ok()?) as usize;
1276 let mut pos = 12usize;
1277 for _ in 0..num_dirty {
1278 // slot_idx(u32BE) | key_len(u32BE) | key | lsn(u64BE) | has_data(u8) [| data_len | data] | known_deleted(u8)
1279 if pos + 4 > delta_bytes.len() {
1280 return None;
1281 }
1282 let slot_idx =
1283 u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
1284 as usize;
1285 pos += 4;
1286 if pos + 4 > delta_bytes.len() {
1287 return None;
1288 }
1289 let key_len =
1290 u32::from_be_bytes(delta_bytes[pos..pos + 4].try_into().ok()?)
1291 as usize;
1292 pos += 4;
1293 if pos + key_len > delta_bytes.len() {
1294 return None;
1295 }
1296 let key = delta_bytes[pos..pos + key_len].to_vec();
1297 pos += key_len;
1298 if pos + 8 > delta_bytes.len() {
1299 return None;
1300 }
1301 let lsn = Lsn::from_u64(u64::from_be_bytes(
1302 delta_bytes[pos..pos + 8].try_into().ok()?,
1303 ));
1304 pos += 8;
1305 if pos + 1 > delta_bytes.len() {
1306 return None;
1307 }
1308 let has_data = delta_bytes[pos] != 0;
1309 pos += 1;
1310 let data = if has_data {
1311 if pos + 4 > delta_bytes.len() {
1312 return None;
1313 }
1314 let data_len = u32::from_be_bytes(
1315 delta_bytes[pos..pos + 4].try_into().ok()?,
1316 ) as usize;
1317 pos += 4;
1318 if pos + data_len > delta_bytes.len() {
1319 return None;
1320 }
1321 let d = delta_bytes[pos..pos + data_len].to_vec();
1322 pos += data_len;
1323 Some(d)
1324 } else {
1325 None
1326 };
1327 if pos + 1 > delta_bytes.len() {
1328 return None;
1329 }
1330 let known_deleted = delta_bytes[pos] != 0;
1331 pos += 1;
1332
1333 // Apply to base: update existing slot or insert new one.
1334 if slot_idx < base.entries.len() {
1335 base.entries[slot_idx].key = key;
1336 base.entries[slot_idx].lsn = lsn;
1337 base.entries[slot_idx].data = data;
1338 base.entries[slot_idx].known_deleted = known_deleted;
1339 base.entries[slot_idx].dirty = false;
1340 } else {
1341 // Slot index beyond current length — append.
1342 base.entries.push(BinEntry {
1343 key,
1344 lsn,
1345 data,
1346 known_deleted,
1347 dirty: false,
1348 expiration_time: 0,
1349 });
1350 }
1351 }
1352 Some(())
1353 }
1354
1355 /// Clear per-slot dirty flags and record `logged_at` as the LSN at which
1356 /// this BIN was last fully logged.
1357 ///
1358 /// Called by the checkpoint path after a successful full-BIN log write.
1359 /// `BIN.afterLog()` / `BIN.setLastFullLsn()`.
1360 pub fn clear_dirty_after_full_log(&mut self, logged_at: Lsn) {
1361 for e in &mut self.entries {
1362 e.dirty = false;
1363 }
1364 self.last_full_lsn = logged_at;
1365 self.dirty = false;
1366 // A full BIN captures all current state, so the delta-chain bound is
1367 // cleared: the next log may once again be a delta.
1368 // JE `IN.afterLog` clears the prohibit flag after a full log
1369 // (IN.java:5557 `bin.setProhibitNextDelta(false)`).
1370 self.prohibit_next_delta = false;
1371 }
1372
1373 /// Clear per-slot dirty flags after a successful delta log write.
1374 ///
1375 /// `last_full_lsn` is NOT updated — the full LSN only changes after a
1376 /// full BIN write.
1377 /// `BIN.afterLog()` (delta path).
1378 pub fn clear_dirty_after_delta_log(&mut self) {
1379 for e in &mut self.entries {
1380 e.dirty = false;
1381 }
1382 self.dirty = false;
1383 }
1384}
1385
1386impl TreeNode {
1387 /// Returns true if this is a BIN (bottom internal node).
1388 pub fn is_bin(&self) -> bool {
1389 matches!(self, TreeNode::Bottom(_))
1390 }
1391
1392 /// Returns the level of this node.
1393 pub fn level(&self) -> i32 {
1394 match self {
1395 TreeNode::Internal(n) => n.level,
1396 TreeNode::Bottom(b) => b.level,
1397 }
1398 }
1399
1400 /// Faithful in-memory heap footprint of this node, in bytes.
1401 ///
1402 /// JE `IN.getBudgetedMemorySize()` (IN.java) returns the running
1403 /// `inMemorySize` that `MemoryBudget` tracks for the node: the fixed
1404 /// IN/BIN struct overhead plus, per slot, the fixed entry overhead and the
1405 /// variable key (and embedded-LN data for BINs) bytes. This is the single
1406 /// source of truth for both the live tree accounting and the evictor's
1407 /// detach credit (EV-13) — keeping it on `TreeNode` avoids the formula
1408 /// drifting between `noxu-tree` and `noxu-evictor`.
1409 ///
1410 /// Rust has a fixed struct layout (unlike JE's `Sizeof`-measured JVM
1411 /// constants) so `size_of` is exact for the fixed overheads; the variable
1412 /// part mirrors JE's per-slot `entryKeys`/embedded-data accounting.
1413 pub fn budgeted_memory_size(&self) -> u64 {
1414 use std::mem::size_of;
1415 match self {
1416 TreeNode::Bottom(b) => {
1417 (size_of::<BinStub>()
1418 + b.entries.len() * size_of::<BinEntry>()
1419 + b.key_prefix.len()
1420 + b.entries
1421 .iter()
1422 .map(|e| {
1423 e.key.len()
1424 + e.data.as_ref().map(|d| d.len()).unwrap_or(0)
1425 })
1426 .sum::<usize>()) as u64
1427 }
1428 TreeNode::Internal(n) => {
1429 (size_of::<InNodeStub>()
1430 + n.entries.len() * size_of::<InEntry>()
1431 + n.entries.iter().map(|e| e.key.len()).sum::<usize>())
1432 as u64
1433 }
1434 }
1435 }
1436
1437 /// Binary search for a key in this node.
1438 ///
1439 /// For BIN nodes the search is prefix-aware: if the BIN has a key prefix,
1440 /// `key` (a full, uncompressed key) is compared against stored suffixes
1441 /// after stripping the prefix.
1442 /// `IN.findEntry(key, indicateIfDuplicate, exact)`.
1443 ///
1444 /// Returns index with EXACT_MATCH flag set if exact match found.
1445 /// If exact is false, returns insertion point.
1446 pub fn find_entry(&self, key: &[u8], _indicator: bool, exact: bool) -> i32 {
1447 match self {
1448 TreeNode::Internal(n) => {
1449 let result = n
1450 .entries
1451 .binary_search_by(|entry| entry.key.as_slice().cmp(key));
1452 match result {
1453 Ok(idx) => (idx as i32) | EXACT_MATCH,
1454 Err(idx) => {
1455 if exact {
1456 -1
1457 } else {
1458 // Floor (not insertion point): the child slot to
1459 // descend into is the largest entry ≤ key. Slot 0
1460 // is the leftmost child, so a key below every
1461 // separator floors to 0. (St-H5: previously
1462 // returned the insertion point `idx`, which routes
1463 // one child too far right.)
1464 (idx as i32 - 1).max(0)
1465 }
1466 }
1467 }
1468 }
1469 TreeNode::Bottom(b) => {
1470 // Use prefix-aware search: the stored key is a suffix when
1471 // key_prefix is non-empty.
1472 let (idx, found) = b.find_entry_compressed(key);
1473 if found {
1474 (idx as i32) | EXACT_MATCH
1475 } else if exact {
1476 -1
1477 } else {
1478 idx as i32
1479 }
1480 }
1481 }
1482 }
1483
1484 /// Gets the number of entries in this node.
1485 pub fn get_n_entries(&self) -> usize {
1486 match self {
1487 TreeNode::Internal(n) => n.entries.len(),
1488 TreeNode::Bottom(b) => b.entries.len(),
1489 }
1490 }
1491
1492 // ========================================================================
1493 // Dirty flag
1494 // ========================================================================
1495
1496 /// Returns true if this node has been modified since last checkpoint.
1497 ///
1498 /// `IN.getDirty()`.
1499 pub fn is_dirty(&self) -> bool {
1500 match self {
1501 TreeNode::Internal(n) => n.dirty,
1502 TreeNode::Bottom(b) => b.dirty,
1503 }
1504 }
1505
1506 /// Sets or clears the dirty flag on this node.
1507 ///
1508 /// `IN.setDirty(boolean dirty)`.
1509 pub fn set_dirty(&mut self, dirty: bool) {
1510 match self {
1511 TreeNode::Internal(n) => n.dirty = dirty,
1512 TreeNode::Bottom(b) => b.dirty = dirty,
1513 }
1514 }
1515
1516 // ========================================================================
1517 // LRU generation
1518 // ========================================================================
1519
1520 /// Returns the LRU generation counter.
1521 ///
1522 /// `IN.getGeneration()`.
1523 pub fn get_generation(&self) -> u64 {
1524 match self {
1525 TreeNode::Internal(n) => n.generation,
1526 TreeNode::Bottom(b) => b.generation,
1527 }
1528 }
1529
1530 /// Sets the LRU generation counter.
1531 ///
1532 /// `IN.setGeneration(long gen)`.
1533 pub fn set_generation(&mut self, r#gen: u64) {
1534 match self {
1535 TreeNode::Internal(n) => n.generation = r#gen,
1536 TreeNode::Bottom(b) => b.generation = r#gen,
1537 }
1538 }
1539
1540 // ========================================================================
1541 // Parent pointer
1542 // ========================================================================
1543
1544 /// Returns a clone of the weak parent pointer, if any.
1545 pub fn get_parent(&self) -> Option<Weak<RwLock<TreeNode>>> {
1546 match self {
1547 TreeNode::Internal(n) => n.parent.clone(),
1548 TreeNode::Bottom(b) => b.parent.clone(),
1549 }
1550 }
1551
1552 /// Sets the weak parent pointer on this node.
1553 pub fn set_parent(&mut self, parent: Option<Weak<RwLock<TreeNode>>>) {
1554 match self {
1555 TreeNode::Internal(n) => n.parent = parent,
1556 TreeNode::Bottom(b) => b.parent = parent,
1557 }
1558 }
1559
1560 // ========================================================================
1561 // Log serialization
1562 // ========================================================================
1563
1564 /// Estimates the serialized byte size of this node for log/checkpoint use.
1565 ///
1566 /// `IN.getLogSize()` — Noxu-native serialization format.
1567 ///
1568 /// Format (big-endian):
1569 /// - node_id : 8 bytes
1570 /// - level : 4 bytes
1571 /// - n_entries : 4 bytes
1572 /// - dirty : 1 byte
1573 /// - For each entry:
1574 /// - key_len : 2 bytes
1575 /// - key : key_len bytes
1576 /// - lsn : 8 bytes
1577 pub fn log_size(&self) -> usize {
1578 // Fixed header: node_id(8) + level(4) + n_entries(4) + dirty(1)
1579 let mut size: usize = 8 + 4 + 4 + 1;
1580 match self {
1581 TreeNode::Internal(n) => {
1582 for entry in &n.entries {
1583 size += 2 + entry.key.len() + 8; // key_len + key + lsn
1584 }
1585 }
1586 TreeNode::Bottom(b) => {
1587 for entry in &b.entries {
1588 size += 2 + entry.key.len() + 8; // key_len + key + lsn
1589 }
1590 }
1591 }
1592 size
1593 }
1594
1595 /// Serializes this node to bytes for log writing.
1596 ///
1597 /// `IN.writeToLog(ByteBuffer logBuffer)` — Noxu-native
1598 /// format matching `log_size()`.
1599 pub fn write_to_bytes(&self) -> Vec<u8> {
1600 let mut buf = Vec::with_capacity(self.log_size());
1601 match self {
1602 TreeNode::Internal(n) => {
1603 buf.extend_from_slice(&n.node_id.to_be_bytes());
1604 buf.extend_from_slice(&n.level.to_be_bytes());
1605 buf.extend_from_slice(&(n.entries.len() as u32).to_be_bytes());
1606 buf.push(n.dirty as u8);
1607 for entry in &n.entries {
1608 buf.extend_from_slice(
1609 &(entry.key.len() as u16).to_be_bytes(),
1610 );
1611 buf.extend_from_slice(&entry.key);
1612 buf.extend_from_slice(&entry.lsn.as_u64().to_be_bytes());
1613 }
1614 }
1615 TreeNode::Bottom(b) => {
1616 buf.extend_from_slice(&b.node_id.to_be_bytes());
1617 buf.extend_from_slice(&b.level.to_be_bytes());
1618 buf.extend_from_slice(&(b.entries.len() as u32).to_be_bytes());
1619 buf.push(b.dirty as u8);
1620 for entry in &b.entries {
1621 buf.extend_from_slice(
1622 &(entry.key.len() as u16).to_be_bytes(),
1623 );
1624 buf.extend_from_slice(&entry.key);
1625 buf.extend_from_slice(&entry.lsn.as_u64().to_be_bytes());
1626 }
1627 }
1628 }
1629 buf
1630 }
1631}
1632
/// Internal helper used during splits to carry entries of either node kind.
///
/// `BinStub` and `InNodeStub` store different entry types, so we need a
/// common wrapper to pass split slices around without code duplication.
enum SplitEntries {
    /// Entries of the internal-node entry type (`InEntry`).
    Internal(Vec<InEntry>),
    /// Entries of the BIN entry type (`BinEntry`).
    Bottom(Vec<BinEntry>),
}
1641
1642impl SplitEntries {
1643 /// Returns the number of entries.
1644 fn len(&self) -> usize {
1645 match self {
1646 SplitEntries::Internal(v) => v.len(),
1647 SplitEntries::Bottom(v) => v.len(),
1648 }
1649 }
1650
1651 /// Returns the key at `index` as a slice.
1652 fn get_key(&self, index: usize) -> &[u8] {
1653 match self {
1654 SplitEntries::Internal(v) => v[index].key.as_slice(),
1655 SplitEntries::Bottom(v) => v[index].key.as_slice(),
1656 }
1657 }
1658
1659 /// Returns a sub-range `[lo, hi)` as a new `SplitEntries`.
1660 fn slice(&self, lo: usize, hi: usize) -> Self {
1661 match self {
1662 SplitEntries::Internal(v) => {
1663 SplitEntries::Internal(v[lo..hi].to_vec())
1664 }
1665 SplitEntries::Bottom(v) => SplitEntries::Bottom(v[lo..hi].to_vec()),
1666 }
1667 }
1668}
1669
/// Tri-state outcome from one attempt at
/// `Tree::get_adjacent_bin_attempt`.
///
/// Distinguishes "the tree genuinely has no BIN in the requested
/// direction" (→ propagate as end-of-iteration) from "the path we
/// captured was invalidated by a concurrent split" (→ caller
/// retries from root). This split is necessary because the cursor
/// translates a `None` from `get_adjacent_bin` into
/// `OperationStatus::NotFound`, which is indistinguishable from a
/// real end-of-tree.
#[derive(Debug)]
enum AdjacentBinOutcome {
    /// A BIN was found in the requested direction.
    // NOTE(review): the payload is presumably the adjacent BIN's entries —
    // confirm against `get_adjacent_bin_attempt`.
    Found(Vec<BinEntry>),
    /// The tree genuinely has no BIN in the requested direction.
    NoAdjacent,
    /// A concurrent split invalidated our captured path; the
    /// caller should retry from root.
    SplitRaceRetry,
}
1690
/// Split hint for the `splitSpecial` heuristic.
///
/// JE `Tree.forceSplit` tracks `allLeftSideDescent` / `allRightSideDescent`
/// (true if **every** routing decision during the top-down descent followed
/// the leftmost / rightmost child). At split time, when one of those flags
/// is set, `IN.splitSpecial` forces the split index to 1 (left side) or
/// `nEntries - 1` (right side) instead of `nEntries / 2`.
///
/// Effect: for sequential-append workloads the left BIN stays near-full
/// after every split (only one entry migrates to the new sibling), cutting
/// the split count roughly in half and reducing write amplification.
///
/// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SplitHint {
    /// Normal midpoint split (`n_entries / 2`).
    Normal,
    /// Key was at position 0 on every level of descent.
    /// → `split_index = 1`, so only the first entry stays in the left node.
    AllLeft,
    /// Key was at the rightmost position on every level of descent.
    /// → `split_index = n_entries - 1` so left node keeps almost everything.
    AllRight,
}
1715
1716impl Tree {
1717 /// Creates a new empty tree.
1718 ///
1719 /// Constructor.
1720 pub fn new(database_id: u64, max_entries_per_node: usize) -> Self {
1721 Tree {
1722 database_id,
1723 max_entries_per_node,
1724 root: RwLock::new(None),
1725 root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
1726 root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
1727 root_splits: AtomicU64::new(0),
1728 relatches_required: AtomicU64::new(0),
1729 key_comparator: None,
1730 memory_counter: None,
1731 in_list_listener: None,
1732 redo_capacity_hint: 0,
1733 key_prefixing: false, // JE default: KEY_PREFIXING_DEFAULT = false
1734 }
1735 }
1736
1737 /// Installs a shared memory counter for evictor / MemoryBudget feedback.
1738 ///
1739 /// → `env.getMemoryBudget().updateTreeMemoryUsage(delta)`
1740 ///. The counter is updated on every BIN entry insert/delete.
1741 pub fn set_memory_counter(&mut self, counter: Arc<AtomicI64>) {
1742 self.memory_counter = Some(counter);
1743 }
1744
1745 /// Installs the [`InListListener`] (the evictor) so node add/access/remove
1746 /// feed the LRU lists. JE: `INList` registration that feeds
1747 /// `Evictor.addBack`/`moveBack`/`remove`.
1748 pub fn set_in_list_listener(&mut self, listener: Arc<dyn InListListener>) {
1749 self.in_list_listener = Some(listener);
1750 }
1751
1752 /// Notify the listener that a node became resident (JE `Evictor.addBack`).
1753 #[inline]
1754 fn note_added(&self, node_id: u64) {
1755 if let Some(l) = &self.in_list_listener {
1756 l.note_ins_added(node_id);
1757 }
1758 }
1759
1760 /// Notify the listener that a resident node was accessed
1761 /// (JE `Evictor.moveBack` — LRU touch).
1762 #[inline]
1763 fn note_accessed(&self, node_id: u64) {
1764 if let Some(l) = &self.in_list_listener {
1765 l.note_ins_accessed(node_id);
1766 }
1767 }
1768
1769 /// Notify the listener that a node was removed (JE `Evictor.remove`).
1770 #[inline]
1771 fn note_removed(&self, node_id: u64) {
1772 if let Some(l) = &self.in_list_listener {
1773 l.note_ins_removed(node_id);
1774 }
1775 }
1776
1777 /// Creates a new empty tree with a custom key comparator.
1778 ///
1779 /// Used for sorted-duplicate databases where keys are two-part
1780 /// composite keys that require a custom ordering function.
1781 ///
1782 /// Constructor with `btreeComparator` parameter.
1783 pub fn new_with_comparator(
1784 database_id: u64,
1785 max_entries_per_node: usize,
1786 comparator: KeyComparatorFn,
1787 ) -> Self {
1788 Tree {
1789 database_id,
1790 max_entries_per_node,
1791 root: RwLock::new(None),
1792 root_latch: SharedLatch::new(LatchContext::new("TreeRoot"), false),
1793 root_log_lsn: RwLock::new(noxu_util::NULL_LSN),
1794 root_splits: AtomicU64::new(0),
1795 relatches_required: AtomicU64::new(0),
1796 key_comparator: Some(comparator),
1797 memory_counter: None,
1798 in_list_listener: None,
1799 redo_capacity_hint: 0,
1800 key_prefixing: false,
1801 }
1802 }
1803
/// Sets the key-prefixing flag.
///
/// When `true`, BIN key-prefix compression is enabled: shared leading
/// bytes are factored out of each slot's key. When `false` (the
/// default), keys are stored verbatim — matching JE
/// `DatabaseConfig.setKeyPrefixing(false)` / `IN.computeKeyPrefix`
/// returning `null`.
///
/// This method only records the flag on the tree; it does not touch any
/// existing node.
///
/// Ref: `IN.java computeKeyPrefix` ~line 2456.
pub fn set_key_prefixing(&mut self, enabled: bool) {
    self.key_prefixing = enabled;
}
1816
1817 /// Sets the key comparator, replacing any existing one.
1818 pub fn set_comparator(&mut self, comparator: KeyComparatorFn) {
1819 self.key_comparator = Some(comparator);
1820 }
1821
1822 /// Store a capacity hint used by `redo_insert` when it creates the first
1823 /// BIN for this tree (the first-key path).
1824 ///
1825 /// The first BIN's `entries` Vec is pre-allocated with
1826 /// `capacity.min(max_entries_per_node)` slots, eliminating the
1827 /// Vec-resize doubling cycle (1 → 2 → 4 → … → cap) that would
1828 /// otherwise occur during the redo loop.
1829 ///
1830 /// Call once before the redo loop. Has no effect on `insert` (the
1831 /// normal, non-recovery path).
1832 ///
1833 /// Wave 11-K optimisation (Fix 3).
1834 pub fn hint_redo_capacity(&mut self, capacity: usize) {
1835 self.redo_capacity_hint = capacity;
1836 }
1837
1838 /// Returns the current redo capacity hint (0 = no hint set).
1839 pub fn get_redo_capacity_hint(&self) -> usize {
1840 self.redo_capacity_hint
1841 }
1842
1843 /// Takes the key comparator out of this tree (leaving None).
1844 pub fn take_comparator(&mut self) -> Option<KeyComparatorFn> {
1845 self.key_comparator.take()
1846 }
1847
1848 /// Returns a reference to the key comparator, if configured.
1849 ///
1850 /// Used by `CursorImpl::find_bin_for_key` (R4 fix) so the cursor's own
1851 /// IN-level descent uses the same comparator-aware floor slot as the
1852 /// tree's own search paths. Mirrors JE `DatabaseImpl.getKeyComparator()`.
1853 pub fn get_comparator(&self) -> Option<&KeyComparatorFn> {
1854 self.key_comparator.as_ref()
1855 }
1856
1857 /// Returns the key comparator if set, or performs lexicographic comparison.
1858 #[inline]
1859 fn key_cmp(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering {
1860 match &self.key_comparator {
1861 Some(cmp) => cmp(a, b),
1862 None => a.cmp(b),
1863 }
1864 }
1865
1866 /// Floor child slot index for descending an internal node: the largest
1867 /// slot whose key is ≤ `key`. Slot 0 carries a virtual −∞ key (always
1868 /// qualifies); `entries[1..]` are sorted ascending, so this binary-searches
1869 /// the partition point instead of an O(n) linear walk (St-H4). Uses
1870 /// `key_cmp` so a configured custom comparator is honoured on every descent
1871 /// path. Returns 0 for an empty/single-slot node.
1872 fn upper_in_floor_index(&self, entries: &[InEntry], key: &[u8]) -> usize {
1873 if entries.len() <= 1 {
1874 return 0;
1875 }
1876 entries[1..].partition_point(|e| {
1877 self.key_cmp(e.key.as_slice(), key) != std::cmp::Ordering::Greater
1878 })
1879 }
1880
1881 /// Returns true if the tree has no root (is empty).
1882 pub fn is_empty(&self) -> bool {
1883 self.root.read().is_none()
1884 }
1885
1886 /// Sets the root of the tree.
1887 ///
1888 /// Must hold root_latch exclusively before calling.
1889 pub fn set_root(&self, node: TreeNode) {
1890 *self.root.write() = Some(Arc::new(RwLock::new(node)));
1891 }
1892
1893 /// Returns the root Arc, if any.
1894 ///
1895 /// Returns a cloned `Arc` rather than a reference so the caller does not
1896 /// hold the inner `RwLock` guard.
1897 pub fn get_root(&self) -> Option<Arc<RwLock<TreeNode>>> {
1898 self.root.read().clone()
1899 }
1900
1901 /// Returns the database ID.
1902 pub fn get_database_id(&self) -> u64 {
1903 self.database_id
1904 }
1905
1906 /// Count the total number of live (non-deleted) entries across all BINs.
1907 ///
1908 /// Used by `DatabaseImpl::set_recovered_tree()` to initialise the
1909 /// per-database `entry_count` AtomicU64 after recovery replays the log.
1910 pub fn count_entries(&self) -> u64 {
1911 let mut total = 0u64;
1912 if let Some(root) = self.get_root() {
1913 Self::count_entries_recursive(&root, &mut total);
1914 }
1915 total
1916 }
1917
1918 fn count_entries_recursive(
1919 node_arc: &Arc<RwLock<TreeNode>>,
1920 total: &mut u64,
1921 ) {
1922 let guard = node_arc.read();
1923 match &*guard {
1924 TreeNode::Bottom(b) => {
1925 // Count only live (non-known_deleted) entries.
1926 *total += b.entries.iter().filter(|e| !e.known_deleted).count()
1927 as u64;
1928 }
1929 TreeNode::Internal(n) => {
1930 let children: Vec<Arc<RwLock<TreeNode>>> =
1931 n.entries.iter().filter_map(|e| e.child.clone()).collect();
1932 drop(guard);
1933 for child in children {
1934 Self::count_entries_recursive(&child, total);
1935 }
1936 }
1937 }
1938 }
1939
1940 /// Sum the real in-memory heap footprint of every resident node in the
1941 /// tree (DBI-23 oracle / reconciliation), in bytes.
1942 ///
1943 /// Walks all resident IN/BIN nodes and adds each node's
1944 /// `budgeted_memory_size` (JE `IN.getBudgetedMemorySize`). This is the
1945 /// authoritative "real heap" figure the incrementally-maintained
1946 /// `memory_counter` is meant to approximate; an engine can call it to
1947 /// reconcile counter drift, and the DBI-23 test uses it as the oracle the
1948 /// live counter must stay within tolerance of.
1949 pub fn total_budgeted_memory(&self) -> u64 {
1950 let mut total = 0u64;
1951 if let Some(root) = self.get_root() {
1952 Self::total_budgeted_memory_recursive(&root, &mut total);
1953 }
1954 total
1955 }
1956
1957 fn total_budgeted_memory_recursive(
1958 node_arc: &Arc<RwLock<TreeNode>>,
1959 total: &mut u64,
1960 ) {
1961 let guard = node_arc.read();
1962 *total += guard.budgeted_memory_size();
1963 if let TreeNode::Internal(n) = &*guard {
1964 let children: Vec<Arc<RwLock<TreeNode>>> =
1965 n.entries.iter().filter_map(|e| e.child.clone()).collect();
1966 drop(guard);
1967 for child in children {
1968 Self::total_budgeted_memory_recursive(&child, total);
1969 }
1970 }
1971 }
1972
1973 /// Search for a BIN that should contain the given key.
1974 ///
1975 /// This is the core tree traversal operation. It walks from root to BIN
1976 /// using latch-coupling (acquire child latch, then release parent latch).
1977 ///
1978 /// . Descends the tree until a BIN is
1979 /// reached, following the child pointer at the slot whose key is the
1980 /// largest key <= the search key (the "LTE" rule). Slot 0 in every upper
1981 /// IN carries a virtual key (-infinity) so any search key routes through
1982 /// it when all real keys are larger.
1983 ///
1984 /// Returns a SearchResult indicating where the key is or should be.
1985 /// Returns None if tree is empty.
1986 pub fn search(&self, key: &[u8]) -> Option<SearchResult> {
1987 let root = self.get_root()?;
1988
1989 // Hand-over-hand latch coupling for the descent. At each level we
1990 // hold a `parking_lot::ArcRwLockReadGuard` on the current node;
1991 // before dropping it, we acquire the child's read guard via
1992 // `Arc::read_arc`. This keeps a continuous chain of read locks
1993 // along the descent path so that no concurrent `split_child(parent,
1994 // …)` can run on a node we are about to enter — `split_child` takes
1995 // `parent.write()` to install the new sibling, and that write
1996 // blocks while we hold `parent.read()`. Without this, the prior
1997 // pattern (capture child Arc, drop parent guard, then take child
1998 // read lock) left a window in which a split could relocate the
1999 // child entries: a search for a key that should have ended up in
2000 // the new sibling would instead reach the (now left-half) child
2001 // and return a false `NotFound`.
2002 //
2003 // `read_arc()` returns `ArcRwLockReadGuard<RawRwLock, TreeNode>`
2004 // — a guard that owns its own Arc reference, so it has no
2005 // borrow lifetime and can be held across loop iterations and
2006 // assignment.
2007 let mut guard: parking_lot::ArcRwLockReadGuard<
2008 parking_lot::RawRwLock,
2009 TreeNode,
2010 > = root.read_arc();
2011
2012 loop {
2013 if guard.is_bin() {
2014 // JE: IN.fetchTarget / CursorImpl access moves the reached
2015 // BIN toward the hot end of the evictor's LRU list
2016 // (Evictor.moveBack). A freshly split BIN that has not yet
2017 // been registered is added here (moveBack is add-if-absent).
2018 if let TreeNode::Bottom(bin) = &*guard {
2019 self.note_accessed(bin.node_id);
2020 }
2021 // Reached a BIN: final key lookup within the same guard.
2022 // Use indicate_if_duplicate=true so an exact match sets
2023 // EXACT_MATCH in the return value. Guard against -1 (not
2024 // found): -1i32 has all bits set, so the naive
2025 // `index & EXACT_MATCH != 0` check would incorrectly report
2026 // an exact match for a missing key.
2027 let (found, raw_idx) = match &*guard {
2028 TreeNode::Bottom(bin) => match &self.key_comparator {
2029 Some(cmp) => {
2030 let (idx, exact) =
2031 bin.find_entry_cmp(key, cmp.as_ref());
2032 (exact, idx as i32)
2033 }
2034 None => {
2035 let index = guard.find_entry(key, true, true);
2036 let exact =
2037 index >= 0 && (index & EXACT_MATCH != 0);
2038 (exact, index & 0xFFFF)
2039 }
2040 },
2041 _ => {
2042 let index = guard.find_entry(key, true, true);
2043 let exact = index >= 0 && (index & EXACT_MATCH != 0);
2044 (exact, index & 0xFFFF)
2045 }
2046 };
2047 // CursorImpl.isProbablyExpired(): if an exact match
2048 // was found, check whether the entry's TTL has already elapsed.
2049 // If it has, treat the slot as not found so callers skip it.
2050 //
2051 // TREE-F1: also treat a known_deleted slot as ABSENT on an
2052 // exact lookup, mirroring the tail of IN.findEntry
2053 // (IN.java:3197): `if (ret >= 0 && exact &&
2054 // isEntryKnownDeleted(ret & 0xffff)) return -1;`. KD slots
2055 // legitimately exist in live BINs during BIN-delta
2056 // reconstitution until the compressor reclaims them.
2057 let found = if found {
2058 if let TreeNode::Bottom(bin) = &*guard {
2059 let idx = (raw_idx & 0x7FFF) as usize;
2060 bin.slot_is_live(idx)
2061 } else {
2062 found
2063 }
2064 } else {
2065 found
2066 };
2067 return Some(SearchResult::with_values(found, raw_idx, false));
2068 }
2069
2070 // Upper IN: find the child slot with the largest key <= search
2071 // key, and capture the child Arc WHILE HOLDING the guard.
2072 // Slot 0 has a virtual key that compares as -infinity.
2073 let next_arc = match &*guard {
2074 TreeNode::Internal(n) => {
2075 if n.entries.is_empty() {
2076 return None;
2077 }
2078 // Walk forward as long as entry.key <= key, starting
2079 // from slot 0 (which always qualifies because its key
2080 // is the virtual -infinity key).
2081 let idx = self.upper_in_floor_index(&n.entries, key);
2082 n.entries.get(idx)?.child.clone()?
2083 }
2084 TreeNode::Bottom(_) => {
2085 unreachable!("is_bin() returned false above")
2086 }
2087 };
2088 // Take the child read lock BEFORE releasing the parent's read
2089 // lock — this is the actual hand-over-hand step that closes
2090 // the descender-vs-splitter race for the read path.
2091 let next_guard = next_arc.read_arc();
2092 drop(guard);
2093 guard = next_guard;
2094 }
2095 }
2096
2097 /// Combined search-and-fetch: descend once to the BIN and return the
2098 /// slot's data together with a reference to the BIN arc.
2099 ///
2100 /// Replaces the previous three-descent sequence on the `Database::get`
2101 /// hot path:
2102 /// 1. `Tree::search` — existence check only.
2103 /// 2. `CursorImpl::get_data_from_tree` — re-descended to fetch data.
2104 /// 3. `CursorImpl::find_bin_for_key` — re-descended for BIN pinning.
2105 ///
2106 /// One descent now does all three jobs. At the BIN level it uses the
2107 /// existing binary-search helper `find_entry_compressed` instead of the
2108 /// O(n) `iter().find()` used by `get_data_from_tree`.
2109 ///
2110 /// Returns `None` only when the tree is empty. Otherwise returns
2111 /// `Some(SlotFetch)` — callers must inspect `SlotFetch::found` to
2112 /// determine whether the key was present. The BIN read-guard is released
2113 /// before this method returns so callers may safely call `lock_ln`
2114 /// (which may block) without holding any tree latch.
2115 ///
2116 /// Wave-11-I — see the 2026 review.
2117 pub fn search_with_data(&self, key: &[u8]) -> Option<SlotFetch> {
2118 let root = self.get_root()?;
2119 let mut guard: parking_lot::ArcRwLockReadGuard<
2120 parking_lot::RawRwLock,
2121 TreeNode,
2122 > = root.read_arc();
2123
2124 loop {
2125 if guard.is_bin() {
2126 // Capture the BIN Arc before inspecting entries.
2127 let bin_arc =
2128 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
2129
2130 let (found, data, lsn, slot_index) = match &*guard {
2131 TreeNode::Bottom(bin) => {
2132 let (idx, exact) = match &self.key_comparator {
2133 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
2134 None => bin.find_entry_compressed(key),
2135 };
2136 if exact {
2137 // TREE-F1: a slot is reported as found only when
2138 // live (not known_deleted, not TTL-expired) — the
2139 // same predicate used by Tree::search and the
2140 // cursor scan. Mirrors IN.findEntry (IN.java:3197)
2141 // and CursorImpl.isProbablyExpired.
2142 if bin.slot_is_live(idx) {
2143 let e = &bin.entries[idx];
2144 (true, e.data.clone(), e.lsn.as_u64(), idx)
2145 } else {
2146 (false, None, 0u64, 0)
2147 }
2148 } else {
2149 (false, None, 0u64, 0)
2150 }
2151 }
2152 _ => (false, None, 0u64, 0),
2153 };
2154 // Release the BIN read guard before returning so the caller
2155 // can call lock_ln (which may block) without holding a latch.
2156 drop(guard);
2157 return Some(SlotFetch {
2158 found,
2159 data,
2160 lsn,
2161 slot_index,
2162 bin_arc,
2163 });
2164 }
2165
2166 // Upper IN: same hand-over-hand descent as `Tree::search`.
2167 let next_arc = match &*guard {
2168 TreeNode::Internal(n) => {
2169 if n.entries.is_empty() {
2170 return None;
2171 }
2172 // Slot 0 = virtual −∞; walk forward while entry.key ≤ key.
2173 let idx = self.upper_in_floor_index(&n.entries, key);
2174 n.entries.get(idx)?.child.clone()?
2175 }
2176 TreeNode::Bottom(_) => {
2177 unreachable!("is_bin() returned false above")
2178 }
2179 };
2180 let next_guard = next_arc.read_arc();
2181 drop(guard);
2182 guard = next_guard;
2183 }
2184 }
2185
2186 /// Sets the expiration time (in absolute hours since Unix epoch) for an
2187 /// existing key's BIN slot.
2188 ///
2189 /// Returns `true` if the key was found and updated, `false` otherwise.
2190 ///
2191 /// Used by `Database::put_with_options()` to apply per-record TTL.
2192 /// `IN.entryExpiration` / `BIN.expirationInHours` path.
2193 pub fn update_key_expiration(
2194 &self,
2195 key: &[u8],
2196 expiration_hours: u32,
2197 ) -> bool {
2198 let root = match self.get_root() {
2199 Some(r) => r,
2200 None => return false,
2201 };
2202 // Hand-over-hand latch coupling for the descent. At the BIN we
2203 // need a write lock; we drop our read lock first and take the
2204 // write lock under the protection of the *outer* parent's read
2205 // lock (held by the previous loop iteration's guard). For the
2206 // first iteration there is no outer parent, but no `split_child`
2207 // can run on the root itself in that single-level case because
2208 // root splits go through `split_root_if_needed` which holds
2209 // `self.root.write()`. So the worst case is that the root is
2210 // promoted from a single BIN to a level-2 IN between our read
2211 // detect and our write — handled by the `is_bin` re-check
2212 // inside the write lock.
2213 //
2214 // We retry the descent up to a small bound to absorb the rare
2215 // case where a concurrent split moved this key into the new
2216 // sibling between the read-chain release and the write-lock
2217 // acquisition. Without the retry, the sole caller
2218 // (`Database::put_with_options`) would silently lose the TTL
2219 // for the affected key. Three attempts is generous: each
2220 // retry only races a single split and splits are infrequent.
2221 for _ in 0..3 {
2222 let mut guard: parking_lot::ArcRwLockReadGuard<
2223 parking_lot::RawRwLock,
2224 TreeNode,
2225 > = root.read_arc();
2226 let bin_arc;
2227 loop {
2228 if guard.is_bin() {
2229 bin_arc =
2230 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
2231 drop(guard);
2232 break;
2233 }
2234 let next_arc = match &*guard {
2235 TreeNode::Internal(n) => {
2236 if n.entries.is_empty() {
2237 return false;
2238 }
2239 let idx = self.upper_in_floor_index(&n.entries, key);
2240 match n.entries.get(idx).and_then(|e| e.child.clone()) {
2241 Some(c) => c,
2242 None => return false,
2243 }
2244 }
2245 TreeNode::Bottom(_) => unreachable!(),
2246 };
2247 let next_guard = next_arc.read_arc();
2248 drop(guard);
2249 guard = next_guard;
2250 }
2251
2252 // Now take the write lock on the BIN we descended to.
2253 let mut wguard = bin_arc.write();
2254 if let TreeNode::Bottom(bin) = &mut *wguard {
2255 let slot = if let Some(cmp) = &self.key_comparator {
2256 let (idx, exact) = bin.find_entry_cmp(key, cmp.as_ref());
2257 if exact { Some(idx) } else { None }
2258 } else {
2259 let (idx, exact) = bin.find_entry_compressed(key);
2260 if exact { Some(idx) } else { None }
2261 };
2262 if let Some(slot_idx) = slot
2263 && let Some(entry) = bin.entries.get_mut(slot_idx)
2264 {
2265 entry.expiration_time = expiration_hours;
2266 bin.expiration_in_hours = true;
2267 bin.dirty = true;
2268 return true;
2269 }
2270 }
2271 // Key not in this BIN — either it was never present or a
2272 // concurrent split moved it. Retry the descent; at most a
2273 // few iterations are needed to follow the key into its new
2274 // BIN.
2275 }
2276 false
2277 }
2278
2279 /// Returns the key and data of the first BIN entry at or after `key`.
2280 ///
2281 /// Descends with the tree's key comparator (same path as `search()`), then
2282 /// within the BIN finds the first slot whose stored key >= `key` using the
2283 /// comparator. Returns `None` if every entry in the tree is < `key`.
2284 ///
2285 /// Used by sorted-duplicate cursor `search(Set)` to position at the first
2286 /// (key, data) pair whose two-part key >= `lower_bound(primary_key)`.
2287 ///
2288 /// → BIN scan path.
2289 pub fn first_entry_at_or_after(
2290 &self,
2291 key: &[u8],
2292 ) -> Option<(Vec<u8>, Vec<u8>, u64)> {
2293 // Hand-over-hand latch coupling — see Tree::search for the
2294 // detailed rationale on why this closes a reader-vs-splitter
2295 // race window.
2296 let mut guard: parking_lot::ArcRwLockReadGuard<
2297 parking_lot::RawRwLock,
2298 TreeNode,
2299 > = self.get_root()?.read_arc();
2300
2301 loop {
2302 if guard.is_bin() {
2303 let result = match &*guard {
2304 TreeNode::Bottom(bin) => {
2305 let (mut idx, _exact) = match &self.key_comparator {
2306 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
2307 None => bin.find_entry_compressed(key),
2308 };
2309 // TREE-F1: skip non-live slots (known_deleted /
2310 // TTL-expired) at/after the floor index, mirroring the
2311 // cursor getNext skip (CursorImpl.java:2062-2064).
2312 while idx < bin.entries.len() && !bin.slot_is_live(idx)
2313 {
2314 idx += 1;
2315 }
2316 if idx < bin.entries.len() {
2317 let full_key =
2318 bin.get_full_key(idx).unwrap_or_default();
2319 let data = bin.entries[idx]
2320 .data
2321 .clone()
2322 .unwrap_or_default();
2323 let lsn = bin.entries[idx].lsn.as_u64();
2324 Some((full_key, data, lsn))
2325 } else {
2326 None
2327 }
2328 }
2329 _ => None,
2330 };
2331 return result;
2332 }
2333
2334 // Upper IN: same descent as search().
2335 let next_arc = match &*guard {
2336 TreeNode::Internal(n) => {
2337 if n.entries.is_empty() {
2338 return None;
2339 }
2340 let idx = self.upper_in_floor_index(&n.entries, key);
2341 n.entries.get(idx)?.child.clone()?
2342 }
2343 TreeNode::Bottom(_) => unreachable!(),
2344 };
2345 // Take child read lock BEFORE releasing parent's.
2346 let next_guard = next_arc.read_arc();
2347 drop(guard);
2348 guard = next_guard;
2349 }
2350 }
2351
2352 /// Like [`Tree::first_entry_at_or_after`] but also returns the BIN node
2353 /// (so callers may pin it) and the entry's slot index inside that
2354 /// BIN.
2355 ///
2356 /// Wave 11-N (Bug 2): `CursorImpl::search_dup` previously stored
2357 /// `current_index = 0` after a sorted-dup `Search`, which broke the
2358 /// fast-path of `retrieve_next` (and the slow path's
2359 /// `next_index = current_index + 1` arithmetic) for any primary
2360 /// that was not the first slot of its BIN. This helper hands back
2361 /// the real index so the cursor can be positioned correctly.
2362 ///
2363 /// CC-2 fix: uses the same `read_arc()` hand-over-hand latch coupling
2364 /// as every other descent method (`search`, `first_entry_at_or_after`,
2365 /// `get_first_node`, `get_adjacent_bin_attempt`). The original
2366 /// implementation did `arc.read().is_bin()` (lock acquired and released)
2367 /// then a SECOND `arc.read()` on the next line — a gap in which a
2368 /// concurrent split can promote the node (BIN→upper IN) or move the
2369 /// sought key to a new sibling, yielding a false "not found" for an
2370 /// existing key. Mirrors JE `Tree.searchSubTree` / `Tree.search`
2371 /// which hold the latch across the `is_bin()` test and the subsequent
2372 /// entry lookup.
2373 pub fn first_entry_at_or_after_with_index(
2374 &self,
2375 key: &[u8],
2376 ) -> Option<(
2377 Vec<u8>,
2378 Vec<u8>,
2379 usize,
2380 u64,
2381 std::sync::Arc<crate::NodeRwLock<TreeNode>>,
2382 )> {
2383 // Hand-over-hand latch coupling — identical strategy to
2384 // first_entry_at_or_after; the guard is held continuously across
2385 // is_bin() and the subsequent entry lookup so no split can
2386 // restructure the path between the two observations.
2387 let mut guard: parking_lot::ArcRwLockReadGuard<
2388 parking_lot::RawRwLock,
2389 TreeNode,
2390 > = self.get_root()?.read_arc();
2391 loop {
2392 if guard.is_bin() {
2393 if let TreeNode::Bottom(bin) = &*guard {
2394 let (idx, _exact) = match &self.key_comparator {
2395 Some(cmp) => bin.find_entry_cmp(key, cmp.as_ref()),
2396 None => bin.find_entry_compressed(key),
2397 };
2398 // TREE-F1: skip non-live slots (known_deleted /
2399 // TTL-expired) at/after the floor index
2400 // (CursorImpl.java:2062-2064).
2401 let mut idx = idx;
2402 while idx < bin.entries.len() && !bin.slot_is_live(idx) {
2403 idx += 1;
2404 }
2405 if idx < bin.entries.len() {
2406 let full_key =
2407 bin.get_full_key(idx).unwrap_or_default();
2408 let data =
2409 bin.entries[idx].data.clone().unwrap_or_default();
2410 let lsn = bin.entries[idx].lsn.as_u64();
2411 // Obtain the Arc for the BIN node the guard came from.
2412 // `ArcRwLockReadGuard::rwlock()` returns the backing Arc.
2413 let bin_arc =
2414 parking_lot::ArcRwLockReadGuard::rwlock(&guard)
2415 .clone();
2416 return Some((full_key, data, idx, lsn, bin_arc));
2417 } else {
2418 return None;
2419 }
2420 }
2421 return None;
2422 }
2423
2424 // Upper IN: descend as in first_entry_at_or_after / search.
2425 let next_arc = match &*guard {
2426 TreeNode::Internal(n) => {
2427 if n.entries.is_empty() {
2428 return None;
2429 }
2430 let idx = self.upper_in_floor_index(&n.entries, key);
2431 n.entries.get(idx)?.child.clone()?
2432 }
2433 TreeNode::Bottom(_) => unreachable!(),
2434 };
2435 // Acquire child's read lock BEFORE releasing the parent's — this
2436 // closes the window where a concurrent split could restructure
2437 // the path between the two observations.
2438 let next_guard = next_arc.read_arc();
2439 drop(guard);
2440 guard = next_guard;
2441 }
2442 }
2443
2444 /// Insert a key/data pair into the tree.
2445 ///
2446 /// . Handles the root-is-null case by
2447 /// creating a two-level tree (upper IN + BIN) per initialisation path,
2448 /// then delegates to `insert_recursive` which performs preemptive splitting
2449 /// as it descends.
2450 ///
2451 /// Returns Ok(true) if this was a new insert, Ok(false) if it was an update.
2452 pub fn insert(
2453 &self,
2454 key: Vec<u8>,
2455 data: Vec<u8>,
2456 lsn: Lsn,
2457 ) -> Result<bool, TreeError> {
2458 // Save sizes before potentially moving key/data — needed for memory tracking.
2459 let key_len = key.len();
2460 let data_len = data.len();
2461
2462 // First-key path. We MUST hold the write lock while testing
2463 // root.is_none() and replacing the root, otherwise N threads can all
2464 // observe an empty tree, each build a fresh single-entry root, and
2465 // the last writer's `*self.root.write() = Some(...)` silently
2466 // discards the others' inserts. (Reproducer:
2467 // xa_protocol_test::test_concurrent_independent_xids — 8 threads
2468 // each inserting their own key into an empty tree lost ~30% of
2469 // inserts before this lock change.)
2470 {
2471 let mut root_guard = self.root.write();
2472 if root_guard.is_none() {
2473 let bin_node_id = generate_node_id();
2474 let root_node_id = generate_node_id();
2475 let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
2476 node_id: bin_node_id,
2477 level: BIN_LEVEL,
2478 entries: vec![BinEntry {
2479 key,
2480 lsn,
2481 data: Some(data),
2482 known_deleted: false,
2483 dirty: false,
2484 expiration_time: 0,
2485 }],
2486 key_prefix: Vec::new(), // single entry — no common prefix yet
2487 dirty: true,
2488 is_delta: false,
2489 last_full_lsn: NULL_LSN,
2490 last_delta_lsn: NULL_LSN,
2491 generation: 0,
2492 parent: None, // set below after root_in is created
2493 // St-H6: use true to match the engine-wide invariant that
2494 // every BIN which may hold TTL entries uses hours granularity
2495 // (JE BIN.java default; matches tree.rs:980 and read_from_log).
2496 expiration_in_hours: true,
2497 cursor_count: 0,
2498 prohibit_next_delta: false,
2499 })));
2500
2501 // Upper IN at level 2; slot 0 uses an empty key (virtual root key).
2502 let root_arc =
2503 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
2504 node_id: root_node_id,
2505 level: MAIN_LEVEL | 2,
2506 entries: vec![InEntry {
2507 key: vec![], // virtual key for slot 0 in upper IN
2508 lsn,
2509 child: Some(bin.clone()),
2510 }],
2511 dirty: true,
2512 generation: 0,
2513 parent: None,
2514 })));
2515
2516 // Wire the BIN's parent pointer back to the root IN.
2517 {
2518 let mut g = bin.write();
2519 g.set_parent(Some(Arc::downgrade(&root_arc)));
2520 }
2521
2522 *root_guard = Some(root_arc);
2523
2524 // JE: IN.fetchTarget / initial tree build registers the new
2525 // resident nodes with the evictor (Evictor.addBack).
2526 self.note_added(root_node_id);
2527 self.note_added(bin_node_id);
2528
2529 // Count the first entry.
2530 if let Some(counter) = &self.memory_counter {
2531 let delta =
2532 (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
2533 counter.fetch_add(delta, Ordering::Relaxed);
2534 }
2535 return Ok(true);
2536 }
2537 // Another thread initialized the root while we were waiting for
2538 // the write lock; fall through and insert into the existing tree.
2539 }
2540
2541 // Check whether the root itself needs to be split before descending.
2542 // Tree.searchSplitsAllowed(): if rootIN.needsSplitting()
2543 // call splitRoot first.
2544 self.split_root_if_needed(lsn)?;
2545
2546 // Recursively insert, splitting children proactively as we descend
2547 // (forceSplit / searchSplitsAllowed pattern).
2548 let root_arc = self.get_root().unwrap();
2549 let result = Self::insert_recursive(
2550 &root_arc,
2551 key,
2552 data,
2553 lsn,
2554 self.max_entries_per_node,
2555 self.key_comparator.as_ref(),
2556 self.key_prefixing,
2557 )?;
2558
2559 // Update the memory counter for new inserts.
2560 // IN.updateMemorySize(delta) → MemoryBudget.updateTreeMemoryUsage(delta).
2561 // LN_OVERHEAD = 48 bytes (approximate fixed overhead per entry).
2562 if result && let Some(counter) = &self.memory_counter {
2563 let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
2564 counter.fetch_add(delta, Ordering::Relaxed);
2565 }
2566
2567 Ok(result)
2568 }
2569
2570 /// Recovery-redo variant of [`Tree::insert`] that accepts `&[u8]` slices.
2571 ///
2572 /// Eliminates the two intermediate `Vec<u8>` allocations that the normal
2573 /// insert path requires at the `redo_ln` call site (one for the key, one
2574 /// for the data). The compressed key suffix and the data bytes are each
2575 /// materialised into their `BinEntry` slots exactly once.
2576 ///
2577 /// Semantics are identical to `insert`:
2578 /// - Updates the existing slot when the key is already present.
2579 /// - Inserts a new sorted entry when the key is absent.
2580 /// - Triggers the same root-split and proactive-split logic.
2581 ///
2582 /// `data` should be the raw value bytes, or an empty slice for a
2583 /// deletion (which should not normally arrive here during redo, but is
2584 /// handled gracefully).
2585 ///
2586 /// Wave 11-K optimisation (Fix 1).
2587 pub fn redo_insert(
2588 &self,
2589 key: &[u8],
2590 data: &[u8],
2591 lsn: Lsn,
2592 ) -> Result<bool, TreeError> {
2593 let key_len = key.len();
2594 let data_len = data.len();
2595 let data_opt: Option<&[u8]> =
2596 if data.is_empty() { None } else { Some(data) };
2597
2598 // First-key path: initialise a two-level tree from scratch.
2599 {
2600 let mut root_guard = self.root.write();
2601 if root_guard.is_none() {
2602 // Pre-allocate the BIN's entries Vec using the redo capacity
2603 // hint (Fix 3). Without the hint the first BIN starts at
2604 // capacity 1 and doubles on each insert; with the hint it
2605 // starts at min(hint, max_entries) entries, eliminating
2606 // ~log2(max_entries) Vec-resize doublings.
2607 let initial_cap = if self.redo_capacity_hint > 0 {
2608 self.redo_capacity_hint.min(self.max_entries_per_node)
2609 } else {
2610 1
2611 };
2612 let mut initial_entries = Vec::with_capacity(initial_cap);
2613 initial_entries.push(BinEntry {
2614 key: key.to_vec(),
2615 lsn,
2616 data: data_opt.map(|d| d.to_vec()),
2617 known_deleted: false,
2618 dirty: false,
2619 expiration_time: 0,
2620 });
2621 let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
2622 node_id: generate_node_id(),
2623 level: BIN_LEVEL,
2624 entries: initial_entries,
2625 key_prefix: Vec::new(),
2626 dirty: true,
2627 is_delta: false,
2628 last_full_lsn: NULL_LSN,
2629 last_delta_lsn: NULL_LSN,
2630 generation: 0,
2631 parent: None,
2632 // St-H6: use true to match the engine-wide hours-only
2633 // invariant (JE BIN.java default; matches tree.rs:980).
2634 expiration_in_hours: true,
2635 cursor_count: 0,
2636 prohibit_next_delta: false,
2637 })));
2638
2639 let root_arc =
2640 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
2641 node_id: generate_node_id(),
2642 level: MAIN_LEVEL | 2,
2643 entries: vec![InEntry {
2644 key: vec![],
2645 lsn,
2646 child: Some(bin.clone()),
2647 }],
2648 dirty: true,
2649 generation: 0,
2650 parent: None,
2651 })));
2652
2653 {
2654 let mut g = bin.write();
2655 g.set_parent(Some(Arc::downgrade(&root_arc)));
2656 }
2657
2658 *root_guard = Some(root_arc);
2659
2660 if let Some(counter) = &self.memory_counter {
2661 let delta =
2662 (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
2663 counter.fetch_add(delta, Ordering::Relaxed);
2664 }
2665 return Ok(true);
2666 }
2667 }
2668
2669 self.split_root_if_needed(lsn)?;
2670
2671 let root_arc = self.get_root().unwrap();
2672 let result = Self::redo_insert_recursive(
2673 &root_arc,
2674 key,
2675 data_opt,
2676 lsn,
2677 self.max_entries_per_node,
2678 self.key_comparator.as_ref(),
2679 self.key_prefixing,
2680 )?;
2681
2682 if result && let Some(counter) = &self.memory_counter {
2683 let delta = (key_len + data_len + BIN_ENTRY_OVERHEAD) as i64;
2684 counter.fetch_add(delta, Ordering::Relaxed);
2685 }
2686
2687 Ok(result)
2688 }
2689
2690 /// Splits the root node if it is full (needsSplitting).
2691 ///
2692 ///
2693 /// ```text
2694 /// 1. Save oldRoot (the current root IN or BIN).
2695 /// 2. Create newRoot at oldRoot.level + 1.
2696 /// 3. Insert oldRoot into newRoot at slot 0 with a virtual (empty) key.
2697 /// 4. Call split_node on oldRoot, passing newRoot as parent.
2698 /// 5. Replace tree root with newRoot.
2699 /// ```
2700 fn split_root_if_needed(&self, lsn: Lsn) -> Result<(), TreeError> {
2701 // Hold `self.root.write()` across the needs_split check and the
2702 // root promotion, mirroring the first-key path fix and matching
2703 // the broader insert/split serialisation discipline.
2704 //
2705 // With the previous read-then-write pattern, two concurrent
2706 // splitters could each observe needs_split == true, then take()
2707 // and install in turn, with the second wrapping the first's
2708 // already-promoted root in its own new IN. Each level wraps the
2709 // previous, producing a chain of one-child internal nodes. No
2710 // data is lost (every entry is still reachable) but the tree
2711 // becomes unnecessarily deep, and the imbalance can compound
2712 // under heavy concurrent insertion.
2713 let mut root_guard = self.root.write();
2714 let needs_split = match root_guard.as_ref() {
2715 Some(arc) => {
2716 let g = arc.read();
2717 g.get_n_entries() >= self.max_entries_per_node
2718 }
2719 None => false,
2720 };
2721 if !needs_split {
2722 return Ok(());
2723 }
2724
2725 // Create a fresh new root one level above the current root.
2726 let old_root_arc = root_guard.take().expect("checked Some above");
2727 let old_root_level = {
2728 let g = old_root_arc.read();
2729 g.level()
2730 };
2731
2732 // newRoot = new IN(level = oldRoot.level + 1) with slot 0 = oldRoot.
2733 // The key at slot 0 is the virtual key (empty slice) following the
2734 // convention that entry-zero in an upper IN compares as -infinity.
2735 let new_root_arc =
2736 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
2737 node_id: generate_node_id(),
2738 level: old_root_level + 1,
2739 entries: vec![InEntry {
2740 key: vec![],
2741 lsn,
2742 child: Some(old_root_arc.clone()),
2743 }],
2744 dirty: true,
2745 generation: 0,
2746 parent: None,
2747 })));
2748
2749 // Update the old root's parent pointer to the new root.
2750 {
2751 let mut g = old_root_arc.write();
2752 g.set_parent(Some(Arc::downgrade(&new_root_arc)));
2753 }
2754
2755 // Install the new root before calling split_child so split_child
2756 // (which itself takes parent.write()) can run unencumbered.
2757 *root_guard = Some(new_root_arc.clone());
2758 drop(root_guard);
2759
2760 // Now split the old root (which is now child at slot 0 in new_root).
2761 Self::split_child(
2762 &new_root_arc,
2763 0, // child is at slot 0
2764 self.max_entries_per_node,
2765 lsn,
2766 SplitHint::Normal,
2767 &[], // no insertion key at root-init time
2768 self.key_comparator.as_ref(),
2769 self.key_prefixing,
2770 )?;
2771
2772 self.root_splits.fetch_add(1, Ordering::Relaxed);
2773 Ok(())
2774 }
2775
2776 /// Splits the child at `child_index` in `parent`.
2777 ///
2778 /// . This implementation always keeps the **left** half in the
2779 /// existing child node (`child_arc`) and puts the right half in the new
2780 /// sibling, regardless of where the `identifierKey` falls. JE's
2781 /// `IN.splitInternal` (`idKeyIndex` logic ~line 4172) can place either
2782 /// half in the existing node; Noxu's preemptive-split discipline ensures
2783 /// the parent always has a free slot at split time (the split is done on
2784 /// the way *down*, before the parent fills up), so the safe simplification
2785 /// of always using the left half is correct here — no routing information
2786 /// is lost. This comment replaces the previous incorrect claim that
2787 /// `idKeyIndex` drove the choice.
2788 ///
2789 /// Note: does not emit a split log entry; split nodes are marked dirty
2790 /// and flushed at the next checkpoint (flush_dirty_bins/upper_ins).
2791 ///
2792 /// ```text
2793 /// 1. splitIndex = child.nEntries / 2 (or 1 / n-1 for splitSpecial)
2794 /// 2. Create newSibling at the same level.
2795 /// 3. Move entries [splitIndex..nEntries) to newSibling.
2796 /// 4. Update parent slot childIndex -> child (left half),
2797 /// insert newSibling with newIdKey after childIndex.
2798 /// ```
2799 fn split_child(
2800 parent: &Arc<RwLock<TreeNode>>,
2801 child_index: usize,
2802 max_entries: usize,
2803 lsn: Lsn,
2804 hint: SplitHint,
2805 insert_key: &[u8],
2806 key_comparator: Option<&KeyComparatorFn>,
2807 key_prefixing: bool,
2808 ) -> Result<(), TreeError> {
2809 // The split is performed under `parent.write()` for the entire
2810 // duration. This is a deliberate choice for correctness:
2811 //
2812 // - Without it, between dropping `child.write()` (after installing
2813 // the left half) and acquiring `parent.write()` (to install the
2814 // sibling), a concurrent descender can pick `child_arc` from the
2815 // parent (still pointing at it), descend, take `child.write()`
2816 // and insert a key. Whether the descender's key belongs in the
2817 // left half (now in `child`) or the right half (which will be
2818 // in the new sibling) is determined by the parent's split key —
2819 // but the parent doesn't know about the split key yet, so the
2820 // descender's routing decision is based on stale data. If the
2821 // descender's key falls in the right half, it lands in `child`
2822 // (left half) where a future search will not find it: the
2823 // future search descends from the root, the parent now has the
2824 // sibling installed, the search routes the key to the sibling,
2825 // the sibling does not contain the key — silently lost.
2826 //
2827 // - Holding `parent.write()` throughout serialises split_child
2828 // against every descender that wants `parent.read()`. A
2829 // descender already holding `parent.read()` (latch coupling
2830 // from above) keeps split_child waiting at this lock until it
2831 // has finished its own work. Combined, the split + sibling
2832 // install is atomic with respect to descents.
2833 //
2834 // - Splits are infrequent compared to inserts (~ once per
2835 // max_entries new keys) so the extra serialisation here does
2836 // not dominate.
2837 //
2838 // Reproducer that exercises this race:
2839 // crates/noxu-db/tests/concurrent_commits_stress.rs.
2840 let mut parent_write_guard = parent.write();
2841
2842 // Extract the child Arc from the parent slot.
2843 let child_arc = match &*parent_write_guard {
2844 TreeNode::Internal(p) => p
2845 .entries
2846 .get(child_index)
2847 .and_then(|e| e.child.clone())
2848 .ok_or(TreeError::SplitRequired)?,
2849 TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
2850 };
2851
2852 // Gather all entries from the child plus split metadata, AND
2853 // perform the in-place left-half install, all under a single
2854 // write lock on the child. See the earlier comment on the race
2855 // this avoids inside split_child.
2856 let mut child_guard = child_arc.write();
2857 let child_level = child_guard.level();
2858 // St-H6: capture the splitting BIN's expiration_in_hours flag BEFORE
2859 // drop(child_guard) so the right-half sibling inherits it.
2860 // JE: BIN.java::setExpiration calls setExpirationInHours(hours) to
2861 // propagate the flag on split/clone; the Rust split was hardcoding
2862 // false instead of inheriting — this caused hours-granularity TTL
2863 // entries in the right sibling to be read with in_hours=false, making
2864 // the hours-since-epoch value compare as seconds-since-epoch (far in
2865 // the past) and every right-sibling TTL record appear expired.
2866 let bin_expiration_in_hours: bool = match &*child_guard {
2867 TreeNode::Bottom(b) => b.expiration_in_hours,
2868 // Internal nodes do not carry per-entry TTL; default to true
2869 // (the engine-wide invariant for any BIN that may hold TTL data).
2870 TreeNode::Internal(_) => true,
2871 };
2872 let (all_entries, bin_old_prefix) = match &*child_guard {
2873 TreeNode::Internal(n) => {
2874 (SplitEntries::Internal(n.entries.clone()), Vec::new())
2875 }
2876 TreeNode::Bottom(b) => {
2877 // Decompress to full keys.
2878 let full: Vec<BinEntry> = (0..b.entries.len())
2879 .map(|i| BinEntry {
2880 key: b.get_full_key(i).unwrap_or_default(),
2881 lsn: b.entries[i].lsn,
2882 data: b.entries[i].data.clone(),
2883 known_deleted: b.entries[i].known_deleted,
2884 dirty: b.entries[i].dirty,
2885 expiration_time: b.entries[i].expiration_time,
2886 })
2887 .collect();
2888 (SplitEntries::Bottom(full), b.key_prefix.clone())
2889 }
2890 };
2891
2892 // Determine split point — JE `IN.splitSpecial` / `IN.splitInternal`.
2893 //
2894 // Normal midpoint: `n_entries / 2`.
2895 // AllLeft: insertion key is at position 0 on every descend level.
2896 // → split_index = 1 (left half keeps n-1 entries; new right sibling
2897 // gets only the former-first slot, then the insertion fills it).
2898 // This matches JE: `if (leftSide && index == 0) splitInternal(…, 1)`.
2899 // AllRight: insertion key is at the last position on every level.
2900 // → split_index = n_entries - 1 (left half keeps all but one entry).
2901 // JE: `else if (!leftSide && index == nEntries-1) splitInternal(…, nEntries-1)`.
2902 //
2903 // Ref: `IN.java` splitSpecial ~line 4129, splitInternal ~line 4159.
2904 let n_entries = all_entries.len();
2905 let split_index = if n_entries >= 2 {
2906 // Find where insert_key falls in the child.
2907 let insert_idx = {
2908 let mut idx = 0usize;
2909 for i in 1..n_entries {
2910 let ord = match key_comparator {
2911 Some(cmp) => cmp(all_entries.get_key(i), insert_key),
2912 None => all_entries.get_key(i).cmp(insert_key),
2913 };
2914 if ord != std::cmp::Ordering::Greater {
2915 idx = i;
2916 } else {
2917 break;
2918 }
2919 }
2920 idx
2921 };
2922 match hint {
2923 SplitHint::AllLeft if insert_idx == 0 => 1,
2924 SplitHint::AllRight if insert_idx == n_entries - 1 => {
2925 n_entries - 1
2926 }
2927 _ => n_entries / 2,
2928 }
2929 } else {
2930 n_entries / 2
2931 };
2932
2933 // newIdKey — the full key of the first entry of the right half.
2934 // For BIN: entries are already full keys after decompression above.
2935 // For IN: entries carry full keys directly.
2936 let new_id_key = all_entries.get_key(split_index).to_vec();
2937 // Suppress unused-variable warning when no BIN is involved.
2938 let _ = &bin_old_prefix;
2939
2940 // Divide into left and right halves.
2941 let left_entries = all_entries.slice(0, split_index);
2942 let right_entries = all_entries.slice(split_index, n_entries);
2943
2944 // Install the left half into `child_arc` (still under the same
2945 // write lock) and mark the node dirty.
2946 match (&mut *child_guard, &left_entries) {
2947 (TreeNode::Internal(n), SplitEntries::Internal(le)) => {
2948 n.entries = le.clone();
2949 }
2950 (TreeNode::Bottom(b), SplitEntries::Bottom(le)) => {
2951 // Reset prefix; entries are full keys.
2952 b.key_prefix = Vec::new();
2953 // Pre-allocate at max_entries capacity so the left half
2954 // does not need to reallocate on the next insert (Fix 3).
2955 let mut left = Vec::with_capacity(max_entries);
2956 left.extend_from_slice(le);
2957 b.entries = left;
2958 // Recompute prefix on each half after split (only when
2959 // key_prefixing is enabled for this database).
2960 // JE: IN.computeKeyPrefix returns null when
2961 // databaseImpl.getKeyPrefixing() is false.
2962 // Ref: IN.java computeKeyPrefix ~line 2456.
2963 if key_prefixing && b.entries.len() >= 2 {
2964 b.recompute_key_prefix();
2965 }
2966 }
2967 _ => return Err(TreeError::SplitRequired),
2968 }
2969 child_guard.set_dirty(true);
2970 drop(child_guard);
2971
2972 // Create the new right-half sibling.
2973 // Parent pointer will be wired in when it is inserted into the parent.
2974 let new_sibling = match right_entries {
2975 SplitEntries::Internal(re) => {
2976 Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
2977 node_id: generate_node_id(),
2978 level: child_level,
2979 entries: re,
2980 dirty: true,
2981 generation: 0,
2982 parent: None, // set below
2983 })))
2984 }
2985 SplitEntries::Bottom(re) => {
2986 // Entries are full keys; build BinStub with no prefix then
2987 // recompute key prefix for the new sibling.
2988 // Pre-allocate at max_entries capacity so the right half
2989 // does not need to reallocate on the next insert (Fix 3).
2990 let mut right = Vec::with_capacity(max_entries);
2991 right.extend(re);
2992 let mut sibling_bin = BinStub {
2993 node_id: generate_node_id(),
2994 level: child_level,
2995 entries: right,
2996 key_prefix: Vec::new(),
2997 dirty: true,
2998 is_delta: false,
2999 last_full_lsn: NULL_LSN,
3000 last_delta_lsn: NULL_LSN,
3001 generation: 0,
3002 parent: None, // set below
3003 // St-H6 fix: inherit the splitting BIN's flag so that
3004 // is_expired() uses the correct granularity for entries
3005 // that were already in the BIN before the split.
3006 // JE reference: BIN.java::split() propagates
3007 // expirationInHours via setExpirationInHours(hours).
3008 expiration_in_hours: bin_expiration_in_hours,
3009 cursor_count: 0,
3010 prohibit_next_delta: false,
3011 };
3012 // St-H6 debug guard: the sibling must carry the same flag as
3013 // the splitting BIN so that in_hours-resolution entries are
3014 // never silently expired by a mismatched false flag.
3015 debug_assert_eq!(
3016 sibling_bin.expiration_in_hours, bin_expiration_in_hours,
3017 "St-H6 invariant: sibling BIN expiration_in_hours must \
3018 match the splitting BIN (got {}, expected {})",
3019 sibling_bin.expiration_in_hours, bin_expiration_in_hours
3020 );
3021
3022 if key_prefixing && sibling_bin.entries.len() >= 2 {
3023 sibling_bin.recompute_key_prefix();
3024 }
3025 Arc::new(RwLock::new(TreeNode::Bottom(sibling_bin)))
3026 }
3027 };
3028
3029 // Note: the child (left half) was marked dirty earlier under the
3030 // same write lock that installed left_entries; no need to re-take
3031 // the write lock here.
3032
3033 // Insert the new sibling into the parent after child_index.
3034 // We already hold `parent.write()` (taken at the top of the
3035 // function); operate on it directly rather than re-acquiring.
3036 match &mut *parent_write_guard {
3037 TreeNode::Internal(p) => {
3038 let insert_pos = child_index + 1;
3039 p.entries.insert(
3040 insert_pos,
3041 InEntry {
3042 key: new_id_key,
3043 lsn,
3044 child: Some(new_sibling.clone()),
3045 },
3046 );
3047 // Parent is dirty because it gained a new entry.
3048 p.dirty = true;
3049 }
3050 TreeNode::Bottom(_) => return Err(TreeError::SplitRequired),
3051 }
3052
3053 // Wire the new sibling's parent pointer to the parent node
3054 // before releasing parent_write_guard, so a future descent that
3055 // takes parent.read() and finds the sibling immediately sees a
3056 // fully-wired parent pointer.
3057 {
3058 let mut g = new_sibling.write();
3059 g.set_parent(Some(Arc::downgrade(parent)));
3060 }
3061 drop(parent_write_guard);
3062
3063 Ok(())
3064 }
3065
3066 /// Recursive insert with preemptive splitting.
3067 ///
3068 /// Top-down traversal in `Tree.forceSplit` +
3069 /// `Tree.searchSplitsAllowed`:
3070 ///
3071 /// 1. At an upper IN: find which child slot covers `key`, split the child
3072 /// proactively if it is full (so we always have room to insert the split
3073 /// key into the parent), then recurse into the appropriate child.
3074 /// 2. At a BIN: insert the key/data directly.
3075 ///
3076 /// This implements the "preemptive splitting" strategy from the: we split
3077 /// children on the way down so we never need to walk back up.
3078 fn insert_recursive(
3079 node_arc: &Arc<RwLock<TreeNode>>,
3080 key: Vec<u8>,
3081 data: Vec<u8>,
3082 lsn: Lsn,
3083 max_entries: usize,
3084 key_comparator: Option<&KeyComparatorFn>,
3085 key_prefixing: bool,
3086 ) -> Result<bool, TreeError> {
3087 Self::insert_recursive_inner(
3088 node_arc,
3089 key,
3090 data,
3091 lsn,
3092 max_entries,
3093 key_comparator,
3094 key_prefixing,
3095 true, // all_left_so_far
3096 true, // all_right_so_far
3097 )
3098 }
3099
3100 /// Inner recursive helper that threads `allLeftSideDescent` /
3101 /// `allRightSideDescent` from `Tree.forceSplit` (JE ~line 1912).
3102 ///
3103 /// Both flags start `true` at the root and are cleared as soon as the
3104 /// descent takes a non-leftmost / non-rightmost child slot. At split
3105 /// time they are forwarded to `split_child` which uses them to pick the
3106 /// `splitSpecial` split index (JE `IN.splitSpecial` ~line 4129).
3107 #[allow(clippy::too_many_arguments)]
3108 fn insert_recursive_inner(
3109 node_arc: &Arc<RwLock<TreeNode>>,
3110 key: Vec<u8>,
3111 data: Vec<u8>,
3112 lsn: Lsn,
3113 max_entries: usize,
3114 key_comparator: Option<&KeyComparatorFn>,
3115 key_prefixing: bool,
3116 all_left_so_far: bool,
3117 all_right_so_far: bool,
3118 ) -> Result<bool, TreeError> {
3119 // Determine if this is a BIN (leaf level).
3120 //
3121 // We hold a read lock on `node_arc` (the parent of any descent we
3122 // do below) for the duration of this call, releasing it just
3123 // before returning. That achieves *latch coupling*: a concurrent
3124 // `split_child(parent, …)` that wants to reorganise our subtree
3125 // ultimately needs `parent.write()` to install the new sibling,
3126 // and that write blocks until our read lock is dropped. Without
3127 // this, the descender-vs-splitter race goes:
3128 //
3129 // T_X: at root, picks child_arc (BIN), drops root read lock.
3130 // T_Y: at root, runs split_child(root, …): takes child_arc.write(),
3131 // installs left half [E1..E5], creates sibling [E6..E10],
3132 // takes root.write() and inserts the sibling.
3133 // T_X: now takes child_arc.write() and inserts a key whose
3134 // sort order falls in the right half. The key lands in
3135 // child_arc (left half) but a future search descending
3136 // from the root routes that key to the new sibling and
3137 // does not find it — silently lost.
3138 //
3139 // Reproducer: noxu-db/tests/concurrent_commits_stress.rs
3140 // (32 threads × 100 keys, ~1–6 lost writes per run before this fix;
3141 // occasionally hundreds when an entire BIN is orphaned).
3142 let parent_guard = node_arc.read();
3143 let is_bin = parent_guard.is_bin();
3144
3145 if is_bin {
3146 // BIN: drop the read lock and take the write lock; this is
3147 // safe because the *outer* call frame still holds a read
3148 // lock on this BIN's parent (or this is the root, in which
3149 // case the first-key path has already initialised it). A
3150 // concurrent split_child(parent, …) cannot run while the
3151 // outer parent.read() is held, so the BIN cannot be
3152 // restructured between dropping our read lock and acquiring
3153 // our write lock.
3154 drop(parent_guard);
3155 let mut guard = node_arc.write();
3156 match &mut *guard {
3157 TreeNode::Bottom(bin) => {
3158 let is_new = if let Some(cmp) = key_comparator {
3159 // Comparator-based insert: no prefix compression.
3160 let (_idx, new) =
3161 bin.insert_cmp(key, lsn, Some(data), cmp.as_ref());
3162 new
3163 } else if key_prefixing {
3164 // insert_with_prefix handles prefix recomputation when
3165 // the new key shrinks the existing prefix, and also
3166 // initialises the prefix when 2 entries are present for
3167 // the first time.
3168 let (_idx, new) =
3169 bin.insert_with_prefix(key, lsn, Some(data));
3170 new
3171 } else {
3172 // key_prefixing disabled: store full key, no prefix.
3173 // JE: IN.computeKeyPrefix returns null when
3174 // databaseImpl.getKeyPrefixing() is false.
3175 // Ref: IN.java computeKeyPrefix ~line 2456.
3176 let (_idx, new) = bin.insert_raw(key, lsn, Some(data));
3177 new
3178 };
3179 // Mark dirty after any modification.
3180 bin.dirty = true;
3181 Ok(is_new)
3182 }
3183 TreeNode::Internal(_) => Err(TreeError::SplitRequired),
3184 }
3185 } else {
3186 // Upper IN: find the child slot that covers key.
3187 // Index = parent.findEntry(key, false, false)
3188 // Entry zero in an upper IN has a virtual key (-infinity), so
3189 // any real key is routed to at least slot 0.
3190 let (child_index, n_entries_at_level, child_arc) =
3191 match &*parent_guard {
3192 TreeNode::Internal(n) => {
3193 // Binary search for the largest key <= search key.
3194 // Slot 0 always matches (virtual key = -infinity).
3195 let mut idx = 0usize;
3196 for (i, entry) in n.entries.iter().enumerate() {
3197 if i == 0 {
3198 idx = 0;
3199 } else {
3200 let ord = match key_comparator {
3201 Some(cmp) => cmp(
3202 entry.key.as_slice(),
3203 key.as_slice(),
3204 ),
3205 None => {
3206 entry.key.as_slice().cmp(key.as_slice())
3207 }
3208 };
3209 if ord != std::cmp::Ordering::Greater {
3210 idx = i;
3211 } else {
3212 break;
3213 }
3214 }
3215 }
3216 let child = n
3217 .entries
3218 .get(idx)
3219 .and_then(|e| e.child.clone())
3220 .ok_or(TreeError::SplitRequired)?;
3221 (idx, n.entries.len(), child)
3222 }
3223 TreeNode::Bottom(_) => {
3224 return Err(TreeError::SplitRequired);
3225 }
3226 };
3227
3228 // Update the descent-side flags (JE `Tree.forceSplit` ~1959).
3229 // `allLeftSideDescent` ← still true only if we chose slot 0.
3230 // `allRightSideDescent` ← still true only if we chose the last slot.
3231 let all_left = all_left_so_far && child_index == 0;
3232 let all_right = all_right_so_far
3233 && child_index == n_entries_at_level.saturating_sub(1);
3234
3235 // Proactively split the child if it is full.
3236 // If (child.needsSplitting()) child.split(parent, ...)
3237 let child_full = {
3238 let g = child_arc.read();
3239 g.get_n_entries() >= max_entries
3240 };
3241
3242 if child_full {
3243 // Build the splitSpecial hint from the accumulated flags.
3244 // JE `Tree.forceSplit` ~line 2010:
3245 // if (allLeftSideDescent || allRightSideDescent)
3246 // child.splitSpecial(parent, index, grandParent,
3247 // maxTreeEntriesPerNode, key, allLeftSideDescent)
3248 let hint = match (all_left, all_right) {
3249 (true, _) => SplitHint::AllLeft,
3250 (_, true) => SplitHint::AllRight,
3251 _ => SplitHint::Normal,
3252 };
3253 // split_child(parent, …) needs parent.write(); we must
3254 // drop our parent read lock before calling it.
3255 drop(parent_guard);
3256 Self::split_child(
3257 node_arc,
3258 child_index,
3259 max_entries,
3260 lsn,
3261 hint,
3262 &key,
3263 key_comparator,
3264 key_prefixing,
3265 )?;
3266
3267 // After the split, re-find which child now covers key.
3268 // Re-enter at the top of the inner function; carry the
3269 // flags (the new topology doesn't invalidate them — we
3270 // still know the overall descent direction).
3271 return Self::insert_recursive_inner(
3272 node_arc,
3273 key,
3274 data,
3275 lsn,
3276 max_entries,
3277 key_comparator,
3278 key_prefixing,
3279 all_left_so_far,
3280 all_right_so_far,
3281 );
3282 }
3283
3284 // Descend into the child while still holding parent_guard.
3285 // The recursive call will hold child.read() before this
3286 // returns, then drop it; combined with our parent_guard,
3287 // the latch coupling chain is preserved on the way down and
3288 // unwound on the way back up.
3289 let r = Self::insert_recursive_inner(
3290 &child_arc,
3291 key,
3292 data,
3293 lsn,
3294 max_entries,
3295 key_comparator,
3296 key_prefixing,
3297 all_left,
3298 all_right,
3299 );
3300 drop(parent_guard);
3301 r
3302 }
3303 }
3304
3305 /// Slice-based variant of [`Tree::insert_recursive`] for the recovery redo path.
3306 ///
3307 /// Accepts `key: &[u8]` and `data: Option<&[u8]>` instead of owned
3308 /// `Vec<u8>` values. At the BIN leaf, calls
3309 /// [`BinStub::insert_with_prefix_slice`] which copies bytes into the
3310 /// `BinEntry` exactly once.
3311 ///
3312 /// For the comparator path (custom key comparator), falls back to
3313 /// `insert_cmp` with a one-time `to_vec()` conversion — that path is
3314 /// rare in practice (sorted-dup databases only) and is not on the
3315 /// W11 hot path.
3316 ///
3317 /// Wave 11-K optimisation (Fix 1).
3318 fn redo_insert_recursive(
3319 node_arc: &Arc<RwLock<TreeNode>>,
3320 key: &[u8],
3321 data: Option<&[u8]>,
3322 lsn: Lsn,
3323 max_entries: usize,
3324 key_comparator: Option<&KeyComparatorFn>,
3325 key_prefixing: bool,
3326 ) -> Result<bool, TreeError> {
3327 Self::redo_insert_recursive_inner(
3328 node_arc,
3329 key,
3330 data,
3331 lsn,
3332 max_entries,
3333 key_comparator,
3334 key_prefixing,
3335 true,
3336 true,
3337 )
3338 }
3339
3340 #[allow(clippy::too_many_arguments)]
3341 fn redo_insert_recursive_inner(
3342 node_arc: &Arc<RwLock<TreeNode>>,
3343 key: &[u8],
3344 data: Option<&[u8]>,
3345 lsn: Lsn,
3346 max_entries: usize,
3347 key_comparator: Option<&KeyComparatorFn>,
3348 key_prefixing: bool,
3349 all_left_so_far: bool,
3350 all_right_so_far: bool,
3351 ) -> Result<bool, TreeError> {
3352 let parent_guard = node_arc.read();
3353 let is_bin = parent_guard.is_bin();
3354
3355 if is_bin {
3356 drop(parent_guard);
3357 let mut guard = node_arc.write();
3358 match &mut *guard {
3359 TreeNode::Bottom(bin) => {
3360 // REC-F2: JE redo currency check
3361 // (RecoveryManager.redo() line ~2512/2544). A logged LN
3362 // is applied only when logrecLsn > treeLsn. If the slot
3363 // already holds an equal-or-newer LSN, skip the overwrite
3364 // so an out-of-order (older-LSN) redo cannot revert
3365 // committed data or reset the slot LSN backward. This
3366 // makes redo genuinely idempotent regardless of
3367 // redo/undo phase order. Deletes never reach this path
3368 // (redo_ln routes Delete through tree.delete), so the JE
3369 // "lsnCmp == 0 && isDeletion -> set KD" sub-case does not
3370 // apply here.
3371 let cmp_ref = key_comparator.map(|c| {
3372 c.as_ref()
3373 as &dyn Fn(&[u8], &[u8]) -> std::cmp::Ordering
3374 });
3375 if let Some(slot_lsn) =
3376 bin.redo_slot_lsn(key, cmp_ref, key_prefixing)
3377 && lsn <= slot_lsn
3378 {
3379 // Tree already holds an equal-or-newer version.
3380 return Ok(false);
3381 }
3382 let is_new = if let Some(cmp) = key_comparator {
3383 // Comparator path: fall back to owned-Vec variant.
3384 let (_idx, new) = bin.insert_cmp(
3385 key.to_vec(),
3386 lsn,
3387 data.map(|d| d.to_vec()),
3388 cmp.as_ref(),
3389 );
3390 new
3391 } else if key_prefixing {
3392 let (_idx, new) =
3393 bin.insert_with_prefix_slice(key, lsn, data);
3394 new
3395 } else {
3396 // key_prefixing disabled: store full key verbatim.
3397 // Ref: IN.java computeKeyPrefix ~line 2456.
3398 let (_idx, new) = bin.insert_raw(
3399 key.to_vec(),
3400 lsn,
3401 data.map(|d| d.to_vec()),
3402 );
3403 new
3404 };
3405 bin.dirty = true;
3406 Ok(is_new)
3407 }
3408 TreeNode::Internal(_) => Err(TreeError::SplitRequired),
3409 }
3410 } else {
3411 let (child_index, n_entries_at_level, child_arc) =
3412 match &*parent_guard {
3413 TreeNode::Internal(n) => {
3414 let mut idx = 0usize;
3415 for (i, entry) in n.entries.iter().enumerate() {
3416 if i == 0 {
3417 idx = 0;
3418 } else {
3419 let ord = match key_comparator {
3420 Some(cmp) => cmp(entry.key.as_slice(), key),
3421 None => entry.key.as_slice().cmp(key),
3422 };
3423 if ord != std::cmp::Ordering::Greater {
3424 idx = i;
3425 } else {
3426 break;
3427 }
3428 }
3429 }
3430 let child = n
3431 .entries
3432 .get(idx)
3433 .and_then(|e| e.child.clone())
3434 .ok_or(TreeError::SplitRequired)?;
3435 (idx, n.entries.len(), child)
3436 }
3437 TreeNode::Bottom(_) => {
3438 return Err(TreeError::SplitRequired);
3439 }
3440 };
3441
3442 let all_left = all_left_so_far && child_index == 0;
3443 let all_right = all_right_so_far
3444 && child_index == n_entries_at_level.saturating_sub(1);
3445
3446 let child_full = {
3447 let g = child_arc.read();
3448 g.get_n_entries() >= max_entries
3449 };
3450
3451 if child_full {
3452 let hint = match (all_left, all_right) {
3453 (true, _) => SplitHint::AllLeft,
3454 (_, true) => SplitHint::AllRight,
3455 _ => SplitHint::Normal,
3456 };
3457 drop(parent_guard);
3458 Self::split_child(
3459 node_arc,
3460 child_index,
3461 max_entries,
3462 lsn,
3463 hint,
3464 key,
3465 key_comparator,
3466 key_prefixing,
3467 )?;
3468 return Self::redo_insert_recursive_inner(
3469 node_arc,
3470 key,
3471 data,
3472 lsn,
3473 max_entries,
3474 key_comparator,
3475 key_prefixing,
3476 all_left_so_far,
3477 all_right_so_far,
3478 );
3479 }
3480
3481 let r = Self::redo_insert_recursive_inner(
3482 &child_arc,
3483 key,
3484 data,
3485 lsn,
3486 max_entries,
3487 key_comparator,
3488 key_prefixing,
3489 all_left,
3490 all_right,
3491 );
3492 drop(parent_guard);
3493 r
3494 }
3495 }
3496
3497 /// Pre-warm the tree's internal `Vec<BinEntry>` capacity before a redo
3498 /// pass that will insert approximately `n` records.
3499 ///
3500 /// If the tree is empty, this is a no-op (there is no BIN yet to reserve
3501 /// capacity on). If the tree already has a root BIN (from a previous
3502 /// checkpoint), reserves `n.min(max_entries_per_node)` additional slots
3503 /// in that BIN's entries vector, eliminating the resize-double cycle
3504 /// during the redo loop.
3505 ///
3506 /// Wave 11-K optimisation (Fix 3).
3507 pub fn reserve_redo_capacity(&self, n: usize) {
3508 if n == 0 {
3509 return;
3510 }
3511 let root = match self.get_root() {
3512 Some(r) => r,
3513 None => return,
3514 };
3515 // Descend to the leftmost BIN and reserve there.
3516 let mut arc = root;
3517 loop {
3518 let guard = arc.read();
3519 match &*guard {
3520 TreeNode::Bottom(bin_guard) => {
3521 let additional = n
3522 .min(self.max_entries_per_node)
3523 .saturating_sub(bin_guard.entries.len());
3524 drop(guard);
3525 let mut wguard = arc.write();
3526 if let TreeNode::Bottom(bin) = &mut *wguard {
3527 bin.entries.reserve(additional);
3528 }
3529 return;
3530 }
3531 TreeNode::Internal(inner) => {
3532 let child =
3533 inner.entries.first().and_then(|e| e.child.clone());
3534 drop(guard);
3535 match child {
3536 Some(c) => arc = c,
3537 None => return,
3538 }
3539 }
3540 }
3541 }
3542 }
3543
3544 /// Get the first (leftmost) BIN in the tree.
3545 ///
3546 /// Descends to the leftmost BIN by
3547 /// always following the first child slot at each upper IN level.
3548 pub fn get_first_node(&self) -> Option<SearchResult> {
3549 let mut guard: parking_lot::ArcRwLockReadGuard<
3550 parking_lot::RawRwLock,
3551 TreeNode,
3552 > = self.get_root()?.read_arc();
3553
3554 loop {
3555 if guard.is_bin() {
3556 let n = guard.get_n_entries();
3557 if n == 0 {
3558 return None;
3559 }
3560 // TREE-F1: return the first LIVE slot, skipping known_deleted
3561 // slots (CursorImpl.java:2062-2064). If the leftmost BIN is
3562 // entirely KD during the reconstitution window the cursor's
3563 // get_first falls through to its cross-BIN advance.
3564 if let TreeNode::Bottom(b) = &*guard {
3565 match (0..b.entries.len()).find(|&i| b.slot_is_live(i)) {
3566 Some(i) => {
3567 return Some(SearchResult::with_values(
3568 true, i as i32, false,
3569 ));
3570 }
3571 None => return None,
3572 }
3573 }
3574 return Some(SearchResult::with_values(true, 0, false));
3575 }
3576
3577 // Capture the leftmost child Arc while holding `guard`, then
3578 // hand-over-hand: take the child read lock before releasing
3579 // the parent's. Same race fix as `Tree::search`.
3580 let next_arc = match &*guard {
3581 TreeNode::Internal(n_node) => {
3582 n_node.entries.first().and_then(|e| e.child.clone())?
3583 }
3584 _ => return None,
3585 };
3586 let next_guard = next_arc.read_arc();
3587 drop(guard);
3588 guard = next_guard;
3589 }
3590 }
3591
3592 /// Get the last (rightmost) BIN in the tree.
3593 ///
3594 /// Descends to the rightmost BIN by
3595 /// always following the last child slot at each upper IN level.
3596 pub fn get_last_node(&self) -> Option<SearchResult> {
3597 let mut guard: parking_lot::ArcRwLockReadGuard<
3598 parking_lot::RawRwLock,
3599 TreeNode,
3600 > = self.get_root()?.read_arc();
3601
3602 loop {
3603 if guard.is_bin() {
3604 let n = guard.get_n_entries();
3605 if n == 0 {
3606 return None;
3607 }
3608 // TREE-F1: return the last LIVE slot, skipping known_deleted
3609 // slots (CursorImpl.java:2062-2064).
3610 if let TreeNode::Bottom(b) = &*guard {
3611 match (0..b.entries.len())
3612 .rev()
3613 .find(|&i| b.slot_is_live(i))
3614 {
3615 Some(i) => {
3616 return Some(SearchResult::with_values(
3617 true, i as i32, false,
3618 ));
3619 }
3620 None => return None,
3621 }
3622 }
3623 return Some(SearchResult::with_values(
3624 true,
3625 (n - 1) as i32,
3626 false,
3627 ));
3628 }
3629
3630 // Capture the rightmost child Arc while holding `guard`, then
3631 // hand-over-hand: take the child read lock before releasing
3632 // the parent's. Same race fix as `Tree::search`.
3633 let next_arc = match &*guard {
3634 TreeNode::Internal(n_node) => {
3635 n_node.entries.last().and_then(|e| e.child.clone())?
3636 }
3637 _ => return None,
3638 };
3639 let next_guard = next_arc.read_arc();
3640 drop(guard);
3641 guard = next_guard;
3642 }
3643 }
3644
3645 /// Returns the number of root splits that have occurred.
3646 pub fn get_root_splits(&self) -> u64 {
3647 self.root_splits.load(Ordering::Relaxed)
3648 }
3649
3650 /// Returns the number of relatches required.
3651 pub fn get_relatches_required(&self) -> u64 {
3652 self.relatches_required.load(Ordering::Relaxed)
3653 }
3654
3655 /// Delete a key from the tree.
3656 ///
3657 /// Traverses the tree to find the BIN that should contain the key, then
3658 /// removes the entry. Returns true if the key was found and removed.
3659 ///
3660 /// Delete path in `Tree` from the.
3661 ///
3662 /// In-memory removal only — WAL logging for deletes is handled by the
3663 /// cursor layer (`cursor_impl.rs::log_ln_write`) before this is called,
3664 /// matching separation between LN logging and tree mutation.
3665 pub fn delete(&self, key: &[u8]) -> bool {
3666 let root = match self.get_root() {
3667 Some(r) => r,
3668 None => return false,
3669 };
3670
3671 // F8 consistency: insert accounts key + data + BIN_ENTRY_OVERHEAD; delete must
3672 // subtract the SAME (data_len was previously omitted, leaking
3673 // data_len from the cache counter on every delete and biasing the
3674 // evictor's over-budget view). Peek the data length before deleting.
3675 let data_len = if self.memory_counter.is_some() {
3676 self.search_with_data(key)
3677 .filter(|sf| sf.found)
3678 .and_then(|sf| sf.data.as_ref().map(|d| d.len()))
3679 .unwrap_or(0)
3680 } else {
3681 0
3682 };
3683
3684 let deleted =
3685 Self::delete_recursive(&root, key, self.key_comparator.as_ref());
3686
3687 // Update the memory counter when an entry is removed.
3688 // IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage(-delta).
3689 if deleted && let Some(counter) = &self.memory_counter {
3690 let delta = (key.len() + data_len + BIN_ENTRY_OVERHEAD) as i64;
3691 counter.fetch_sub(delta, Ordering::Relaxed);
3692 }
3693
3694 deleted
3695 }
3696
3697 /// Recursive helper for `delete`: descend to the BIN that holds `key`
3698 /// and remove it.
3699 fn delete_recursive(
3700 node_arc: &Arc<RwLock<TreeNode>>,
3701 key: &[u8],
3702 key_comparator: Option<&KeyComparatorFn>,
3703 ) -> bool {
3704 // Latch coupling, mirroring `insert_recursive`. Without this,
3705 // delete has the same "BIN split out from under us" race: thread
3706 // A finds child_arc as the target BIN under parent.read(), drops
3707 // the lock, and another thread runs split_child(parent, …) that
3708 // moves the target key into the new sibling. A then takes
3709 // child_arc.write(), looks for the key in the (now left-half)
3710 // BIN, doesn't find it, and returns `false`. The caller treats
3711 // the `false` as "key was not present", but the key is actually
3712 // still in the tree (in the sibling). Subsequent operations
3713 // observe a stale record that should have been deleted —
3714 // semantically a lost delete.
3715 let parent_guard = node_arc.read();
3716 let is_bin = parent_guard.is_bin();
3717 let child_arc = if !is_bin {
3718 match &*parent_guard {
3719 TreeNode::Internal(n) => {
3720 // Find child slot with largest key <= search key
3721 let mut idx = 0usize;
3722 for (i, entry) in n.entries.iter().enumerate() {
3723 if i == 0 {
3724 idx = 0;
3725 } else {
3726 let ord = match key_comparator {
3727 Some(cmp) => cmp(entry.key.as_slice(), key),
3728 None => entry.key.as_slice().cmp(key),
3729 };
3730 if ord != std::cmp::Ordering::Greater {
3731 idx = i;
3732 } else {
3733 break;
3734 }
3735 }
3736 }
3737 n.entries.get(idx).and_then(|e| e.child.clone())
3738 }
3739 _ => None,
3740 }
3741 } else {
3742 None
3743 };
3744
3745 if is_bin {
3746 // Drop the read lock before taking the write lock; the outer
3747 // call frame still holds the parent read lock so a concurrent
3748 // split_child cannot run on this BIN's parent until we unwind.
3749 drop(parent_guard);
3750 let mut g = node_arc.write();
3751 match &mut *g {
3752 TreeNode::Bottom(bin) => {
3753 if let Some(cmp) = key_comparator {
3754 bin.delete_cmp(key, cmp.as_ref())
3755 } else {
3756 // Entries store compressed (suffix) keys when key_prefix
3757 // is non-empty. Compress the search key before comparing.
3758 //
3759 // The caller is not required to ensure that `key`
3760 // shares this BIN's learned `key_prefix` — a stray
3761 // delete of a key that was never present (or that
3762 // sits under a different prefix) is legal and must
3763 // simply return `false`. Calling `compress_key`
3764 // unconditionally would `debug_assert!`-panic on
3765 // such inputs, so guard it the same way the cursor
3766 // path does.
3767 if !bin.key_prefix.is_empty()
3768 && !key.starts_with(bin.key_prefix.as_slice())
3769 {
3770 return false;
3771 }
3772 let suffix = bin.compress_key(key);
3773 match bin.entries.binary_search_by(|e| {
3774 e.key.as_slice().cmp(suffix.as_slice())
3775 }) {
3776 Ok(idx) => {
3777 bin.entries.remove(idx);
3778 // Mark dirty after any modification.
3779 bin.dirty = true;
3780 true
3781 }
3782 Err(_) => false,
3783 }
3784 }
3785 }
3786 _ => false,
3787 }
3788 } else {
3789 // Descend with parent_guard still held; the recursion will
3790 // hold its own read lock and drop ours after it returns.
3791 let r = match child_arc {
3792 Some(child) => {
3793 Self::delete_recursive(&child, key, key_comparator)
3794 }
3795 None => false,
3796 };
3797 drop(parent_guard);
3798 r
3799 }
3800 }
3801
3802 // ========================================================================
3803 // B-tree Merge / Compress
3804 // ========================================================================
3805
3806 /// Merge under-full sibling BIN pairs and remove empty subtrees.
3807 ///
3808 /// `INCompressor` / `Tree.compressInternal()` logic.
3809 ///
3810 /// merges two adjacent siblings when their combined entry count is
3811 /// ≤ `max_entries_per_node` (the merge threshold equal to the node
3812 /// capacity). The left sibling's entries are prepended into the right
3813 /// sibling; the parent key slot pointing at the left sibling is then
3814 /// removed from the parent IN with `deleteEntry`. If the parent IN
3815 /// becomes empty after the removal the process repeats recursively up
3816 /// the tree.
3817 ///
3818 /// This implementation performs a single post-order walk so that each
3819 /// level is compressed after all its children have been compressed.
3820 pub fn compress(&self) {
3821 let root = match self.get_root() {
3822 Some(r) => r,
3823 None => return,
3824 };
3825 Self::compress_node(&root, self.max_entries_per_node);
3826 }
3827
3828 /// Recursive post-order compress helper.
3829 ///
3830 /// Visits children first (post-order), then scans adjacent child
3831 /// pairs in the current IN and merges them when the merge condition
3832 /// holds: `left.n_entries + right.n_entries <= max_entries`.
3833 ///
3834 /// After merging, the parent entry for the left sibling is deleted.
3835 /// The loop restarts after each merge so that newly under-full pairs
3836 /// created by previous merges are also considered.
3837 fn compress_node(node_arc: &Arc<RwLock<TreeNode>>, max_entries: usize) {
3838 // Collect child arcs to recurse without holding the node lock.
3839 let children: Vec<Arc<RwLock<TreeNode>>> = {
3840 let g = node_arc.read();
3841 match &*g {
3842 TreeNode::Internal(n) => {
3843 n.entries.iter().filter_map(|e| e.child.clone()).collect()
3844 }
3845 // BINs are leaves; nothing to compress at this level.
3846 TreeNode::Bottom(_) => return,
3847 }
3848 };
3849
3850 // Post-order: recurse into every child before working on this level.
3851 for child in &children {
3852 Self::compress_node(child, max_entries);
3853 }
3854
3855 // Compress the current IN level: merge adjacent under-full children.
3856 // Repeat until a full pass produces no merges.
3857 loop {
3858 let n_entries = {
3859 let g = node_arc.read();
3860 g.get_n_entries()
3861 };
3862
3863 let mut merged_any = false;
3864
3865 // `i` is the index of the *left* candidate; right is at `i+1`.
3866 let mut i = 0usize;
3867 while i + 1 < n_entries {
3868 // Fetch left and right child arcs.
3869 let (left_arc, right_arc) = {
3870 let g = node_arc.read();
3871 match &*g {
3872 TreeNode::Internal(p) => {
3873 let l =
3874 p.entries.get(i).and_then(|e| e.child.clone());
3875 let r = p
3876 .entries
3877 .get(i + 1)
3878 .and_then(|e| e.child.clone());
3879 match (l, r) {
3880 (Some(l), Some(r)) => (l, r),
3881 _ => {
3882 i += 1;
3883 continue;
3884 }
3885 }
3886 }
3887 TreeNode::Bottom(_) => return,
3888 }
3889 };
3890
3891 let left_n = { left_arc.read().get_n_entries() };
3892 let right_n = { right_arc.read().get_n_entries() };
3893
3894 // merge condition: combined count fits within one node.
3895 if left_n + right_n > max_entries {
3896 i += 1;
3897 continue;
3898 }
3899
3900 // Determine node kind from left child.
3901 let left_is_bin = { left_arc.read().is_bin() };
3902
3903 if left_is_bin {
3904 // BIN merge: decompress left entries to full keys, then
3905 // prepend into right BIN (also decompressed), and finally
3906 // recompute the merged BIN's prefix.
3907 // merge left into right, then
3908 // recalcKeyPrefix on the merged node.
3909 let left_full_entries: Vec<BinEntry> = {
3910 {
3911 let g = left_arc.read();
3912 match &*g {
3913 TreeNode::Bottom(b) => (0..b.entries.len())
3914 .map(|j| BinEntry {
3915 key: b
3916 .get_full_key(j)
3917 .unwrap_or_default(),
3918 lsn: b.entries[j].lsn,
3919 data: b.entries[j].data.clone(),
3920 known_deleted: b.entries[j]
3921 .known_deleted,
3922 dirty: b.entries[j].dirty,
3923 expiration_time: b.entries[j]
3924 .expiration_time,
3925 })
3926 .collect(),
3927 _ => {
3928 i += 1;
3929 continue;
3930 }
3931 }
3932 }
3933 };
3934 {
3935 {
3936 let mut g = right_arc.write();
3937 match &mut *g {
3938 TreeNode::Bottom(rb) => {
3939 // Decompress right entries to full keys.
3940 let right_full: Vec<BinEntry> = (0..rb
3941 .entries
3942 .len())
3943 .map(|j| BinEntry {
3944 key: rb
3945 .get_full_key(j)
3946 .unwrap_or_default(),
3947 lsn: rb.entries[j].lsn,
3948 data: rb.entries[j].data.clone(),
3949 known_deleted: rb.entries[j]
3950 .known_deleted,
3951 dirty: rb.entries[j].dirty,
3952 expiration_time: rb.entries[j]
3953 .expiration_time,
3954 })
3955 .collect();
3956 // Left entries are all smaller; prepend.
3957 let mut combined = left_full_entries;
3958 combined.extend(right_full);
3959 // Reset prefix and assign full keys.
3960 rb.key_prefix = Vec::new();
3961 rb.entries = combined;
3962 // Recompute prefix on merged BIN.
3963 if rb.entries.len() >= 2 {
3964 rb.recompute_key_prefix();
3965 }
3966 rb.dirty = true;
3967 }
3968 _ => {
3969 i += 1;
3970 continue;
3971 }
3972 }
3973 }
3974 }
3975 // Clear the now-merged left BIN.
3976 {
3977 let mut g = left_arc.write();
3978 if let TreeNode::Bottom(lb) = &mut *g {
3979 lb.entries.clear();
3980 lb.key_prefix = Vec::new();
3981 lb.dirty = true;
3982 }
3983 }
3984 } else {
3985 // Upper-IN merge: prepend left's InEntries into right.
3986 let left_in_entries: Vec<InEntry> = {
3987 {
3988 let g = left_arc.read();
3989 match &*g {
3990 TreeNode::Internal(n) => n.entries.clone(),
3991 _ => {
3992 i += 1;
3993 continue;
3994 }
3995 }
3996 }
3997 };
3998 {
3999 {
4000 let mut g = right_arc.write();
4001 match &mut *g {
4002 TreeNode::Internal(rn) => {
4003 let mut combined = left_in_entries.clone();
4004 combined.append(&mut rn.entries);
4005 rn.entries = combined;
4006 rn.dirty = true;
4007 }
4008 _ => {
4009 i += 1;
4010 continue;
4011 }
4012 }
4013 }
4014 }
4015 // Update parent pointers for moved children.
4016 for entry in &left_in_entries {
4017 if let Some(child) = &entry.child {
4018 let mut cg = child.write();
4019 cg.set_parent(Some(Arc::downgrade(&right_arc)));
4020 }
4021 }
4022 // Clear the now-merged left IN.
4023 {
4024 let mut g = left_arc.write();
4025 if let TreeNode::Internal(ln) = &mut *g {
4026 ln.entries.clear();
4027 ln.dirty = true;
4028 }
4029 }
4030 }
4031
4032 // Remove the right sibling's parent slot and update
4033 // the left slot to point at the merged right child.
4034 //
4035 // We keep the LEFT slot's key (which is the correct minimum for
4036 // the merged BIN's range) and remove the RIGHT slot (i+1).
4037 // This avoids having to update the parent key when i == 0.
4038 {
4039 {
4040 let mut g = node_arc.write();
4041 match &mut *g {
4042 TreeNode::Internal(p) => {
4043 // Update left slot (i) to point at right_arc
4044 // (which now contains the merged entries).
4045 if let Some(slot) = p.entries.get_mut(i) {
4046 slot.child = Some(right_arc.clone());
4047 }
4048 // Remove right slot (i+1) — it is now redundant.
4049 p.entries.remove(i + 1);
4050 p.dirty = true;
4051 }
4052 TreeNode::Bottom(_) => return,
4053 }
4054 }
4055 }
4056
4057 merged_any = true;
4058 // Advance i to check the merged BIN against its new right
4059 // sibling (the old slot i+2 is now at i+1).
4060 i += 1;
4061 let updated_n = { node_arc.read().get_n_entries() };
4062 if i + 1 >= updated_n {
4063 break;
4064 }
4065 }
4066
4067 if !merged_any {
4068 break;
4069 }
4070 }
4071 }
4072
4073 // ========================================================================
4074 // BIN slot compression
4075 // ========================================================================
4076
4077 /// Compress deleted slots from a BIN node, then prune it from its parent
4078 /// IN when it becomes empty.
4079 ///
4080 /// (the in-place slot-removal
4081 /// path, NOT the sibling-merge path handled by `compress()`).
4082 ///
4083 /// # Algorithm
4084 ///
4085 /// 1. If the BIN is a delta, skip — deltas cannot be compressed.
4086 /// 2. Remove all slots where `entry.known_deleted` is true. This mirrors
4087 /// `bin.compress(!bin.shouldLogDelta(), localTracker)`.
4088 /// 3. If the BIN is now empty, remove it from its parent IN. This mirrors
4089 /// `pruneBIN(db, binRef, idKey)` → `tree.delete(idKey)`.
4090 ///
4091 /// # Arguments
4092 ///
4093 /// * `bin_arc` — the BIN to compress (must be a `TreeNode::Bottom`).
4094 ///
4095 /// # Returns
4096 ///
4097 /// `true` if compression made progress (slots were removed or the BIN was
4098 /// pruned), `false` if the BIN was skipped (delta, no cursors issue, etc.).
4099 pub fn compress_bin(&self, bin_arc: &Arc<RwLock<TreeNode>>) -> bool {
4100 // ---- Step 1: collect metadata without holding the write lock ----
4101 let (is_delta, n_entries, id_key) = {
4102 {
4103 let g = bin_arc.read();
4104 match &*g {
4105 TreeNode::Bottom(b) => {
4106 // Identifier key = first full key in the BIN
4107 // (the: bin.getIdentifierKey()).
4108 let id_key = b.get_full_key(0);
4109 (b.is_delta, b.entries.len(), id_key)
4110 }
4111 _ => return false, // not a BIN
4112 }
4113 }
4114 };
4115
4116 // If (bin.isBINDelta()) return; — deltas cannot be compressed.
4117 if is_delta {
4118 return false;
4119 }
4120
4121 // ---- Step 2: remove known-deleted slots) ----
4122 // We compress dirty slots too (compress_dirty_slots = true) because
4123 // we are not writing a BIN-delta here.
4124 let removed_any = {
4125 {
4126 let mut g = bin_arc.write();
4127 match &mut *g {
4128 TreeNode::Bottom(b) => {
4129 let before = b.entries.len();
4130 // BIN.compress(): walk backwards to remove
4131 // deleted slots without index confusion.
4132 //
4133 // ponytail: IC-3 — we remove `known_deleted` slots
4134 // without consulting the lock manager's per-record
4135 // write-lock state (JE BIN.compress inspects the
4136 // cursor/lock state). The lock manager lives in a
4137 // DIFFERENT crate (noxu-txn); the tree layer has no
4138 // access to it, so a cross-crate write-lock check is
4139 // out of scope here. This is SAFE in the current
4140 // design because the only slots that reach here with
4141 // `known_deleted == true` are committed deletes:
4142 // * the dbi write path (cursor_impl.rs delete())
4143 // PHYSICALLY removes the slot via tree.delete()
4144 // while holding the txn write lock — it never
4145 // leaves a write-locked `known_deleted` tombstone
4146 // in a BinStub; and
4147 // * the only writer of BinStub.known_deleted == true
4148 // is BIN-delta / recovery replay, which only
4149 // replays already-committed deletes.
4150 // The compressor daemon
4151 // (environment_impl.rs: collect_bins_with_known_deleted
4152 // → compress_bin) therefore only ever sees committed
4153 // (unlocked) defunct slots. See
4154 // docs/src/operations/known-limitations.md (IC-3) for
4155 // the upgrade path if a future write path ever leaves
4156 // an uncommitted write-locked tombstone in a BinStub.
4157 let mut j = b.entries.len();
4158 while j > 0 {
4159 j -= 1;
4160 if b.entries[j].known_deleted {
4161 // JE `IN.deleteEntry` (IN.java:3466): removing a
4162 // DIRTY slot must prohibit the next delta — a
4163 // delta only carries dirty slots, so the removal
4164 // would otherwise be silently lost. Force a
4165 // full BIN on the next log.
4166 if b.entries[j].dirty {
4167 b.prohibit_next_delta = true;
4168 }
4169 b.entries.remove(j);
4170 b.dirty = true;
4171 }
4172 }
4173 // Recompute prefix after slot removal, since the
4174 // remaining keys may share a longer common prefix.
4175 // After compress(), call recalcKeyPrefix().
4176 if b.entries.len() >= 2 {
4177 b.recompute_key_prefix();
4178 } else if b.entries.len() < 2 {
4179 b.key_prefix = Vec::new();
4180 }
4181 b.entries.len() < before
4182 }
4183 _ => false,
4184 }
4185 }
4186 };
4187
4188 // ---- Step 3: prune empty BIN from parent ----
4189 // If (empty) pruneBIN(db, binRef, idKey) → tree.delete(idKey).
4190 // We only prune when the BIN is actually empty after compression.
4191 let now_empty = { bin_arc.read().get_n_entries() == 0 };
4192
4193 if now_empty {
4194 // pruneBIN re-descends to the SPECIFIC empty BIN and removes its
4195 // parent-IN slot ONLY IF the BIN is still empty (and has no
4196 // cursors and is not a delta) UNDER THE PARENT LATCH.
4197 //
4198 // We must NOT use `self.delete(&id_key)` here (IC-1): that
4199 // re-descends by key and removes whatever live entry now matches
4200 // `id_key`. Between reading `now_empty` (a fresh read lock taken
4201 // after the compression write lock was dropped) and acting on it,
4202 // a concurrent insert can repopulate this BIN; `self.delete` would
4203 // then drop a LIVE entry — tree corruption / lost write.
4204 //
4205 // JE `INCompressor.pruneBIN` (INCompressor.java ~line 502-510)
4206 // calls `tree.delete(idKey)`, and JE `Tree.delete` /
4207 // `searchDeletableSubTree` (Tree.java ~line 755-800) re-validates
4208 // `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort) and
4209 // `bin.nCursors() > 0` → CURSORS_EXIST (abort) while holding the
4210 // parent (branch) latch. `prune_empty_bin` reproduces exactly
4211 // that re-validation. See `prune_empty_bin` below.
4212 //
4213 // Note: we only attempt the prune if n_entries was > 0 before
4214 // compression (an already-empty BIN we never populated is left
4215 // alone, matching the pre-existing guard).
4216 if let Some(key) = id_key
4217 && n_entries > 0
4218 {
4219 self.prune_empty_bin(&key);
4220 }
4221 return true;
4222 }
4223
4224 removed_any
4225 }
4226
4227 /// Re-descend to the leaf BIN that should contain `id_key` and remove its
4228 /// parent-IN child slot ONLY IF the BIN is still safe to prune.
4229 ///
4230 /// This is the faithful port of JE `Tree.delete(idKey)` /
4231 /// `Tree.searchDeletableSubTree` (Tree.java ~line 755-800) as invoked by
4232 /// `INCompressor.pruneBIN` (INCompressor.java ~line 502-510). JE takes the
4233 /// branch-parent latch, re-descends to the specific empty BIN, and aborts
4234 /// the prune (removing NOTHING) if any of the following changed since the
4235 /// compressor observed the BIN as empty:
4236 ///
4237 /// * `bin.getNEntries() != 0` → `NodeNotEmptyException` (a concurrent
4238 /// insert repopulated the BIN — IC-1: we must NOT delete a live entry).
4239 /// * `bin.isBINDelta()` → `unexpectedState` (deltas are never empty).
4240 /// * `bin.nCursors() > 0` → `CursorsExistException` (a cursor is parked
4241 /// on the empty BIN; requeue rather than orphan the cursor).
4242 ///
4243 /// The re-check and the slot removal both happen while holding the
4244 /// **parent IN write latch**. Holding the parent write latch blocks every
4245 /// descender (insert / delete take `parent.read()` hand-over-hand), so a
4246 /// concurrent insert cannot reach the BIN between our re-check and the
4247 /// slot removal — the TOCTOU window IC-1 describes is closed.
4248 ///
4249 /// Returns `true` iff a parent-IN slot was removed, `false` otherwise
4250 /// (BIN repopulated, has a cursor, is a delta, vanished, or is the root —
4251 /// in every `false` case NOTHING is removed).
4252 pub fn prune_empty_bin(&self, id_key: &[u8]) -> bool {
4253 let root = match self.get_root() {
4254 Some(r) => r,
4255 None => return false,
4256 };
4257
4258 // If the root itself is the BIN (single-BIN tree) there is no parent
4259 // IN to remove a slot from. JE's searchDeletableSubTree returns null
4260 // ("the entire tree is empty") and keeps the root BIN; we do the same.
4261 if root.read().is_bin() {
4262 return false;
4263 }
4264
4265 // Descend by id_key tracking the IN that is the *parent of the leaf
4266 // BIN* and the child index within it. Hand-over-hand read coupling
4267 // keeps the descent consistent with concurrent splits, exactly like
4268 // `get_parent_bin_for_child_ln`.
4269 let (parent_arc, child_index) = {
4270 let mut parent_arc: Arc<RwLock<TreeNode>> = root.clone();
4271 let mut guard: parking_lot::ArcRwLockReadGuard<
4272 parking_lot::RawRwLock,
4273 TreeNode,
4274 > = root.read_arc();
4275 loop {
4276 let (next_arc, idx) = match &*guard {
4277 TreeNode::Internal(n) => {
4278 if n.entries.is_empty() {
4279 return false;
4280 }
4281 let idx = self.upper_in_floor_index(&n.entries, id_key);
4282 match n.entries.get(idx).and_then(|e| e.child.clone()) {
4283 Some(c) => (c, idx),
4284 None => return false,
4285 }
4286 }
4287 TreeNode::Bottom(_) => {
4288 unreachable!("is_bin checked before / below")
4289 }
4290 };
4291 // Is the next node the leaf BIN? If so, `guard`'s node is the
4292 // parent IN we want and `idx` is the child slot.
4293 if next_arc.read().is_bin() {
4294 drop(guard);
4295 break (parent_arc, idx);
4296 }
4297 let next_guard = next_arc.read_arc();
4298 drop(guard);
4299 parent_arc = next_arc;
4300 guard = next_guard;
4301 }
4302 };
4303
4304 // ---- Re-validate and remove the slot UNDER THE PARENT WRITE LATCH ----
4305 // Holding parent.write() excludes all descenders (they need
4306 // parent.read()), so the BIN cannot be repopulated between the
4307 // re-check and the slot removal.
4308 let mut parent_guard = parent_arc.write();
4309 let pruned_bin_id;
4310 let removed_key_len = match &mut *parent_guard {
4311 TreeNode::Internal(p) => {
4312 let child = match p.entries.get(child_index) {
4313 Some(e) => match &e.child {
4314 Some(c) => c.clone(),
4315 None => return false, // slot already vacated
4316 },
4317 None => return false, // slot index no longer valid
4318 };
4319 // Re-validate the child BIN under the parent latch.
4320 {
4321 let cg = child.read();
4322 match &*cg {
4323 TreeNode::Bottom(b) => {
4324 // JE: bin.getNEntries() != 0 → NODE_NOT_EMPTY (abort).
4325 if !b.entries.is_empty() {
4326 return false;
4327 }
4328 // JE: bin.isBINDelta() → unexpectedState (abort).
4329 if b.is_delta {
4330 return false;
4331 }
4332 // JE: bin.nCursors() > 0 → CURSORS_EXIST (abort).
4333 if b.cursor_count > 0 {
4334 return false;
4335 }
4336 pruned_bin_id = b.node_id;
4337 }
4338 // A concurrent split could in principle have replaced
4339 // the child with an IN; never prune in that case.
4340 TreeNode::Internal(_) => return false,
4341 }
4342 }
4343 // Safe to prune: remove the BIN's slot from the parent IN.
4344 // Mirrors the parent-slot removal `Tree.delete` performs for
4345 // an empty BIN (Tree.java deleteEntry under the branch latch).
4346 let removed = p.entries.remove(child_index);
4347 p.dirty = true;
4348 removed.key.len()
4349 }
4350 TreeNode::Bottom(_) => return false,
4351 };
4352 drop(parent_guard);
4353
4354 // JE: removing the BIN slot detaches the BIN from the tree; the
4355 // evictor must drop it from its LRU lists (Evictor.remove).
4356 self.note_removed(pruned_bin_id);
4357
4358 // Preserve the memory-counter bookkeeping that `self.delete` performed
4359 // (IN.updateMemorySize(-delta) → MemoryBudget.updateTreeMemoryUsage).
4360 // The pruned slot's key plus the fixed per-entry overhead matches the
4361 // `delete` accounting (key.len() + BIN_ENTRY_OVERHEAD).
4362 if let Some(counter) = &self.memory_counter {
4363 let delta = (removed_key_len + BIN_ENTRY_OVERHEAD) as i64;
4364 counter.fetch_sub(delta, Ordering::Relaxed);
4365 }
4366
4367 true
4368 }
4369
4370 /// Detach the resident child node `node_id` from its parent IN, dropping
4371 /// the strong `Arc` so the node is actually freed from memory, and return
4372 /// the heap bytes reclaimed (0 if not found / not detachable).
4373 ///
4374 /// This is the faithful port of JE `IN.detachNode(idx, updateLsn, newLsn)`
4375 /// (IN.java ~4019) as called from `Evictor.evict` (Evictor.java ~3035):
4376 /// `evict` measures `target.getBudgetedMemorySize()` and then
4377 /// `parent.detachNode(index, ...)` does `setTarget(idx, null)` to drop the
4378 /// child reference and `getInMemoryINs().remove(child)` to drop it from
4379 /// the INList.
4380 ///
4381 /// EV-13: before this method existed, the evictor credited
4382 /// `node_size_fn(node_id)` bytes back to the budget and removed the node
4383 /// from the LRU lists, but the parent's `InEntry.child` still held a
4384 /// strong `Arc` — so the node was never dropped from the heap. The budget
4385 /// over-credited (claimed bytes freed that were not), `cache_usage`
4386 /// drifted below reality, and the evictor under-fired. Detaching here
4387 /// drops the `Arc` for real and credits exactly the measured size.
4388 ///
4389 /// The detach happens **under the parent IN write latch** (JE detaches
4390 /// under the parent's latch), so no concurrent descender can re-cache the
4391 /// child between measurement and detach. The slot (key + LSN) is kept —
4392 /// only the in-memory `child` target is cleared — matching JE's
4393 /// `setTarget(idx, null)` which leaves the `ChildReference` LSN intact so
4394 /// the node can be re-fetched from the log later.
4395 ///
4396 /// Returns `0` if the node is not a resident child of any IN (e.g. it is
4397 /// the root, already detached, or was pinned and could not be latched).
4398 pub fn detach_node_by_id(&self, node_id: u64) -> u64 {
4399 let root = match self.get_root() {
4400 Some(r) => r,
4401 None => return 0,
4402 };
4403
4404 // The root has no parent IN to detach from (JE evicts the root via a
4405 // separate evictRoot path; we keep the root resident here).
4406 let root_id = {
4407 let g = root.read();
4408 match &*g {
4409 TreeNode::Internal(n) => n.node_id,
4410 TreeNode::Bottom(b) => b.node_id,
4411 }
4412 };
4413 if root_id == node_id {
4414 return 0;
4415 }
4416
4417 // Locate the parent IN and the child slot index.
4418 let (parent_arc, child_index) =
4419 match Self::find_parent_of_node_id(&root, node_id) {
4420 Some(p) => p,
4421 None => return 0,
4422 };
4423
4424 // ---- Measure + detach UNDER THE PARENT WRITE LATCH ----
4425 // Holding parent.write() excludes all descenders (they take
4426 // parent.read() hand-over-hand), so the child cannot be re-cached or
4427 // re-pinned between the measurement and the detach. Mirrors JE
4428 // detachNode running under the parent latch held by Evictor.evict.
4429 let mut parent_guard = parent_arc.write();
4430 let TreeNode::Internal(p) = &mut *parent_guard else {
4431 return 0; // parent is not an IN (concurrent restructure)
4432 };
4433 let entry = match p.entries.get_mut(child_index) {
4434 Some(e) => e,
4435 None => return 0,
4436 };
4437 let child = match entry.child.take() {
4438 Some(c) => c, // child Arc removed from the slot
4439 None => return 0, // already detached
4440 };
4441
4442 // Measure the child's real heap footprint while we still hold it.
4443 // JE: long evictedBytes = target.getBudgetedMemorySize().
4444 let freed = child.read().budgeted_memory_size();
4445
4446 // Mark the parent dirty: the slot's in-memory target changed (JE
4447 // detachNode sets dirty when updateLsn; we conservatively mark dirty
4448 // so the parent is re-logged with the now-non-resident slot).
4449 p.dirty = true;
4450
4451 // Drop the strong Arc explicitly so the node is freed now (the slot's
4452 // `child` is already None). If any other resident path still held a
4453 // strong reference this would not free — but the tree is the sole
4454 // strong owner of a cached child, so this drops the last strong ref.
4455 drop(parent_guard);
4456 drop(child);
4457
4458 // JE: getInMemoryINs().remove(child) — drop it from the evictor LRU.
4459 self.note_removed(node_id);
4460
4461 // NOTE: the live tree-memory counter (`memory_counter`) is the SAME
4462 // `Arc<AtomicI64>` the evictor's Arbiter uses as `cache_usage`. The
4463 // evictor decrements it once via `Arbiter::release_memory(bytes)` for
4464 // the full eviction batch, so detach must NOT decrement here too —
4465 // that would double-credit and drive `cache_usage` below reality
4466 // (the very drift EV-13 fixes, in the other direction). We only
4467 // measure-and-free; the caller does the single counter update.
4468 freed
4469 }
4470
4471 /// Check whether a BIN node is a candidate for slot compression and,
4472 /// if so, trigger `compress_bin`.
4473 ///
4474 /// from (the opportunistic / lazy compression path).
4475 ///
4476 /// # Algorithm
4477 ///
4478 /// 1. Skip the BIN if it is a delta or has no defunct (known-deleted) slots.
4479 /// 2. If compression succeeds and the BIN becomes empty, it is pruned.
4480 ///
4481 /// # Returns
4482 ///
4483 /// `true` if compression was triggered (regardless of whether any slots
4484 /// were actually removed), `false` if the BIN does not need compression.
4485 pub fn maybe_compress_bin_and_parent(
4486 &self,
4487 bin_arc: &Arc<RwLock<TreeNode>>,
4488 ) -> bool {
4489 // Check whether the BIN has any deleted slots worth compressing.
4490 // lazyCompress: skip deltas and BINs with no defunct slots.
4491 let should_compress = {
4492 {
4493 let g = bin_arc.read();
4494 match &*g {
4495 TreeNode::Bottom(b) => {
4496 // Skip deltas (the: !in.isBIN() || in.isBINDelta()).
4497 if b.is_delta {
4498 false
4499 } else {
4500 // Check for any known-deleted slot
4501 // (the: for (int i=0; i < bin.getNEntries(); i++) {
4502 // if (bin.isDefunct(i)) { ... break; }
4503 // }).
4504 b.entries.iter().any(|e| e.known_deleted)
4505 }
4506 }
4507 _ => false,
4508 }
4509 }
4510 };
4511
4512 if !should_compress {
4513 return false;
4514 }
4515
4516 self.compress_bin(bin_arc)
4517 }
4518
4519 // ========================================================================
4520 // Latch-coupling validation
4521 // ========================================================================
4522
4523 /// Validate that `parent.entries[child_index].child` still points at
4524 /// `child_arc` after acquiring the child's latch.
4525 ///
4526 /// Re-latch validation step inside the
4527 /// `Tree.searchSplitsAllowed`: after a concurrent split the parent
4528 /// slot that previously held the child may have changed. Callers that
4529 /// plan to mutate the child must verify the parent-child link is still
4530 /// intact before proceeding.
4531 ///
4532 /// Returns `true` if the parent-child link is intact.
4533 pub fn validate_parent_child(
4534 parent: &Arc<RwLock<TreeNode>>,
4535 child_index: usize,
4536 child_arc: &Arc<RwLock<TreeNode>>,
4537 ) -> bool {
4538 let g = parent.read();
4539 match &*g {
4540 TreeNode::Internal(p) => match p.entries.get(child_index) {
4541 Some(entry) => match &entry.child {
4542 Some(stored) => Arc::ptr_eq(stored, child_arc),
4543 None => false,
4544 },
4545 None => false,
4546 },
4547 TreeNode::Bottom(_) => false,
4548 }
4549 }
4550
4551 /// Search for the BIN that should contain `key`, with latch-coupling
4552 /// validation at every level of descent.
4553 ///
4554 /// .
4555 ///
4556 /// The difference from `search()` is that after obtaining the child
4557 /// arc we call `validate_parent_child` to confirm the parent still
4558 /// holds the expected Arc. If the link has been broken (e.g. by a
4559 /// concurrent split that relocated the child) the traversal restarts
4560 /// from the root.
4561 ///
4562 /// Returns a `SearchResult` if the key is (or should be) in the tree,
4563 /// `None` if the tree is empty.
4564 ///
4565 /// Same as [`Tree::search`] but exposes the hand-over-hand latch
4566 /// coupling explicitly. Kept as a public, equivalent API for
4567 /// callers (today only tests) that want to verify the
4568 /// latch-coupling behaviour against `search()` itself.
4569 ///
4570 /// Both `search()` and this method use the same `read_arc()`
4571 /// hand-over-hand: take the child read guard *before* dropping
4572 /// the parent guard, so a concurrent `split_child(parent, ..)`
4573 /// (which takes `parent.write()`) cannot run between when we
4574 /// captured the child Arc and when we entered the child. There
4575 /// is no validate-and-restart loop because the coupling makes
4576 /// the race unreachable.
4577 pub fn search_with_coupling(&self, key: &[u8]) -> Option<SearchResult> {
4578 let root = self.get_root()?;
4579 let mut guard: parking_lot::ArcRwLockReadGuard<
4580 parking_lot::RawRwLock,
4581 TreeNode,
4582 > = root.read_arc();
4583
4584 loop {
4585 if guard.is_bin() {
4586 let index = guard.find_entry(key, true, true);
4587 let found = index >= 0 && (index & EXACT_MATCH != 0);
4588 return Some(SearchResult::with_values(
4589 found,
4590 index & 0xFFFF,
4591 false,
4592 ));
4593 }
4594
4595 let next_arc = match &*guard {
4596 TreeNode::Internal(n) => {
4597 if n.entries.is_empty() {
4598 return None;
4599 }
4600 let idx = self.upper_in_floor_index(&n.entries, key);
4601 n.entries.get(idx)?.child.clone()?
4602 }
4603 TreeNode::Bottom(_) => {
4604 unreachable!("is_bin() returned false above")
4605 }
4606 };
4607 // Hand-over-hand: take the child read guard before
4608 // releasing the parent guard. Closes the
4609 // descender-vs-splitter window: a concurrent
4610 // split_child(parent, ..) takes parent.write(), which
4611 // blocks while we still hold parent.read().
4612 let next_guard = next_arc.read_arc();
4613 drop(guard);
4614 guard = next_guard;
4615 }
4616 }
4617
4618 // ========================================================================
4619 // BIN-Delta reconstitution
4620 // ========================================================================
4621
4622 /// Increments the cursor-pin count on a BIN node.
4623 ///
4624 /// Called by `CursorImpl` when it positions on (or enters) a BIN.
4625 /// The evictor will not select a BIN with `cursor_count > 0` for eviction
4626 /// (`RealNodeInfo.pin_count`), matching `BIN.incrementCursorCount()`.
4627 pub fn pin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
4628 let mut guard = bin_arc.write();
4629 if let TreeNode::Bottom(ref mut stub) = *guard {
4630 stub.cursor_count += 1;
4631 }
4632 }
4633
4634 /// Decrements the cursor-pin count on a BIN node.
4635 ///
4636 /// Called by `CursorImpl` when it moves away from or closes on a BIN.
4637 /// Uses `saturating_sub` to guard against an accidental double-unpin.
4638 /// Matching `BIN.decrementCursorCount()`.
4639 pub fn unpin_bin(bin_arc: &Arc<RwLock<TreeNode>>) {
4640 let mut guard = bin_arc.write();
4641 if let TreeNode::Bottom(ref mut stub) = *guard {
4642 stub.cursor_count = stub.cursor_count.saturating_sub(1);
4643 }
4644 }
4645
4646 /// Returns `true` if the given `BinStub` is a BIN-delta (not a full BIN).
4647 ///
4648 /// `IN.isBINDelta()`.
4649 pub fn bin_is_delta(bin: &BinStub) -> bool {
4650 bin.is_delta
4651 }
4652
4653 /// Merge delta entries into a full BIN's entry list.
4654 ///
4655 /// - For each delta entry: if a matching key already exists in `bin`,
4656 /// replace it (delta is authoritative).
4657 /// - Otherwise insert the delta entry in sorted position.
4658 ///
4659 /// Delta entries carry **full** keys (prefix already prepended by the
4660 /// caller). After applying all delta entries the BIN's prefix is
4661 /// recomputed so the final state is consistent.
4662 ///
4663 /// All delta entries are considered to be the most-recently-dirtied
4664 /// state, exactly as in where delta slots supersede full-BIN slots.
4665 pub fn apply_delta_to_bin(bin: &mut BinStub, delta_entries: Vec<BinEntry>) {
4666 for delta in delta_entries {
4667 // `delta.key` is a full (uncompressed) key here.
4668 bin.insert_with_prefix(delta.key, delta.lsn, delta.data);
4669 }
4670 bin.dirty = true;
4671 }
4672
4673 /// Reconstitute a BIN-delta into a full BIN.
4674 ///
4675 /// from the:
4676 ///
4677 /// 1. Extract the delta entries from `self` (this BIN-delta), decompressing
4678 /// them to full keys.
4679 /// 2. Apply them onto `base` (the previously logged full BIN) via
4680 /// `apply_delta_to_bin`.
4681 /// 3. Copy `base`'s merged entries and prefix back into `self`.
4682 /// 4. Clear the `is_delta` flag so subsequent code treats `self` as
4683 /// a full BIN.
4684 ///
4685 /// After this call `self` is a full BIN; `base` should be discarded.
4686 pub fn mutate_to_full_bin(delta: &mut BinStub, mut base: BinStub) {
4687 // Decompress delta entries to full keys before applying.
4688 let delta_full_entries: Vec<BinEntry> = (0..delta.entries.len())
4689 .map(|i| BinEntry {
4690 key: delta.get_full_key(i).unwrap_or_default(),
4691 lsn: delta.entries[i].lsn,
4692 data: delta.entries[i].data.clone(),
4693 known_deleted: delta.entries[i].known_deleted,
4694 dirty: delta.entries[i].dirty,
4695 expiration_time: delta.entries[i].expiration_time,
4696 })
4697 .collect();
4698 // reconstituteBIN + resetContent + setBINDelta(false).
4699 Self::apply_delta_to_bin(&mut base, delta_full_entries);
4700 delta.entries = base.entries;
4701 delta.key_prefix = base.key_prefix;
4702 delta.is_delta = false;
4703 delta.dirty = true;
4704 }
4705
4706 /// Reconstitute a BIN-delta into a full BIN by reading the base from log.
4707 ///
4708 /// — the
4709 /// single-argument overload that calls `fetchFullBIN(databaseImpl)` to
4710 /// read the last full BIN from the log manager automatically.
4711 ///
4712 /// Algorithm:
4713 /// 1. If `delta.last_full_lsn == NULL_LSN`, the BIN was never written as a
4714 /// full entry; there is no base to merge so the delta IS the full BIN.
4715 /// Clear `is_delta` and return.
4716 /// 2. Read the full-BIN log entry at `delta.last_full_lsn` using
4717 /// `log_manager.read_entry(lsn)`.
4718 /// 3. Deserialize the payload with `BinStub::deserialize_full()`.
4719 /// 4. Delegate to `Self::mutate_to_full_bin(delta, base)` to merge and
4720 /// replace `delta`'s contents.
4721 ///
4722 /// On any read / parse failure the function falls back to clearing the
4723 /// `is_delta` flag without merging, so the caller always gets a non-delta
4724 /// BIN (possibly missing some old slots). This mirrors the
4725 /// `EnvironmentFailureException` path but gracefully degrades instead of
4726 /// panicking.
4727 ///
4728 /// `BIN.fetchFullBIN(dbImpl)` + `BIN.mutateToFullBIN(boolean)`.
4729 pub fn mutate_to_full_bin_from_log(
4730 delta: &mut BinStub,
4731 log_manager: &noxu_log::LogManager,
4732 ) {
4733 if !delta.is_delta {
4734 // Already a full BIN; nothing to do.
4735 return;
4736 }
4737
4738 if delta.last_full_lsn == NULL_LSN {
4739 // BIN has never been logged as a full entry — the in-memory delta
4740 // is effectively the full state. During recovery this path is
4741 // harmless.
4742 delta.is_delta = false;
4743 return;
4744 }
4745
4746 // Read the full-BIN log entry at last_full_lsn.
4747 // `envImpl.getLogManager().getEntryHandleFileNotFound(lsn)`.
4748 match log_manager.read_entry(delta.last_full_lsn) {
4749 Ok((entry_type, payload)) => {
4750 use noxu_log::LogEntryType;
4751 if entry_type == LogEntryType::BIN {
4752 if let Some(mut base) = BinStub::deserialize_full(&payload)
4753 {
4754 // Set the base's last_full_lsn so it is preserved
4755 // into the merged result.
4756 base.last_full_lsn = delta.last_full_lsn;
4757 Self::mutate_to_full_bin(delta, base);
4758 return;
4759 }
4760 // Deserialization failed — fall through to graceful degradation.
4761 log::warn!(
4762 "mutate_to_full_bin_from_log: failed to deserialize \
4763 full BIN at LSN {:?}; keeping delta as-is",
4764 delta.last_full_lsn
4765 );
4766 } else {
4767 log::warn!(
4768 "mutate_to_full_bin_from_log: expected BIN entry at \
4769 LSN {:?}, got {:?}",
4770 delta.last_full_lsn,
4771 entry_type
4772 );
4773 }
4774 }
4775 Err(e) => {
4776 log::warn!(
4777 "mutate_to_full_bin_from_log: failed to read log at \
4778 LSN {:?}: {}",
4779 delta.last_full_lsn,
4780 e
4781 );
4782 }
4783 }
4784
4785 // Graceful degradation: promote the delta to a "full" BIN without
4786 // the base slots. The BIN will be re-logged as a full BIN at the
4787 // next checkpoint.
4788 delta.is_delta = false;
4789 delta.dirty = true;
4790 }
4791
4792 // ========================================================================
4793 // getNextBin / getPrevBin
4794 // ========================================================================
4795
4796 /// Return the entries of the BIN immediately to the right of the BIN
4797 /// that contains (or would contain) `current_key`.
4798 ///
4799 /// → `Tree.getNextIN(forward=true)`.
4800 ///
4801 /// # Algorithm
4802 /// 1. Build a root-to-BIN path for `current_key`.
4803 /// 2. Walk the path back up looking for a parent that has a slot to the
4804 /// right of the slot we descended through.
4805 /// 3. When found, descend to the leftmost BIN of that sibling subtree.
4806 /// 4. If no such parent exists, return `None` (no next BIN).
4807 pub fn get_next_bin(&self, current_key: &[u8]) -> Option<Vec<BinEntry>> {
4808 let root = self.get_root()?;
4809 self.get_adjacent_bin(&root, current_key, true)
4810 }
4811
4812 /// Return the entries of the BIN immediately to the left of the BIN
4813 /// that contains (or would contain) `current_key`.
4814 ///
4815 /// → `Tree.getNextIN(forward=false)`.
4816 pub fn get_prev_bin(&self, current_key: &[u8]) -> Option<Vec<BinEntry>> {
4817 let root = self.get_root()?;
4818 self.get_adjacent_bin(&root, current_key, false)
4819 }
4820
4821 /// Core implementation shared by `get_next_bin` and `get_prev_bin`.
4822 ///
4823 /// Builds the path from `root` down to the BIN for `current_key`
4824 /// (each element records the parent arc, the slot index taken,
4825 /// and the child Arc reached) using `read_arc()` hand-over-hand
4826 /// latch coupling.
4827 ///
4828 /// The ascent re-acquires the parent's read lock one level at a
4829 /// time. To handle a concurrent split that completes between
4830 /// path capture and ascent, we validate that the slot still
4831 /// holds the child Arc we descended through. If the slot
4832 /// mismatches we retry the whole operation from root with a
4833 /// short pause between attempts. The retry budget is generous
4834 /// (`MAX_ASCENT_ATTEMPTS`) so that the typical case of a few
4835 /// cascading splits between two BIN-level cursor steps is
4836 /// absorbed without surfacing as a false end-of-iteration.
4837 /// After exhausting the budget we conservatively return `None`,
4838 /// signalling "no adjacent BIN found"; the cursor will then
4839 /// either restart its scan or report end-of-iteration. The
4840 /// budget is finite so a pathological workload (a thread
4841 /// permanently splitting under us) cannot livelock the lookup.
4842 /// JE `Tree.getNextIN` / `Tree.getPrevIN`.
4843 ///
4844 /// R3 fix (2026-06-16): converted from `static fn` to `&self` so that the
4845 /// IN-level descent uses `self.upper_in_floor_index` (comparator-aware)
4846 /// instead of a raw byte `<=`. Without this, databases with a custom
4847 /// comparator (secondary indexes, sorted-dup) could descend to the wrong
4848 /// child → wrong adjacent BIN → incorrect cursor iteration across BIN
4849 /// boundaries. Mirrors `Tree.getNextIN`/`Tree.getPrevIN` using the
4850 /// comparator-aware `IN.findEntry`.
4851 fn get_adjacent_bin(
4852 &self,
4853 root: &Arc<RwLock<TreeNode>>,
4854 current_key: &[u8],
4855 forward: bool,
4856 ) -> Option<Vec<BinEntry>> {
4857 const MAX_ASCENT_ATTEMPTS: u32 = 8;
4858 for attempt in 0..MAX_ASCENT_ATTEMPTS {
4859 match self.get_adjacent_bin_attempt(root, current_key, forward) {
4860 AdjacentBinOutcome::Found(v) => return Some(v),
4861 AdjacentBinOutcome::NoAdjacent => return None,
4862 AdjacentBinOutcome::SplitRaceRetry => {
4863 // Brief pause to let the splitter finish.
4864 if attempt + 1 < MAX_ASCENT_ATTEMPTS {
4865 std::thread::yield_now();
4866 }
4867 }
4868 }
4869 }
4870 // Exhausted retry budget. Signal "no adjacent" so the
4871 // cursor can fall back to its end-of-iteration path.
4872 None
4873 }
4874
4875 /// One attempt at `get_adjacent_bin`. The tri-state return
4876 /// value distinguishes "no adjacent BIN exists" (which the
4877 /// caller should propagate as end-of-iteration) from "a
4878 /// concurrent split invalidated our path" (which the caller
4879 /// should retry from root).
4880 fn get_adjacent_bin_attempt(
4881 &self,
4882 root: &Arc<RwLock<TreeNode>>,
4883 current_key: &[u8],
4884 forward: bool,
4885 ) -> AdjacentBinOutcome {
4886 // Path entry: (parent_arc, slot_idx_taken, child_arc_reached).
4887 // The child Arc lets the ascent validate that the slot still
4888 // points to the same node we descended through.
4889 let mut path: Vec<(
4890 Arc<RwLock<TreeNode>>,
4891 usize,
4892 Arc<RwLock<TreeNode>>,
4893 )> = Vec::new();
4894
4895 let mut guard: parking_lot::ArcRwLockReadGuard<
4896 parking_lot::RawRwLock,
4897 TreeNode,
4898 > = root.read_arc();
4899 loop {
4900 if guard.is_bin() {
4901 break;
4902 }
4903
4904 let (next_arc, slot_idx) = match &*guard {
4905 TreeNode::Internal(n) => {
4906 if n.entries.is_empty() {
4907 return AdjacentBinOutcome::NoAdjacent;
4908 }
4909 // R3 fix: use comparator-aware upper_in_floor_index so
4910 // that custom-comparator / sorted-dup databases descend
4911 // to the correct child. Mirrors JE Tree.getNextIN which
4912 // uses IN.findEntry (comparator-aware) not raw byte order.
4913 let idx =
4914 self.upper_in_floor_index(&n.entries, current_key);
4915 let child = match n
4916 .entries
4917 .get(idx)
4918 .and_then(|e| e.child.clone())
4919 {
4920 Some(c) => c,
4921 None => return AdjacentBinOutcome::NoAdjacent,
4922 };
4923 (child, idx)
4924 }
4925 TreeNode::Bottom(_) => unreachable!(),
4926 };
4927
4928 // Record the parent and the child we are about to enter
4929 // — the child Arc lets the ascent validate the slot.
4930 let parent_arc =
4931 parking_lot::ArcRwLockReadGuard::rwlock(&guard).clone();
4932 path.push((parent_arc, slot_idx, Arc::clone(&next_arc)));
4933
4934 // Hand-over-hand: take child read lock BEFORE releasing parent.
4935 let next_guard = next_arc.read_arc();
4936 drop(guard);
4937 guard = next_guard;
4938 }
4939 drop(guard);
4940
4941 // Ascend the path. At each level, validate that
4942 // `parent.entries[taken_idx].child == descended_child` before
4943 // trusting `taken_idx` as a coordinate. If not, return
4944 // `SplitRaceRetry` so the caller restarts from root.
4945 while let Some((parent_arc, taken_idx, descended_child)) = path.pop() {
4946 let parent_guard = parent_arc.read();
4947 let (n_entries, slot_still_valid) = match &*parent_guard {
4948 TreeNode::Internal(p) => {
4949 let n = p.entries.len();
4950 let valid = p
4951 .entries
4952 .get(taken_idx)
4953 .and_then(|e| e.child.as_ref())
4954 .is_some_and(|c| Arc::ptr_eq(c, &descended_child));
4955 (n, valid)
4956 }
4957 _ => return AdjacentBinOutcome::NoAdjacent,
4958 };
4959 drop(parent_guard);
4960
4961 if !slot_still_valid {
4962 return AdjacentBinOutcome::SplitRaceRetry;
4963 }
4964
4965 let sibling_idx = if forward {
4966 taken_idx + 1
4967 } else if taken_idx == 0 {
4968 // No left sibling at this level — ascend further.
4969 continue;
4970 } else {
4971 taken_idx - 1
4972 };
4973
4974 if forward && sibling_idx >= n_entries {
4975 // No right sibling at this level — ascend further.
4976 continue;
4977 }
4978
4979 // Found a sibling slot — fetch the sibling child arc.
4980 let sibling_arc = {
4981 let g = parent_arc.read();
4982 match &*g {
4983 TreeNode::Internal(p) => match p
4984 .entries
4985 .get(sibling_idx)
4986 .and_then(|e| e.child.clone())
4987 {
4988 Some(c) => c,
4989 None => return AdjacentBinOutcome::NoAdjacent,
4990 },
4991 _ => return AdjacentBinOutcome::NoAdjacent,
4992 }
4993 };
4994
4995 // Descend to the leftmost (forward) or rightmost (!forward) BIN.
4996 return match Self::descend_to_edge_bin(&sibling_arc, forward) {
4997 Some(v) => AdjacentBinOutcome::Found(v),
4998 None => AdjacentBinOutcome::NoAdjacent,
4999 };
5000 }
5001
5002 // Exhausted path without finding a sibling → no adjacent BIN.
5003 AdjacentBinOutcome::NoAdjacent
5004 }
5005
5006 /// Descend to the leftmost BIN (`forward = true`) or rightmost BIN
5007 /// (`forward = false`) in the sub-tree rooted at `node_arc`.
5008 ///
5009 /// `Tree.searchSubTree(SearchType.LEFT / RIGHT, targetLevel)`.
5010 fn descend_to_edge_bin(
5011 node_arc: &Arc<RwLock<TreeNode>>,
5012 forward: bool,
5013 ) -> Option<Vec<BinEntry>> {
5014 // Hand-over-hand latch coupling — see Tree::search.
5015 let mut guard: parking_lot::ArcRwLockReadGuard<
5016 parking_lot::RawRwLock,
5017 TreeNode,
5018 > = node_arc.read_arc();
5019
5020 loop {
5021 if guard.is_bin() {
5022 return match &*guard {
5023 TreeNode::Bottom(b) => {
5024 // Return entries with full (decompressed) keys so that
5025 // callers always work with complete keys.
5026 //
5027 // TREE-F1: KD slots are NOT filtered here — the BIN's
5028 // slot indices are returned verbatim so the cursor can
5029 // skip KD slots itself (CursorImpl getNext loop;
5030 // CursorImpl.java:2062-2064) and continue to the next
5031 // BIN when an edge BIN is entirely KD during the
5032 // BIN-delta reconstitution window.
5033 let full_entries: Vec<BinEntry> = (0..b.entries.len())
5034 .map(|i| BinEntry {
5035 key: b.get_full_key(i).unwrap_or_default(),
5036 lsn: b.entries[i].lsn,
5037 data: b.entries[i].data.clone(),
5038 known_deleted: b.entries[i].known_deleted,
5039 dirty: b.entries[i].dirty,
5040 expiration_time: b.entries[i].expiration_time,
5041 })
5042 .collect();
5043 Some(full_entries)
5044 }
5045 _ => None,
5046 };
5047 }
5048
5049 let next = match &*guard {
5050 TreeNode::Internal(n) => {
5051 if forward {
5052 n.entries.first()?.child.clone()?
5053 } else {
5054 n.entries.last()?.child.clone()?
5055 }
5056 }
5057 _ => return None,
5058 };
5059 // Take child read lock BEFORE releasing parent's.
5060 let next_guard = next.read_arc();
5061 drop(guard);
5062 guard = next_guard;
5063 }
5064 }
5065}
5066
5067// ============================================================================
5068// Tree statistics
5069// ============================================================================
5070
/// Structural counters gathered by walking the whole tree.
///
/// Counterpart of JE `TreeWalkerStatsAccumulator`. All counters start at
/// zero via `Default`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct TreeStats {
    /// How many BINs (bottom internal nodes) were visited.
    pub n_bins: u64,
    /// How many upper INs were visited.
    pub n_ins: u64,
    /// Sum of the entry counts of every visited node.
    pub n_entries: u64,
    /// Tree height (1 = root is a BIN, 2 = one level above BINs, …).
    pub height: u32,
}
5085
5086impl Tree {
5087 /// Walks the entire tree and collects structural statistics.
5088 ///
5089 /// `TreeWalkerStatsAccumulator` pattern — performs a simple
5090 /// recursive DFS and counts INs, BINs, entries, and tree height.
5091 pub fn collect_stats(&self) -> TreeStats {
5092 let mut stats = TreeStats::default();
5093 if let Some(root) = self.get_root() {
5094 Self::collect_stats_recursive(&root, &mut stats, 0);
5095 }
5096 stats
5097 }
5098
5099 fn collect_stats_recursive(
5100 node_arc: &Arc<RwLock<TreeNode>>,
5101 stats: &mut TreeStats,
5102 depth: u32,
5103 ) {
5104 let guard = node_arc.read();
5105
5106 let current_height = depth + 1;
5107 if current_height > stats.height {
5108 stats.height = current_height;
5109 }
5110
5111 match &*guard {
5112 TreeNode::Bottom(b) => {
5113 stats.n_bins += 1;
5114 stats.n_entries += b.entries.len() as u64;
5115 }
5116 TreeNode::Internal(n) => {
5117 stats.n_ins += 1;
5118 stats.n_entries += n.entries.len() as u64;
5119 // Collect child arcs before releasing the guard.
5120 let children: Vec<Arc<RwLock<TreeNode>>> =
5121 n.entries.iter().filter_map(|e| e.child.clone()).collect();
5122 // Release guard before recursing to avoid lock ordering issues.
5123 drop(guard);
5124 for child in children {
5125 Self::collect_stats_recursive(&child, stats, depth + 1);
5126 }
5127 }
5128 }
5129 }
5130
5131 /// Collects all dirty BINs as (Arc to node, db_id) pairs.
5132 ///
5133 /// The checkpoint path calls this to enumerate BINs that need to be
5134 /// logged. For each dirty BIN the checkpoint decides — based on the
5135 /// BIN-delta threshold — whether to write a full `BIN` entry or a
5136 /// `BINDelta` entry.
5137 ///
5138 /// `Checkpointer.processINList()` which iterates the dirty
5139 /// IN list accumulated during normal operation.
5140 pub fn collect_dirty_bins(
5141 &self,
5142 db_id: u64,
5143 ) -> Vec<(u64, Arc<RwLock<TreeNode>>)> {
5144 let mut result = Vec::new();
5145 if let Some(root) = self.get_root() {
5146 Self::collect_dirty_bins_recursive(&root, db_id, &mut result);
5147 }
5148 result
5149 }
5150
5151 fn collect_dirty_bins_recursive(
5152 node_arc: &Arc<RwLock<TreeNode>>,
5153 db_id: u64,
5154 out: &mut Vec<(u64, Arc<RwLock<TreeNode>>)>,
5155 ) {
5156 let guard = node_arc.read();
5157 match &*guard {
5158 TreeNode::Bottom(b) => {
5159 // Include this BIN if it is dirty or has any dirty slots.
5160 if b.dirty || b.dirty_count() > 0 {
5161 out.push((db_id, Arc::clone(node_arc)));
5162 }
5163 }
5164 TreeNode::Internal(n) => {
5165 let children: Vec<Arc<RwLock<TreeNode>>> =
5166 n.entries.iter().filter_map(|e| e.child.clone()).collect();
5167 drop(guard);
5168 for child in children {
5169 Self::collect_dirty_bins_recursive(&child, db_id, out);
5170 } // guard already dropped
5171 }
5172 }
5173 }
5174
5175 /// Collect all BINs that have at least one `known_deleted` slot.
5176 ///
5177 /// INCompressor queue-drain scan in the: the daemon iterates
5178 /// the in-memory IN list and identifies BINs that still hold zombie deleted
5179 /// slots. Each returned `Arc` can be passed directly to `compress_bin()`.
5180 pub fn collect_bins_with_known_deleted(
5181 &self,
5182 ) -> Vec<Arc<RwLock<TreeNode>>> {
5183 let mut result = Vec::new();
5184 if let Some(root) = self.get_root() {
5185 Self::collect_bins_with_known_deleted_recursive(&root, &mut result);
5186 }
5187 result
5188 }
5189
5190 fn collect_bins_with_known_deleted_recursive(
5191 node_arc: &Arc<RwLock<TreeNode>>,
5192 out: &mut Vec<Arc<RwLock<TreeNode>>>,
5193 ) {
5194 let guard = node_arc.read();
5195 match &*guard {
5196 TreeNode::Bottom(b) => {
5197 if b.entries.iter().any(|e| e.known_deleted) {
5198 out.push(Arc::clone(node_arc));
5199 }
5200 }
5201 TreeNode::Internal(n) => {
5202 let children: Vec<Arc<RwLock<TreeNode>>> =
5203 n.entries.iter().filter_map(|e| e.child.clone()).collect();
5204 drop(guard);
5205 for child in children {
5206 Self::collect_bins_with_known_deleted_recursive(
5207 &child, out,
5208 );
5209 }
5210 }
5211 }
5212 }
5213
5214 /// Collect all dirty upper (non-BIN) internal nodes, sorted ascending by
5215 /// level (bottom-up order, BIN level excluded).
5216 ///
5217 /// Serialise an upper-IN node (level > 1) by node_id for off-heap storage.
5218 ///
5219 /// Traverses the tree to find the internal node whose matches,
5220 /// then calls to produce a compact byte
5221 /// representation. Returns if the node is not found or is a BIN
5222 /// (BINs are not upper INs).
5223 ///
5224 /// Mirrors `OffHeapAllocator` serialises the same bytes that would be written
5225 /// to the log, allowing the evictor to store upper-INs off-heap and avoid
5226 /// log-file reads on the next traversal.
5227 pub fn serialize_upper_in(&self, node_id: u64) -> Option<Vec<u8>> {
5228 let root = self.get_root()?;
5229 Self::find_and_serialize_upper_in(&root, node_id)
5230 }
5231
5232 fn find_and_serialize_upper_in(
5233 node_arc: &Arc<RwLock<TreeNode>>,
5234 target_id: u64,
5235 ) -> Option<Vec<u8>> {
5236 let guard = node_arc.read();
5237 match &*guard {
5238 TreeNode::Bottom(_) => None, // BINs are not upper INs
5239 TreeNode::Internal(n) => {
5240 if n.node_id == target_id {
5241 // Serialise InNodeStub for off-heap storage.
5242 // Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE)
5243 // then per-entry: key_len(u32BE) | key | lsn(u64BE)
5244 let mut buf = Vec::new();
5245 buf.extend_from_slice(&n.node_id.to_be_bytes());
5246 buf.extend_from_slice(&n.level.to_be_bytes());
5247 buf.extend_from_slice(
5248 &(n.entries.len() as u32).to_be_bytes(),
5249 );
5250 for e in &n.entries {
5251 buf.extend_from_slice(
5252 &(e.key.len() as u32).to_be_bytes(),
5253 );
5254 buf.extend_from_slice(&e.key);
5255 buf.extend_from_slice(&e.lsn.as_u64().to_be_bytes());
5256 }
5257 return Some(buf);
5258 }
5259 // Recurse into children before releasing the guard so we
5260 // hold the minimum read-lock duration.
5261 let children: Vec<Arc<RwLock<TreeNode>>> =
5262 n.entries.iter().filter_map(|e| e.child.clone()).collect();
5263 drop(guard);
5264 for child in &children {
5265 if let Some(bytes) =
5266 Self::find_and_serialize_upper_in(child, target_id)
5267 {
5268 return Some(bytes);
5269 }
5270 }
5271 None
5272 }
5273 }
5274 }
5275
5276 /// Upper-IN traversal in `Checkpointer.processINList()` from
5277 /// — visits all `TreeNode::Internal` nodes whose `dirty` flag is set
5278 /// and returns them together with their level, sorted lowest-level-first
5279 /// so the checkpointer can log them bottom-up. The root is always the
5280 /// last entry (highest level), which must be logged `Provisional::No`.
5281 pub fn collect_dirty_upper_ins(
5282 &self,
5283 _db_id: u64,
5284 ) -> Vec<(i32, Arc<RwLock<TreeNode>>)> {
5285 let mut result: Vec<(i32, Arc<RwLock<TreeNode>>)> = Vec::new();
5286 if let Some(root) = self.get_root() {
5287 Self::collect_dirty_upper_ins_recursive(&root, &mut result);
5288 }
5289 result.sort_by_key(|(level, _)| *level);
5290 result
5291 }
5292
5293 fn collect_dirty_upper_ins_recursive(
5294 node_arc: &Arc<RwLock<TreeNode>>,
5295 out: &mut Vec<(i32, Arc<RwLock<TreeNode>>)>,
5296 ) {
5297 let guard = node_arc.read();
5298 match &*guard {
5299 TreeNode::Bottom(_) => {
5300 // BINs are handled by flush_dirty_bins_internal; skip here.
5301 }
5302 TreeNode::Internal(n) => {
5303 let is_dirty = n.dirty;
5304 // REC-AA: return the node's ACTUAL tree level (n.level, in
5305 // MAIN_LEVEL|n units), not a root-relative depth. The level
5306 // must be on the same scale as a BIN's `level` (BIN_LEVEL =
5307 // MAIN_LEVEL|1) so that the checkpointer's flush-level
5308 // computation and the evictor's `node_level < flush_level`
5309 // comparison are meaningful. With a root-relative depth the
5310 // root had the SMALLEST value (0) and the IN above the BINs
5311 // the LARGEST, inverting the provisional/non-provisional
5312 // boundary; with n.level the root has the largest level, as JE
5313 // expects.
5314 let level = n.level;
5315 let children: Vec<Arc<RwLock<TreeNode>>> =
5316 n.entries.iter().filter_map(|e| e.child.clone()).collect();
5317 drop(guard);
5318 // Recurse into children first (bottom-up ordering).
5319 for child in &children {
5320 Self::collect_dirty_upper_ins_recursive(child, out);
5321 }
5322 // Add this node after children (so parent comes after all descendants).
5323 if is_dirty {
5324 out.push((level, Arc::clone(node_arc)));
5325 }
5326 }
5327 }
5328 }
5329
5330 // ========================================================================
5331 // Tree.java ports: 8 additional tree methods (Task #82)
5332 // ========================================================================
5333
5334 /// Returns `true` if the root node is currently loaded in memory.
5335 ///
5336 /// .
5337 pub fn is_root_resident(&self) -> bool {
5338 self.root.read().is_some()
5339 }
5340
5341 /// Returns the root node `Arc` if present, or `None`.
5342 ///
5343 /// .
5344 pub fn get_resident_root_in(&self) -> Option<Arc<RwLock<TreeNode>>> {
5345 self.root.read().clone()
5346 }
5347
5348 /// Returns the BIN that should contain a slot for `key` (the "parent" of
5349 /// LN slots).
5350 ///
5351 /// . Descends the tree
5352 /// exactly like `search()` and returns the leaf-level BIN arc, or `None`
5353 /// if the tree is empty.
5354 ///
5355 /// Uses `read_arc()` hand-over-hand on the descent — the child
5356 /// guard is taken before the parent guard is dropped, matching
5357 /// `search()`. Returns the BIN Arc with no read lock held; the
5358 /// caller must take whatever lock it needs to operate on the
5359 /// returned BIN.
5360 pub fn get_parent_bin_for_child_ln(
5361 &self,
5362 key: &[u8],
5363 ) -> Option<Arc<RwLock<TreeNode>>> {
5364 let root = self.get_root()?;
5365 let mut current_arc: Arc<RwLock<TreeNode>> = root.clone();
5366 let mut guard: parking_lot::ArcRwLockReadGuard<
5367 parking_lot::RawRwLock,
5368 TreeNode,
5369 > = root.read_arc();
5370
5371 loop {
5372 if guard.is_bin() {
5373 drop(guard);
5374 return Some(current_arc);
5375 }
5376
5377 let next_arc = match &*guard {
5378 TreeNode::Internal(n) => {
5379 if n.entries.is_empty() {
5380 return None;
5381 }
5382 let idx = self.upper_in_floor_index(&n.entries, key);
5383 n.entries.get(idx)?.child.clone()?
5384 }
5385 TreeNode::Bottom(_) => {
5386 unreachable!("is_bin() returned false above")
5387 }
5388 };
5389 // Hand-over-hand: take child guard before dropping parent.
5390 let next_guard = next_arc.read_arc();
5391 drop(guard);
5392 current_arc = next_arc;
5393 guard = next_guard;
5394 }
5395 }
5396
5397 /// Returns the BIN where `key` should be inserted.
5398 ///
5399 /// . Semantically identical to
5400 /// `get_parent_bin_for_child_ln` — expressed as a separate method to match
5401 /// API surface.
5402 ///
5403 /// Implemented as a delegation to `get_parent_bin_for_child_ln`,
5404 /// which uses `read_arc()` hand-over-hand on the descent.
5405 pub fn find_bin_for_insert(
5406 &self,
5407 key: &[u8],
5408 ) -> Option<Arc<RwLock<TreeNode>>> {
5409 self.get_parent_bin_for_child_ln(key)
5410 }
5411
5412 /// Search for a BIN, allowing splits during descent (preemptive splitting).
5413 ///
5414 /// . This thin wrapper
5415 /// delegates to `search()` and returns the result wrapped in `Some`.
5416 /// The full split-allowed descent is performed by `insert()` internally;
5417 /// this method exposes the same result type for callers that only need to
5418 /// locate the BIN.
5419 ///
5420 /// Returns `None` if the tree is empty.
5421 pub fn search_splits_allowed(&self, key: &[u8]) -> Option<SearchResult> {
5422 self.search(key)
5423 }
5424
5425 /// Traverses the entire tree and returns every IN and BIN node as a flat
5426 /// list.
5427 ///
5428 /// . Used by recovery to rebuild
5429 /// the in-memory IN list after log replay. The walk is a BFS from the
5430 /// root; every `Arc<RwLock<TreeNode>>` encountered (both Internal and
5431 /// Bottom variants) is included in the result.
5432 pub fn rebuild_in_list(&self) -> Vec<Arc<RwLock<TreeNode>>> {
5433 let mut result = Vec::new();
5434 if let Some(root) = self.get_root() {
5435 Self::rebuild_in_list_recursive(&root, &mut result);
5436 }
5437 result
5438 }
5439
5440 fn rebuild_in_list_recursive(
5441 node_arc: &Arc<RwLock<TreeNode>>,
5442 out: &mut Vec<Arc<RwLock<TreeNode>>>,
5443 ) {
5444 // Push this node unconditionally — both INs and BINs belong in the list.
5445 out.push(Arc::clone(node_arc));
5446
5447 let guard = node_arc.read();
5448
5449 if let TreeNode::Internal(n) = &*guard {
5450 // Collect child arcs while holding the guard, then drop it before
5451 // recursing to avoid holding multiple locks simultaneously.
5452 let children: Vec<Arc<RwLock<TreeNode>>> =
5453 n.entries.iter().filter_map(|e| e.child.clone()).collect();
5454 drop(guard);
5455 for child in children {
5456 Self::rebuild_in_list_recursive(&child, out);
5457 }
5458 }
5459 // BIN nodes are leaves — no children to recurse into.
5460 }
5461
5462 /// Validates internal tree consistency.
5463 ///
5464 /// . Primarily a debug/test tool.
5465 ///
5466 /// Rules checked:
5467 /// - An empty tree (no root) is trivially valid → returns `true`.
5468 /// - A non-empty tree must have a non-null root.
5469 /// - Every Internal node must have at least one entry.
5470 /// - Every child pointer that is `Some` must be readable (lock must be
5471 /// acquirable — i.e., no poisoned locks).
5472 ///
5473 /// Returns `true` if no inconsistencies are detected, `false` otherwise.
5474 pub fn validate_in_list(&self) -> bool {
5475 match self.get_root() {
5476 None => true, // empty tree is always valid
5477 Some(root) => Self::validate_node(&root),
5478 }
5479 }
5480
5481 fn validate_node(node_arc: &Arc<RwLock<TreeNode>>) -> bool {
5482 let guard = node_arc.read();
5483
5484 match &*guard {
5485 TreeNode::Bottom(_bin) => {
5486 // BIN nodes are always structurally valid at this level.
5487 true
5488 }
5489 TreeNode::Internal(n) => {
5490 // An Internal node must have at least one entry.
5491 if n.entries.is_empty() {
5492 return false;
5493 }
5494 // Collect child arcs before dropping the guard.
5495 let children: Vec<Arc<RwLock<TreeNode>>> =
5496 n.entries.iter().filter_map(|e| e.child.clone()).collect();
5497 drop(guard);
5498 // Recursively validate every resident child.
5499 for child in children {
5500 if !Self::validate_node(&child) {
5501 return false;
5502 }
5503 }
5504 true
5505 }
5506 }
5507 }
5508
5509 /// Traverses the tree to find the parent IN that contains `child_node_id`
5510 /// as one of its child slots.
5511 ///
5512 /// . Used by the cleaner
5513 /// migration path to re-insert migrated INs after eviction/fetch.
5514 ///
5515 /// Returns `(parent_arc, slot_index)` where `slot_index` is the position
5516 /// in the parent's `entries` vector whose child matches `child_node_id`,
5517 /// or `None` if no such parent is found.
5518 pub fn get_parent_in_for_child_in(
5519 &self,
5520 child_node_id: u64,
5521 ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
5522 let root = self.get_root()?;
5523 Self::find_parent_of_node_id(&root, child_node_id)
5524 }
5525
5526 /// Recursive DFS helper for `get_parent_in_for_child_in`.
5527 ///
5528 /// Scans every entry in each Internal node. When a child's node_id
5529 /// matches `target_id` the parent arc and slot index are returned.
5530 fn find_parent_of_node_id(
5531 node_arc: &Arc<RwLock<TreeNode>>,
5532 target_id: u64,
5533 ) -> Option<(Arc<RwLock<TreeNode>>, usize)> {
5534 let guard = node_arc.read();
5535
5536 let TreeNode::Internal(n) = &*guard else {
5537 // BIN nodes have no IN children — cannot be a parent of another IN.
5538 return None;
5539 };
5540
5541 // Check whether any child of this IN has the target node_id.
5542 let mut children: Vec<(usize, Arc<RwLock<TreeNode>>)> = Vec::new();
5543 for (slot, entry) in n.entries.iter().enumerate() {
5544 if let Some(child_arc) = &entry.child {
5545 // Read the child's node_id under a separate lock (acquire child
5546 // while parent guard is still held — this is intentional for
5547 // the ID comparison only; we release both immediately after).
5548 let child_id = {
5549 let cg = child_arc.read();
5550 match &*cg {
5551 TreeNode::Internal(cn) => cn.node_id,
5552 TreeNode::Bottom(cb) => cb.node_id,
5553 }
5554 };
5555
5556 if child_id == target_id {
5557 // Found — return a clone of this node as parent.
5558 let parent_clone = Arc::clone(node_arc);
5559 return Some((parent_clone, slot));
5560 }
5561
5562 // Not found at this slot; schedule this child for recursion.
5563 children.push((slot, Arc::clone(child_arc)));
5564 }
5565 }
5566 // Release parent guard before recursing.
5567 drop(guard);
5568
5569 // Recurse into each Internal child.
5570 for (_slot, child_arc) in children {
5571 if let Some(result) =
5572 Self::find_parent_of_node_id(&child_arc, target_id)
5573 {
5574 return Some(result);
5575 }
5576 }
5577
5578 None
5579 }
5580
5581 /// Propagates the dirty flag upward from `node_arc` to the root.
5582 ///
5583 /// Implicit dirty propagation: after modifying any node,
5584 /// all ancestors on the path to the root must also be marked dirty so
5585 /// the checkpointer logs them.
5586 ///
5587 /// In this happens through `IN.setDirty(true)` calls at each level
5588 /// during split/insert callbacks. Here we walk the weak parent chain.
5589 /// Reconstitute a BIN-delta by merging it onto a base full BIN.
5590 ///
5591 /// Implements JE `BINDelta.reconstituteBIN(databaseImpl)` for the recovery
5592 /// path where the log manager is not available as a `LogManager` but as
5593 /// raw serialized bytes.
5594 ///
5595 /// Algorithm:
5596 /// 1. Deserialise `base_bytes` as a full `BinStub`.
5597 /// 2. Apply `delta_bytes` slots onto the base using `BinStub::apply_delta`
5598 /// (raw slot overlay).
5599 /// 3. Recompute key prefix so prefix-compressed entries are consistent.
5600 ///
5601 /// Returns `None` if either byte slice is malformed.
5602 ///
5603 /// JE `BINDelta.reconstituteBIN` / `BINDelta.applyDelta`
5604 /// (DRIFT-10 / Stage 3).
5605 pub fn reconstitute_bin_delta(
5606 base_bytes: &[u8],
5607 delta_bytes: &[u8],
5608 ) -> Option<BinStub> {
5609 let mut base = BinStub::deserialize_full(base_bytes)?;
5610 // Apply the delta slots onto the base.
5611 // Note: BinStub::apply_delta uses slot-index addressing into base.entries,
5612 // extending with new entries when the slot_idx >= base.entries.len().
5613 // After apply_delta we recompute the key prefix to fix prefix compression.
5614 BinStub::apply_delta(&mut base, delta_bytes)?;
5615 // Recompute prefix so prefix-compressed BINs are consistent after merge.
5616 base.recompute_key_prefix();
5617 base.is_delta = false;
5618 base.dirty = false;
5619 Some(base)
5620 }
5621
5622 pub fn propagate_dirty_to_root(node_arc: &Arc<RwLock<TreeNode>>) {
5623 let parent_weak = { node_arc.read().get_parent() };
5624
5625 if let Some(parent_arc) = parent_weak.and_then(|w| w.upgrade()) {
5626 {
5627 let mut g = parent_arc.write();
5628 g.set_dirty(true);
5629 }
5630 // Recurse further up.
5631 Self::propagate_dirty_to_root(&parent_arc);
5632 }
5633 }
5634
5635 // ========================================================================
5636 // IN-redo: JE RecoveryManager.recoverIN / recoverRootIN / recoverChildIN
5637 // ========================================================================
5638
5639 /// Deserialise an upper-IN node from bytes produced by
5640 /// `TreeNode::write_to_bytes()` / `flush_one_tree_upper_ins`.
5641 ///
5642 /// Format: node_id(u64BE) | level(i32BE) | n_entries(u32BE) | dirty(u8)
5643 /// | per-entry: key_len(u16BE) | key | lsn(u64BE)
5644 ///
5645 /// JE `INFileReader.getIN(db)` / `IN.readFromLog`.
5646 pub fn deserialize_upper_in(bytes: &[u8]) -> Option<InNodeStub> {
5647 if bytes.len() < 13 {
5648 return None;
5649 }
5650 let node_id = u64::from_be_bytes(bytes[0..8].try_into().ok()?);
5651 let level = i32::from_be_bytes(bytes[8..12].try_into().ok()?);
5652 let n_entries =
5653 u32::from_be_bytes(bytes[12..16].try_into().ok()?) as usize;
5654 // dirty byte (1 byte after n_entries)
5655 if bytes.len() < 17 {
5656 return None;
5657 }
5658 let mut pos = 17usize; // skip node_id(8) + level(4) + n_entries(4) + dirty(1)
5659 let mut entries = Vec::with_capacity(n_entries);
5660 for _ in 0..n_entries {
5661 if pos + 2 > bytes.len() {
5662 return None;
5663 }
5664 let key_len =
5665 u16::from_be_bytes(bytes[pos..pos + 2].try_into().ok()?)
5666 as usize;
5667 pos += 2;
5668 if pos + key_len > bytes.len() {
5669 return None;
5670 }
5671 let key = bytes[pos..pos + key_len].to_vec();
5672 pos += key_len;
5673 if pos + 8 > bytes.len() {
5674 return None;
5675 }
5676 let lsn = noxu_util::Lsn::from_u64(u64::from_be_bytes(
5677 bytes[pos..pos + 8].try_into().ok()?,
5678 ));
5679 pos += 8;
5680 entries.push(InEntry { key, lsn, child: None });
5681 }
5682 Some(InNodeStub {
5683 node_id,
5684 level,
5685 entries,
5686 dirty: false,
5687 generation: 0,
5688 parent: None,
5689 })
5690 }
5691
5692 /// Deserialise a BIN from bytes produced by `BinStub::serialize_full()`.
5693 ///
5694 /// Thin wrapper so the recovery path does not need to import `BinStub`
5695 /// directly from callers that only have the raw bytes.
5696 ///
5697 /// JE `INFileReader.getIN(db)` for a BIN entry.
5698 pub fn deserialize_bin(bytes: &[u8]) -> Option<BinStub> {
5699 let mut bin = BinStub::deserialize_full(bytes)?;
5700 bin.dirty = false; // freshly loaded from log — clean for now
5701 Some(bin)
5702 }
5703
5704 /// Apply a logged IN/BIN to the in-memory tree during the recovery redo pass.
5705 ///
5706 /// Implements JE `RecoveryManager.recoverIN`:
5707 /// - `is_root` nodes are handled by `recover_root_in`.
5708 /// - non-root nodes are handled by `recover_child_in`.
5709 ///
5710 /// `log_lsn` is the LSN at which this IN/BIN was logged. The currency
5711 /// check in `recover_child_in` uses this to decide whether to replace the
5712 /// in-memory slot (tree slot LSN < log_lsn → replace; equal → noop;
5713 /// greater → skip).
5714 ///
5715 /// JE `RecoveryManager.recoverIN` / `replayOneIN`
5716 /// (RecoveryManager.java ~lines 1200–1280).
5717 pub fn recover_in_redo(
5718 &self,
5719 log_lsn: noxu_util::Lsn,
5720 is_root: bool,
5721 is_bin: bool,
5722 node_data: &[u8],
5723 ) -> InRedoResult {
5724 if is_bin {
5725 let Some(bin) = Self::deserialize_bin(node_data) else {
5726 return InRedoResult::DeserializeFailed;
5727 };
5728 if is_root {
5729 self.recover_root_bin(log_lsn, bin)
5730 } else {
5731 self.recover_child_bin(log_lsn, bin)
5732 }
5733 } else {
5734 let Some(upper) = Self::deserialize_upper_in(node_data) else {
5735 return InRedoResult::DeserializeFailed;
5736 };
5737 if is_root {
5738 self.recover_root_upper_in(log_lsn, upper)
5739 } else {
5740 self.recover_child_upper_in(log_lsn, upper)
5741 }
5742 }
5743 }
5744
5745 /// Recover a root BIN.
5746 ///
5747 /// If no root exists or the existing root is older (lower LSN), install
5748 /// this BIN as the new root.
5749 ///
5750 /// JE `RecoveryManager.recoverRootIN` / `RootUpdater.doWork`
5751 /// (RecoveryManager.java ~lines 1293–1410).
5752 fn recover_root_bin(
5753 &self,
5754 log_lsn: noxu_util::Lsn,
5755 bin: BinStub,
5756 ) -> InRedoResult {
5757 let mut root_guard = self.root.write();
5758 let existing_lsn = *self.root_log_lsn.read();
5759 match &*root_guard {
5760 None => {
5761 // No root — install this BIN as the root.
5762 // JE: `root == null` case in `RootUpdater.doWork`.
5763 let node = TreeNode::Bottom(bin);
5764 *root_guard = Some(Arc::new(RwLock::new(node)));
5765 *self.root_log_lsn.write() = log_lsn;
5766 InRedoResult::Inserted
5767 }
5768 Some(_) => {
5769 // JE: `originalLsn = root.getLsn()`; replace if logLsn > originalLsn.
5770 if log_lsn > existing_lsn {
5771 let node = TreeNode::Bottom(bin);
5772 *root_guard = Some(Arc::new(RwLock::new(node)));
5773 *self.root_log_lsn.write() = log_lsn;
5774 InRedoResult::Replaced
5775 } else {
5776 InRedoResult::Skipped
5777 }
5778 }
5779 }
5780 }
5781
5782 /// Recover a root upper IN.
5783 ///
5784 /// JE `RecoveryManager.recoverRootIN` for a non-BIN root.
5785 fn recover_root_upper_in(
5786 &self,
5787 log_lsn: noxu_util::Lsn,
5788 upper: InNodeStub,
5789 ) -> InRedoResult {
5790 let mut root_guard = self.root.write();
5791 let existing_lsn = *self.root_log_lsn.read();
5792 match &*root_guard {
5793 None => {
5794 let node = TreeNode::Internal(upper);
5795 *root_guard = Some(Arc::new(RwLock::new(node)));
5796 *self.root_log_lsn.write() = log_lsn;
5797 InRedoResult::Inserted
5798 }
5799 Some(_) => {
5800 if log_lsn > existing_lsn {
5801 let node = TreeNode::Internal(upper);
5802 *root_guard = Some(Arc::new(RwLock::new(node)));
5803 *self.root_log_lsn.write() = log_lsn;
5804 InRedoResult::Replaced
5805 } else {
5806 InRedoResult::Skipped
5807 }
5808 }
5809 }
5810 }
5811
5812 /// Recover a non-root BIN.
5813 ///
5814 /// Implements the three-case currency check from JE
5815 /// `RecoveryManager.recoverChildIN`
5816 /// (RecoveryManager.java lines 1412–1500):
5817 ///
5818 /// 1. Node not in tree: skip (parent logged a later structure that already
5819 /// omits this node, or node was deleted).
5820 /// 2. Physical match (slot LSN == log_lsn): noop — already current.
5821 /// 3. Logical match: another version of the node is in the slot.
5822 /// Replace if tree slot LSN < log_lsn (tree is older), skip otherwise.
5823 fn recover_child_bin(
5824 &self,
5825 log_lsn: noxu_util::Lsn,
5826 bin: BinStub,
5827 ) -> InRedoResult {
5828 let node_id = bin.node_id;
5829 let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
5830 else {
5831 // Case 1: not in tree.
5832 return InRedoResult::NotInTree;
5833 };
5834 let mut parent = parent_arc.write();
5835 let TreeNode::Internal(ref mut p) = *parent else {
5836 return InRedoResult::NotInTree;
5837 };
5838 let tree_lsn = p.entries[slot].lsn;
5839 if tree_lsn == log_lsn {
5840 // Case 2: physical match — noop.
5841 InRedoResult::Skipped
5842 } else if tree_lsn < log_lsn {
5843 // Case 3: logical match, tree is older — replace.
5844 // JE `parent.recoverIN(idx, inFromLog, logLsn, lastLoggedSize)`.
5845 let new_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin)));
5846 // Set parent back-pointer on the new node.
5847 {
5848 let mut ng = new_arc.write();
5849 if let TreeNode::Bottom(ref mut b) = *ng {
5850 b.parent = Some(Arc::downgrade(&parent_arc));
5851 }
5852 }
5853 p.entries[slot].child = Some(new_arc);
5854 p.entries[slot].lsn = log_lsn;
5855 InRedoResult::Replaced
5856 } else {
5857 // tree_lsn > log_lsn: tree already holds a newer version.
5858 InRedoResult::Skipped
5859 }
5860 }
5861
5862 /// Recover a non-root upper IN.
5863 ///
5864 /// JE `RecoveryManager.recoverChildIN` for a non-BIN node.
5865 fn recover_child_upper_in(
5866 &self,
5867 log_lsn: noxu_util::Lsn,
5868 upper: InNodeStub,
5869 ) -> InRedoResult {
5870 let node_id = upper.node_id;
5871 let Some((parent_arc, slot)) = self.get_parent_in_for_child_in(node_id)
5872 else {
5873 return InRedoResult::NotInTree;
5874 };
5875 let mut parent = parent_arc.write();
5876 let TreeNode::Internal(ref mut p) = *parent else {
5877 return InRedoResult::NotInTree;
5878 };
5879 let tree_lsn = p.entries[slot].lsn;
5880 if tree_lsn == log_lsn {
5881 InRedoResult::Skipped
5882 } else if tree_lsn < log_lsn {
5883 let new_arc = Arc::new(RwLock::new(TreeNode::Internal(upper)));
5884 {
5885 let mut ng = new_arc.write();
5886 if let TreeNode::Internal(ref mut n) = *ng {
5887 n.parent = Some(Arc::downgrade(&parent_arc));
5888 }
5889 }
5890 p.entries[slot].child = Some(new_arc);
5891 p.entries[slot].lsn = log_lsn;
5892 InRedoResult::Replaced
5893 } else {
5894 InRedoResult::Skipped
5895 }
5896 }
5897}
5898
/// Outcome of a single `recover_in_redo` call.
///
/// Each variant records what the redo handler did with the logged node.
/// JE traces the same outcomes in `RecoveryManager` debug logging.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InRedoResult {
    /// The tree had no root, so the node was installed as the new root.
    Inserted,
    /// The node replaced an older version: either the existing root or the
    /// child held in a parent slot.
    Replaced,
    /// The node was not applied because the tree already holds an equal or
    /// newer version.
    Skipped,
    /// No parent slot for the node was found in the tree (for example, the
    /// parent was logged with a later structure that excludes it).
    NotInTree,
    /// Deserialisation of the `node_data` bytes failed.
    DeserializeFailed,
}
5915
/// Process-wide counter from which node ids are allocated.
///
/// This is the SINGLE source of node-ids for the whole tree subsystem. The
/// BIN constructor (`bin.rs`) and `node.rs` route through `generate_node_id`
/// so that, after crash recovery, a freshly allocated node-id is always
/// strictly greater than every node-id present in the recovered log.
///
/// The counter holds the NEXT id to hand out and starts at 1.
///
/// JE ref: `NodeSequence.getNextLocalNodeId` (a single per-env counter) and
/// `IN.nodeId` allocation; `NodeSequence.initRealNodeId` seeds the counter
/// from the recovered `CheckpointEnd.lastLocalNodeId`. The env seeds this
/// counter post-recovery via `seed_node_id_counter`.
static NODE_ID_COUNTER: AtomicU64 = AtomicU64::new(1);
5929
5930/// Generates a unique node ID.
5931pub fn generate_node_id() -> u64 {
5932 NODE_ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::SeqCst)
5933}
5934
5935/// Returns the node-id that would be generated next (without allocating it).
5936///
5937/// Used by recovery seeding and by tests to assert no node-id reuse after a
5938/// restart.
5939pub fn peek_next_node_id_counter() -> u64 {
5940 NODE_ID_COUNTER.load(std::sync::atomic::Ordering::SeqCst)
5941}
5942
5943/// Seeds the node-id counter so the next generated id is `> last_node_id`.
5944///
5945/// Called by `EnvironmentImpl` after recovery with the recovered
5946/// `use_max_node_id`, mirroring `NodeSequence.initRealNodeId` /
5947/// `setLastNodeId`: post-restart allocation must never reuse a node-id that
5948/// is already in the log. Monotonic: never lowers the counter.
5949pub fn seed_node_id_counter(last_node_id: u64) {
5950 let want_next = last_node_id.saturating_add(1);
5951 // Bump only if our current next is below the recovered floor.
5952 let mut cur = NODE_ID_COUNTER.load(std::sync::atomic::Ordering::SeqCst);
5953 while cur < want_next {
5954 match NODE_ID_COUNTER.compare_exchange_weak(
5955 cur,
5956 want_next,
5957 std::sync::atomic::Ordering::SeqCst,
5958 std::sync::atomic::Ordering::SeqCst,
5959 ) {
5960 Ok(_) => break,
5961 Err(observed) => cur = observed,
5962 }
5963 }
5964}
5965
5966#[cfg(test)]
5967mod tests {
5968 use super::*;
5969
5970 #[test]
5971 fn test_empty_tree() {
5972 let tree = Tree::new(1, 128);
5973 assert!(tree.is_empty());
5974 assert_eq!(tree.get_database_id(), 1);
5975 assert_eq!(tree.get_root_splits(), 0);
5976 }
5977
5978 #[test]
5979 fn test_redo_insert_older_lsn_does_not_overwrite_newer_slot() {
5980 // REC-F2 reproduce-first: redo() must be idempotent w.r.t. slot
5981 // currency. JE RecoveryManager.redo() (line ~2512/2544) only
5982 // replaces a slot when logrecLsn > treeLsn. A later redo of an
5983 // OLDER committed LN for the same key must NOT revert the slot to
5984 // the older value or reset the slot LSN backward.
5985 let tree = Tree::new(1, 128);
5986 let key = b"k".to_vec();
5987
5988 // Install the newer version at LSN X (e.g. the BIN-logged value).
5989 let newer = Lsn::new(5, 500);
5990 tree.redo_insert(&key, b"new", newer).unwrap();
5991
5992 // Replay an OLDER committed LN at Y < X for the same key.
5993 let older = Lsn::new(2, 200);
5994 tree.redo_insert(&key, b"old", older).unwrap();
5995
5996 // The newer value and LSN must survive.
5997 let got = tree.search_with_data(&key).expect("key present");
5998 assert!(got.found);
5999 assert_eq!(
6000 got.data.as_deref(),
6001 Some(&b"new"[..]),
6002 "older-LSN redo reverted committed data"
6003 );
6004 assert_eq!(
6005 got.lsn,
6006 newer.as_u64(),
6007 "older-LSN redo reset slot LSN backward"
6008 );
6009
6010 // A redo at a strictly NEWER LSN must still replace (replace-only
6011 // when log_lsn > slot_lsn, matching JE lsnCmp > 0).
6012 let newest = Lsn::new(9, 900);
6013 tree.redo_insert(&key, b"newest", newest).unwrap();
6014 let got = tree.search_with_data(&key).expect("key present");
6015 assert_eq!(got.data.as_deref(), Some(&b"newest"[..]));
6016 assert_eq!(got.lsn, newest.as_u64());
6017 }
6018
6019 #[test]
6020 fn test_insert_single() {
6021 let tree = Tree::new(1, 128);
6022 let key = b"testkey".to_vec();
6023 let data = b"testdata".to_vec();
6024 let lsn = Lsn::new(1, 100);
6025
6026 let result = tree.insert(key.clone(), data, lsn);
6027 assert!(result.is_ok());
6028 assert!(result.unwrap()); // Should be a new insert
6029
6030 assert!(!tree.is_empty());
6031
6032 // Verify we can search for it
6033 let search_result = tree.search(&key);
6034 assert!(search_result.is_some());
6035 let sr = search_result.unwrap();
6036 assert!(sr.exact_parent_found || !sr.child_not_resident);
6037 }
6038
6039 #[test]
6040 fn test_insert_multiple() {
6041 let tree = Tree::new(1, 128);
6042
6043 let keys = vec![
6044 b"apple".to_vec(),
6045 b"banana".to_vec(),
6046 b"cherry".to_vec(),
6047 b"date".to_vec(),
6048 ];
6049
6050 for (i, key) in keys.iter().enumerate() {
6051 let data = format!("data{}", i).into_bytes();
6052 let lsn = Lsn::new(1, 100 + (i as u32) * 10);
6053 let result = tree.insert(key.clone(), data, lsn);
6054 assert!(result.is_ok());
6055 assert!(result.unwrap()); // All should be new inserts
6056 }
6057
6058 // Verify we can search for each
6059 for key in &keys {
6060 let search_result = tree.search(key);
6061 assert!(search_result.is_some());
6062 }
6063 }
6064
6065 #[test]
6066 fn test_insert_duplicate_key() {
6067 let tree = Tree::new(1, 128);
6068 let key = b"duplicate".to_vec();
6069 let data1 = b"first".to_vec();
6070 let data2 = b"second".to_vec();
6071 let lsn1 = Lsn::new(1, 100);
6072 let lsn2 = Lsn::new(1, 200);
6073
6074 // First insert
6075 let result1 = tree.insert(key.clone(), data1, lsn1);
6076 assert!(result1.is_ok());
6077 assert!(result1.unwrap()); // New insert
6078
6079 // Second insert with same key - should be update
6080 let result2 = tree.insert(key, data2, lsn2);
6081 assert!(result2.is_ok());
6082 assert!(!result2.unwrap()); // Update, not new insert
6083 }
6084
6085 #[test]
6086 fn test_search_empty_tree() {
6087 let tree = Tree::new(1, 128);
6088 let key = b"noexist".to_vec();
6089
6090 let result = tree.search(&key);
6091 assert!(result.is_none());
6092 }
6093
6094 #[test]
6095 fn test_first_and_last_node() {
6096 let tree = Tree::new(1, 128);
6097
6098 // Empty tree
6099 assert!(tree.get_first_node().is_none());
6100 assert!(tree.get_last_node().is_none());
6101
6102 // Insert some keys
6103 let keys = [b"a".to_vec(), b"b".to_vec(), b"c".to_vec()];
6104 for (i, key) in keys.iter().enumerate() {
6105 let data = format!("data{}", i).into_bytes();
6106 let lsn = Lsn::new(1, 100 + (i as u32) * 10);
6107 tree.insert(key.clone(), data, lsn).unwrap();
6108 }
6109
6110 // Now should have first and last
6111 let first = tree.get_first_node();
6112 assert!(first.is_some());
6113 assert_eq!(first.unwrap().index, 0);
6114
6115 let last = tree.get_last_node();
6116 assert!(last.is_some());
6117 assert_eq!(last.unwrap().index, 2);
6118 }
6119
6120 #[test]
6121 fn test_node_id_generation() {
6122 let id1 = generate_node_id();
6123 let id2 = generate_node_id();
6124 let id3 = generate_node_id();
6125
6126 assert!(id2 > id1);
6127 assert!(id3 > id2);
6128 }
6129
6130 #[test]
6131 fn test_tree_node_is_bin() {
6132 let bin = TreeNode::Bottom(BinStub {
6133 node_id: 1,
6134 level: BIN_LEVEL,
6135 entries: vec![],
6136 key_prefix: Vec::new(),
6137 dirty: false,
6138 is_delta: false,
6139 last_full_lsn: NULL_LSN,
6140 last_delta_lsn: NULL_LSN,
6141 generation: 0,
6142 parent: None,
6143 expiration_in_hours: true,
6144 cursor_count: 0,
6145 prohibit_next_delta: false,
6146 });
6147 assert!(bin.is_bin());
6148 assert_eq!(bin.level(), BIN_LEVEL);
6149
6150 let internal = TreeNode::Internal(InNodeStub {
6151 node_id: 2,
6152 level: MAIN_LEVEL + 2,
6153 entries: vec![],
6154 dirty: false,
6155 generation: 0,
6156 parent: None,
6157 });
6158 assert!(!internal.is_bin());
6159 assert_eq!(internal.level(), MAIN_LEVEL + 2);
6160 }
6161
6162 #[test]
6163 fn test_find_entry() {
6164 let mut entries = vec![];
6165 for i in 0..5 {
6166 entries.push(BinEntry {
6167 key: format!("key{}", i).into_bytes(),
6168 lsn: Lsn::new(1, 100 + i),
6169 data: Some(vec![]),
6170 known_deleted: false,
6171 dirty: false,
6172 expiration_time: 0,
6173 });
6174 }
6175
6176 let bin = TreeNode::Bottom(BinStub {
6177 node_id: 1,
6178 level: BIN_LEVEL,
6179 entries,
6180 key_prefix: Vec::new(),
6181 dirty: false,
6182 is_delta: false,
6183 last_full_lsn: NULL_LSN,
6184 last_delta_lsn: NULL_LSN,
6185 generation: 0,
6186 parent: None,
6187 expiration_in_hours: true,
6188 cursor_count: 0,
6189 prohibit_next_delta: false,
6190 });
6191
6192 // Search for existing key
6193 let result = bin.find_entry(b"key2", false, true);
6194 assert_eq!(result & 0xFFFF, 2);
6195 assert_ne!(result & EXACT_MATCH, 0);
6196
6197 // Search for non-existing key with exact=false
6198 let result = bin.find_entry(b"key15", false, false);
6199 assert_eq!(result & 0xFFFF, 2); // Would go between key1 and key2
6200 assert_eq!(result & EXACT_MATCH, 0);
6201 }
6202
6203 #[test]
6204 fn test_insert_until_full() {
6205 // With splits implemented, inserting beyond max_entries_per_node must
6206 // succeed (the tree splits proactively rather than returning an error).
6207 let tree = Tree::new(1, 3); // Small max to exercise splits
6208
6209 // Insert up to max
6210 for i in 0..3 {
6211 let key = format!("key{}", i).into_bytes();
6212 let data = format!("data{}", i).into_bytes();
6213 let lsn = Lsn::new(1, 100 + i);
6214 let result = tree.insert(key, data, lsn);
6215 assert!(result.is_ok(), "insert {} should succeed", i);
6216 }
6217
6218 // The 4th insert triggers a split and must also succeed.
6219 let key = b"key3".to_vec();
6220 let data = b"data3".to_vec();
6221 let lsn = Lsn::new(1, 103);
6222 let result = tree.insert(key.clone(), data, lsn);
6223 assert!(
6224 result.is_ok(),
6225 "insert after full should trigger split and succeed"
6226 );
6227 assert!(result.unwrap(), "should be a new insert");
6228
6229 // The inserted key must be findable after the split.
6230 let sr = tree.search(&key);
6231 assert!(sr.is_some(), "key3 must be searchable after split");
6232 assert!(sr.unwrap().exact_parent_found, "key3 must be found exactly");
6233 }
6234
6235 #[test]
6236 fn test_memory_counter_balanced_on_insert_delete_f8() {
6237 use std::sync::Arc;
6238 use std::sync::atomic::{AtomicI64, Ordering};
6239 // F8 regression: insert accounts key+data+48; delete must subtract the
6240 // SAME, so an insert+delete of the same record returns the counter to
6241 // its starting value (previously delete omitted data_len -> the counter
6242 // leaked data_len per delete, biasing the evictor over-budget view).
6243 let mut tree = Tree::new(1, 16);
6244 let counter = Arc::new(AtomicI64::new(0));
6245 tree.set_memory_counter(Arc::clone(&counter));
6246
6247 let key = b"a-key".to_vec();
6248 let data = vec![0u8; 200]; // non-trivial data length
6249 tree.insert(key.clone(), data.clone(), Lsn::new(0, 10)).unwrap();
6250 let after_insert = counter.load(Ordering::Relaxed);
6251 assert!(after_insert > 0, "insert must increase the counter");
6252 assert_eq!(
6253 after_insert,
6254 (key.len() + data.len() + BIN_ENTRY_OVERHEAD) as i64,
6255 "insert accounts key + data + per-slot BinEntry overhead"
6256 );
6257
6258 let deleted = tree.delete(&key);
6259 assert!(deleted);
6260 assert_eq!(
6261 counter.load(Ordering::Relaxed),
6262 0,
6263 "F8: delete must subtract key + data + BIN_ENTRY_OVERHEAD, returning the counter to its pre-insert value (no data_len leak)"
6264 );
6265 }
6266
6267 /// EV-13 (pass-post): a full-node detach must ACTUALLY drop the child
6268 /// `Arc` from the parent IN, not merely credit bytes. Before the fix the
6269 /// evictor credited `node_size_fn(node_id)` and removed the node from the
6270 /// LRU list, but the parent's `InEntry.child` still held a strong `Arc`,
6271 /// so the node was never freed (phantom free) and the budget over-credited.
6272 ///
6273 /// This test proves: after `detach_node_by_id` the held child `Arc` is the
6274 /// LAST strong reference (strong_count == 1), the parent slot's `child` is
6275 /// `None`, and the returned bytes equal the node's measured heap size.
6276 ///
6277 /// JE ref: `IN.detachNode` (`setTarget(idx, null)`) / `Evictor.evict`.
6278 #[test]
6279 fn test_ev13_detach_actually_frees_child() {
6280 // Tiny fanout forces a root split so we get a real IN parent with BIN
6281 // children that the evictor would target.
6282 let tree = Tree::new(7, 4);
6283 for i in 0u8..12 {
6284 tree.insert(
6285 vec![b'a' + i],
6286 vec![i; 8],
6287 Lsn::new(1, u32::from(i) + 1),
6288 )
6289 .unwrap();
6290 }
6291
6292 // Find a BIN child of the root IN (the eviction target) + its parent.
6293 let root = tree.get_root().expect("tree must have a root");
6294 let (parent_arc, child_idx, bin_id, expected_bytes) = {
6295 let rg = root.read();
6296 let TreeNode::Internal(n) = &*rg else {
6297 panic!("root must be an IN after split");
6298 };
6299 // Pick the first slot whose child is a resident BIN.
6300 let (idx, child) = n
6301 .entries
6302 .iter()
6303 .enumerate()
6304 .find_map(|(i, e)| e.child.as_ref().map(|c| (i, c.clone())))
6305 .expect("root must have a resident child");
6306 let (id, bytes) = {
6307 let cg = child.read();
6308 (
6309 match &*cg {
6310 TreeNode::Bottom(b) => b.node_id,
6311 TreeNode::Internal(n2) => n2.node_id,
6312 },
6313 cg.budgeted_memory_size(),
6314 )
6315 };
6316 (Arc::clone(&root), idx, id, bytes)
6317 };
6318
6319 // Hold an external strong reference to the child so we can observe its
6320 // strong_count drop when detach releases the parent's reference.
6321 let child_arc = {
6322 let pg = parent_arc.read();
6323 let TreeNode::Internal(n) = &*pg else { unreachable!() };
6324 Arc::clone(n.entries[child_idx].child.as_ref().unwrap())
6325 };
6326 // Two strong refs now: the parent slot + our test handle.
6327 assert_eq!(
6328 Arc::strong_count(&child_arc),
6329 2,
6330 "precondition: parent slot + test handle hold the child"
6331 );
6332
6333 let freed = tree.detach_node_by_id(bin_id);
6334
6335 // 1. Bytes credited equal the measured heap size (no phantom credit).
6336 assert_eq!(
6337 freed, expected_bytes,
6338 "detach must credit the node's real measured heap size"
6339 );
6340 // 2. The parent slot's child is now None (JE setTarget(idx, null)).
6341 {
6342 let pg = parent_arc.read();
6343 let TreeNode::Internal(n) = &*pg else { unreachable!() };
6344 assert!(
6345 n.entries[child_idx].child.is_none(),
6346 "EV-13: parent slot must be detached (child == None)"
6347 );
6348 // The slot itself (key + LSN) is retained for re-fetch.
6349 assert!(
6350 !n.entries[child_idx].lsn.is_null(),
6351 "detach keeps the slot LSN so the node can be re-fetched"
6352 );
6353 }
6354 // 3. Our handle is now the ONLY strong reference -> the parent really
6355 // dropped its Arc; the node is freed when we drop `child_arc`.
6356 // Before EV-13 this would be 2 (parent still held it) = phantom free.
6357 assert_eq!(
6358 Arc::strong_count(&child_arc),
6359 1,
6360 "EV-13: detach must drop the parent's strong Arc (no phantom free)"
6361 );
6362 }
6363
/// EV-13: `detach_node_by_id` must leave the shared memory counter alone.
///
/// Crediting the freed bytes is the evictor's job (via
/// `Arbiter::release_memory`). If detach decremented the counter as well, the
/// same bytes would be credited twice and `cache_usage` would drift below
/// reality.
#[test]
fn test_ev13_detach_does_not_touch_counter() {
    use std::sync::atomic::{AtomicI64, Ordering};

    let mut tree = Tree::new(8, 4);
    let counter = Arc::new(AtomicI64::new(0));
    tree.set_memory_counter(Arc::clone(&counter));

    // Populate the tree. The let-else further down requires the root to be an
    // Internal node once these inserts are done.
    for i in 0u8..12 {
        let key = vec![b'a' + i];
        let data = vec![i; 8];
        let lsn = Lsn::new(1, u32::from(i) + 1);
        tree.insert(key, data, lsn).unwrap();
    }
    let counter_before_detach = counter.load(Ordering::Relaxed);

    // Pick the id of the first child that is still resident under the root.
    let root = tree.get_root().unwrap();
    let victim_id = {
        let root_guard = root.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            unreachable!()
        };
        let child = root_in
            .entries
            .iter()
            .find_map(|slot| slot.child.clone())
            .expect("resident child");
        let child_guard = child.read();
        match &*child_guard {
            TreeNode::Internal(inner) => inner.node_id,
            TreeNode::Bottom(bin) => bin.node_id,
        }
    };

    let freed = tree.detach_node_by_id(victim_id);
    assert!(freed > 0, "detach must free a resident child");

    // The counter is read again only after detach has returned.
    let counter_after_detach = counter.load(Ordering::Relaxed);
    assert_eq!(
        counter_after_detach, counter_before_detach,
        "EV-13: detach must not change the counter (evictor credits once)"
    );
}
6407
/// EV-13: `detach_node_by_id` returns 0 both for the root's own id and for an
/// id that no node carries.
#[test]
fn test_ev13_detach_root_or_missing_is_noop() {
    let tree = Tree::new(9, 4);
    (0u8..12).for_each(|i| {
        tree.insert(vec![b'a' + i], vec![i; 8], Lsn::new(1, u32::from(i) + 1))
            .unwrap();
    });

    // Read the root's id in its own scope so the root handle and its read
    // guard are released before detach is attempted.
    let root_id = {
        let root = tree.get_root().unwrap();
        let guard = root.read();
        match &*guard {
            TreeNode::Bottom(bin) => bin.node_id,
            TreeNode::Internal(inner) => inner.node_id,
        }
    };

    let cases = [
        (root_id, "root has no parent IN -> detach is a no-op"),
        (u64::MAX, "unknown node id -> detach is a no-op"),
    ];
    for (node_id, why) in cases {
        assert_eq!(tree.detach_node_by_id(node_id), 0, "{why}");
    }
}
6439
/// DBI-23 (pass-post): the live `memory_counter` must APPROXIMATE the real
/// in-memory heap of the tree, not the old `key + data + 48` lower bound.
///
/// JE keeps `inMemorySize` (`IN.getBudgetedMemorySize`) in lock-step with
/// the per-node `computeMemorySize`; the over-budget arbiter sees the real
/// figure so eviction fires at the right time. The previous Noxu live
/// path undercounted each BIN slot (48 vs the 64-byte `BinEntry` struct)
/// and never accounted the node-struct fixed overhead, so the counter ran
/// below real heap and the evictor under-fired.
///
/// The test asserts three things, in order:
/// 1. the live counter is not negative (checked explicitly so a negative
///    value is reported as such instead of sign-wrapping into a huge `u64`);
/// 2. it exceeds the old `key + data + 48` per-slot lower bound;
/// 3. it lies within `[80% of real, real]`, where `real` is
///    `total_budgeted_memory()` (the walk-and-sum oracle). The residual gap
///    is the per-node fixed struct overhead (BinStub/InNodeStub), which is
///    intentionally not tracked incrementally.
#[test]
fn test_dbi23_live_counter_approximates_real_heap() {
    use std::sync::atomic::{AtomicI64, Ordering};

    let mut tree = Tree::new(42, 32);
    let counter = Arc::new(AtomicI64::new(0));
    tree.set_memory_counter(Arc::clone(&counter));

    // Insert N entries with realistic key+data sizes.
    let n = 400u32;
    for i in 0..n {
        let key = format!("key-{i:08}").into_bytes(); // 12 bytes
        let data = vec![0u8; 64]; // 64 bytes
        tree.insert(key, data, Lsn::new(1, i + 1)).unwrap();
    }

    // A plain `as u64` cast would turn a negative counter into a value near
    // u64::MAX and surface as a misleading tolerance failure further down.
    let raw_live = counter.load(Ordering::Relaxed);
    assert!(
        raw_live >= 0,
        "DBI-23: live counter must never go negative (got {raw_live})"
    );
    let live = raw_live.unsigned_abs();
    let real = tree.total_budgeted_memory();

    // Old formula per slot was key + data + 48; the per-slot struct alone is
    // 64, plus the node-struct overhead the old path ignored entirely.
    let old_lower_bound: u64 = (0..n)
        .map(|i| {
            let key_len = format!("key-{i:08}").len();
            (key_len + 64 + 48) as u64 // old: key + data + 48
        })
        .sum();

    assert!(
        live > old_lower_bound,
        "DBI-23: live counter ({live}) must exceed the old key+data+48 \
         lower bound ({old_lower_bound})"
    );

    // Within tolerance of real heap: at least 80% of it and never above it.
    let lower = real * 80 / 100;
    assert!(
        live >= lower && live <= real,
        "DBI-23: live counter ({live}) must approximate real heap ({real}) \
         within tolerance [{lower}, {real}]"
    );
}
6500
/// Deleting a present key succeeds exactly once: the first `delete` returns
/// `true`, a repeat on the same key returns `false`.
#[test]
fn test_delete_existing_key() {
    let tree = Tree::new(1, 128);
    tree.insert(b"remove_me".to_vec(), b"val".to_vec(), Lsn::new(1, 10))
        .unwrap();

    let first_attempt = tree.delete(b"remove_me");
    assert!(first_attempt);

    // The key is gone now, so there is nothing left to delete.
    let second_attempt = tree.delete(b"remove_me");
    assert!(!second_attempt);
}
6512
/// `delete` returns `false` for a key that was never inserted into a
/// non-empty tree.
#[test]
fn test_delete_nonexistent_key() {
    let tree = Tree::new(1, 128);
    let (present_key, present_value) = (b"a".to_vec(), b"v".to_vec());
    tree.insert(present_key, present_value, Lsn::new(1, 1)).unwrap();

    let removed = tree.delete(b"zzz");
    assert!(!removed);
}
6520
/// `delete` on a tree that has never had an insert returns `false`.
#[test]
fn test_delete_empty_tree() {
    let tree = Tree::new(1, 128);
    let removed = tree.delete(b"nothing");
    assert!(!removed);
}
6526
/// Deleting every key leaves the tree with a root but no first node:
/// `is_empty()` stays `false` while `get_first_node()` yields `None`.
#[test]
fn test_delete_all_entries_makes_bin_empty() {
    let tree = Tree::new(1, 128);

    for (key, value, seq) in [(b"x", b"1", 1), (b"y", b"2", 2)] {
        tree.insert(key.to_vec(), value.to_vec(), Lsn::new(1, seq))
            .unwrap();
    }
    for key in [b"x", b"y"] {
        assert!(tree.delete(key));
    }

    // Tree still has a root (empty BIN), so is_empty() returns false.
    assert!(!tree.is_empty());
    // get_first_node should return None for an empty BIN.
    assert!(tree.get_first_node().is_none());
}
6541
/// `get_root` is `None` on a fresh tree and `Some` once `set_root` has
/// installed a node (here an empty BIN).
#[test]
fn test_set_root_and_get_root() {
    let tree = Tree::new(1, 128);
    assert!(tree.get_root().is_none());

    let empty_bin = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        parent: None,
        entries: vec![],
        key_prefix: Vec::new(),
        generation: 0,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    };
    tree.set_root(TreeNode::Bottom(empty_bin));

    assert!(tree.get_root().is_some());
}
6565
6566 // ========================================================================
6567 // Split / multi-level insert tests (new)
6568 // ========================================================================
6569
/// Inserting enough keys makes the root IN itself split, leaving a tree whose
/// root sits above level 2.
///
/// Sizing rationale, as given by the original author for
/// `max_entries_per_node = 4`:
/// - each BIN holds 4 entries before it is split;
/// - the level-2 root IN holds up to 4 BIN children;
/// - filling those 4 BINs (16 entries) and adding a 17th forces the root IN
///   to split, creating a level-3 root.
///
/// The test inserts 20 keys, i.e. a few more than that estimate.
#[test]
fn test_insert_forces_root_split() {
    let tree = Tree::new(1, 4);

    // 20 inserts with fanout 4.
    for i in 0u32..20 {
        let outcome = tree.insert(
            format!("key{:04}", i).into_bytes(),
            format!("data{}", i).into_bytes(),
            Lsn::new(1, 100 + i),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", i);
    }

    // The root-split counter must have moved.
    let root_splits = tree.get_root_splits();
    assert!(
        root_splits > 0,
        "expected at least one root split after 20 inserts with fanout 4"
    );

    // The root must now sit above level 2.
    let level_2 = MAIN_LEVEL | 2;
    let root_level = tree.get_root().unwrap().read().level();
    assert!(
        root_level > level_2,
        "root level {} must be > level-2 after root split",
        root_level
    );
}
6607
/// Bulk insert of 1000 ascending keys (fanout 8); every key must afterwards
/// be found by `search` with `exact_parent_found` set.
#[test]
fn test_insert_many_keys() {
    let tree = Tree::new(1, 8);
    let n = 1000u32;
    let key_for = |i: u32| format!("key{:08}", i).into_bytes();

    for i in 0..n {
        let outcome = tree.insert(
            key_for(i),
            format!("data{}", i).into_bytes(),
            Lsn::new(1, i),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", i);
    }

    // Second pass: look every key up again.
    for i in 0..n {
        let found = tree
            .search(&key_for(i))
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "key{:08} must be found after bulk insert", i);
    }
}
6633
/// Inserts 500 keys in descending order (a simple stand-in for a non-sorted
/// workload) and then checks that each one is found by `search`.
#[test]
fn test_insert_random_keys() {
    let tree = Tree::new(1, 8);
    let n = 500u32;
    let key_for = |i: u32| format!("rkey{:08}", i).into_bytes();

    // Highest key first, lowest key last.
    for i in (0..n).rev() {
        let outcome = tree.insert(
            key_for(i),
            format!("data{}", i).into_bytes(),
            Lsn::new(1, i),
        );
        assert!(outcome.is_ok(), "insert {} must succeed", i);
    }

    for i in 0..n {
        let found = tree
            .search(&key_for(i))
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "rkey{:08} must be found", i);
    }
}
6660
/// No key may be lost to a split: with a fanout of 3 and 60 inserts, every
/// inserted key must still be found by `search` at the end.
#[test]
fn test_split_preserves_all_keys() {
    // Tiny fanout to maximise split frequency.
    let tree = Tree::new(1, 3);
    let n = 60u32;

    // Build the key set up front so the lookup pass can reuse it.
    let keys: Vec<Vec<u8>> = (0..n)
        .map(|i| format!("sk{:04}", i).into_bytes())
        .collect();

    for (i, key) in (0..n).zip(&keys) {
        let outcome = tree.insert(
            key.clone(),
            format!("d{}", i).into_bytes(),
            Lsn::new(1, i),
        );
        assert!(outcome.is_ok(), "insert {} must not fail", i);
    }

    // Every key must have survived whatever splits the inserts induced.
    for key in &keys {
        let found = tree
            .search(key)
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(
            found,
            "key {:?} must survive all splits",
            std::str::from_utf8(key).unwrap_or("?")
        );
    }
}
6690
/// The tree must gain height as keys are inserted: after 40 inserts with
/// fanout 4 the root has split at least once and sits above level 2.
#[test]
fn test_tree_height_grows() {
    let tree = Tree::new(1, 4);

    // 40 keys is the original author's choice for "enough to split the
    // level-2 root at least once" at this fanout.
    let n = 40u32;
    (0..n).for_each(|i| {
        tree.insert(
            format!("hk{:08}", i).into_bytes(),
            format!("d{}", i).into_bytes(),
            Lsn::new(1, i),
        )
        .unwrap();
    });

    let root_splits = tree.get_root_splits();
    assert!(
        root_splits > 0,
        "expected root to have split at least once for {} keys with fanout 4",
        n
    );

    // The root level must exceed level 2, i.e. the tree has a third level.
    let level_2 = MAIN_LEVEL | 2;
    let root_level = tree.get_root().unwrap().read().level();
    assert!(
        root_level > level_2,
        "root level {} must be > {} after enough inserts",
        root_level,
        level_2
    );
}
6725
/// Exact-mode `find_entry` on an Internal node with separators k0..k3:
/// a present key reports its slot with `EXACT_MATCH` set, an absent key
/// yields -1.
#[test]
fn test_find_entry_on_internal_node() {
    let entries: Vec<InEntry> = (0..4)
        .map(|i| InEntry {
            key: format!("k{}", i).into_bytes(),
            lsn: Lsn::new(1, 10 + i),
            child: None,
        })
        .collect();
    let internal = TreeNode::Internal(InNodeStub {
        node_id: 1,
        level: MAIN_LEVEL + 2,
        parent: None,
        generation: 0,
        dirty: false,
        entries,
    });

    // Present key: flag bit set, low 16 bits carry the slot index.
    let hit = internal.find_entry(b"k2", false, true);
    assert_ne!(hit & EXACT_MATCH, 0);
    assert_eq!(hit & 0xFFFF, 2);

    // Absent key with exact=true.
    let miss = internal.find_entry(b"kx", false, true);
    assert_eq!(miss, -1);
}
6754
// St-H5: in non-exact mode, `find_entry` on an Internal node must answer with
// the FLOOR child slot (largest separator <= key) rather than the insertion
// point. Separators are k0,k1,k2,k3; slot 0 is the leftmost child.
#[test]
fn test_find_entry_internal_nonexact_returns_floor() {
    let entries: Vec<InEntry> = (0..4)
        .map(|i| InEntry {
            key: format!("k{}", i).into_bytes(),
            lsn: Lsn::new(1, 10 + i),
            child: None,
        })
        .collect();
    let internal = TreeNode::Internal(InNodeStub {
        node_id: 1,
        level: MAIN_LEVEL + 2,
        parent: None,
        generation: 0,
        dirty: false,
        entries,
    });

    // (probe key, expected floor slot)
    let floor_cases = [
        (&b"a"[..], 0),   // below every separator -> leftmost child
        (&b"k1x"[..], 1), // between k1 and k2 -> k1
        (&b"zzz"[..], 3), // above every separator -> last slot
    ];
    for (probe, expected_slot) in floor_cases {
        assert_eq!(
            internal.find_entry(probe, false, false) & 0xFFFF,
            expected_slot
        );
    }

    // An exact hit is still flagged and reported at its own slot.
    let exact = internal.find_entry(b"k2", false, false);
    assert_ne!(exact & EXACT_MATCH, 0);
    assert_eq!(exact & 0xFFFF, 2);
}
6788
6789 // ========================================================================
6790 // New tests: dirty tracking, generation, parent pointers, log size, stats
6791 // ========================================================================
6792
/// A single insert must leave the BIN under the root marked dirty.
///
/// JE reference (per the original note): `Tree.insertLN()` calls
/// `bin.setDirty(true)` after each insert.
#[test]
fn test_insert_marks_bin_dirty() {
    let tree = Tree::new(1, 128);
    tree.insert(b"key1".to_vec(), b"val1".to_vec(), Lsn::new(1, 1))
        .unwrap();

    // The root is expected to be an upper IN whose slot 0 holds the BIN.
    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root");
        };
        root_in.entries[0].child.clone().unwrap()
    };

    let bin_guard = bin_arc.read();
    assert!(bin_guard.is_dirty(), "BIN must be dirty after insert");
}
6815
/// Overwriting an existing key leaves the BIN under the root dirty.
#[test]
fn test_update_keeps_bin_dirty() {
    let tree = Tree::new(1, 128);

    // Same key twice: the second insert acts as an update.
    for (value, seq) in [(b"v1", 1), (b"v2", 2)] {
        tree.insert(b"k".to_vec(), value.to_vec(), Lsn::new(1, seq))
            .unwrap();
    }

    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root");
        };
        root_in.entries[0].child.clone().unwrap()
    };

    assert!(bin_arc.read().is_dirty(), "BIN must be dirty after update");
}
6835
/// `delete` must set the BIN's dirty flag again, even when the flag was
/// cleared by hand beforehand.
#[test]
fn test_delete_marks_bin_dirty() {
    let tree = Tree::new(1, 128);
    tree.insert(b"del".to_vec(), b"val".to_vec(), Lsn::new(1, 1))
        .unwrap();

    // Fetches the current root and returns the child held in its slot 0.
    let first_bin = || {
        let root_arc = tree.get_root().unwrap();
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root");
        };
        root_in.entries[0].child.clone().unwrap()
    };

    // Manually clear the dirty flag so the delete has something to re-set.
    {
        let bin_arc = first_bin();
        bin_arc.write().set_dirty(false);
        assert!(!bin_arc.read().is_dirty());
    }

    tree.delete(b"del");

    let bin_arc = first_bin();
    assert!(bin_arc.read().is_dirty(), "BIN must be dirty after delete");
}
6870
/// After the first insert, the BIN's parent pointer must exist and must
/// upgrade to the very same allocation as the tree's root.
#[test]
fn test_bin_parent_pointer_set_on_initial_insert() {
    let tree = Tree::new(1, 128);
    tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
        .unwrap();

    let root_arc = tree.get_root().unwrap();
    let bin_arc = {
        let root_guard = root_arc.read();
        let TreeNode::Internal(root_in) = &*root_guard else {
            panic!("expected Internal root");
        };
        root_in.entries[0].child.clone().unwrap()
    };

    let maybe_parent = bin_arc.read().get_parent();
    let Some(parent_weak) = maybe_parent else {
        panic!("BIN must have a parent pointer");
    };

    // Identity check, not structural equality: same Arc allocation.
    let parent_arc = parent_weak.upgrade().unwrap();
    assert!(
        Arc::ptr_eq(&parent_arc, &root_arc),
        "BIN parent must be the root IN"
    );
}
6896
/// `set_dirty` followed by `is_dirty` round-trips on both `TreeNode`
/// variants (Bottom and Internal).
#[test]
fn test_dirty_flag_roundtrip() {
    // Bottom variant: starts clean, then toggled on and back off.
    let mut bin_node = TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        entries: vec![],
        key_prefix: Vec::new(),
        generation: 0,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    });
    assert!(!bin_node.is_dirty());
    for flag in [true, false] {
        bin_node.set_dirty(flag);
        assert_eq!(bin_node.is_dirty(), flag);
    }

    // Internal variant: starts clean, then toggled on.
    let mut in_node = TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        parent: None,
        entries: vec![],
        generation: 0,
        dirty: false,
    });
    assert!(!in_node.is_dirty());
    in_node.set_dirty(true);
    assert!(in_node.is_dirty());
}
6933
/// `set_generation` followed by `get_generation` round-trips on both
/// `TreeNode` variants (Bottom and Internal).
#[test]
fn test_generation_roundtrip() {
    // Bottom variant: reports the constructed value, then the updated one.
    let bin_stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        entries: vec![],
        key_prefix: Vec::new(),
        generation: 0,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    };
    let mut bin_node = TreeNode::Bottom(bin_stub);
    assert_eq!(bin_node.get_generation(), 0);
    bin_node.set_generation(42);
    assert_eq!(bin_node.get_generation(), 42);

    // Internal variant: only the updated value is checked.
    let in_stub = InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        parent: None,
        entries: vec![],
        generation: 0,
        dirty: false,
    };
    let mut in_node = TreeNode::Internal(in_stub);
    in_node.set_generation(99);
    assert_eq!(in_node.get_generation(), 99);
}
6967
/// `log_size()` must equal the length of `write_to_bytes()` for a BIN with
/// entries (one with data, one without) and for an IN with entries.
#[test]
fn test_log_size_matches_bytes_len() {
    // BIN stub: one entry carrying data, one entry with `data: None`.
    let with_data = BinEntry {
        key: b"alpha".to_vec(),
        lsn: Lsn::new(1, 10),
        data: Some(b"d1".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let without_data = BinEntry {
        key: b"beta".to_vec(),
        lsn: Lsn::new(1, 20),
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let bin_node = TreeNode::Bottom(BinStub {
        node_id: 7,
        level: BIN_LEVEL,
        parent: None,
        entries: vec![with_data, without_data],
        key_prefix: Vec::new(),
        generation: 5,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    });

    // IN stub: an empty-key slot followed by a keyed slot.
    let in_node = TreeNode::Internal(InNodeStub {
        node_id: 8,
        level: MAIN_LEVEL | 2,
        parent: None,
        entries: vec![
            InEntry {
                key: vec![],
                lsn: Lsn::new(1, 1),
                child: None,
            },
            InEntry {
                key: b"mid".to_vec(),
                lsn: Lsn::new(1, 2),
                child: None,
            },
        ],
        generation: 0,
        dirty: false,
    });

    for node in [bin_node, in_node] {
        assert_eq!(node.log_size(), node.write_to_bytes().len());
    }
}
7024
/// `write_to_bytes()` must start with the node id in big-endian order and
/// carry the dirty flag at byte offset 16.
#[test]
fn test_write_to_bytes_encodes_node_id_and_dirty() {
    let stub = BinStub {
        node_id: 0xDEAD_BEEF_0000_0001,
        level: BIN_LEVEL,
        parent: None,
        entries: vec![],
        key_prefix: Vec::new(),
        generation: 0,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: true,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    };
    let bytes = TreeNode::Bottom(stub).write_to_bytes();

    // First 8 bytes = node_id big-endian.
    let expected_id = 0xDEAD_BEEF_0000_0001u64.to_be_bytes();
    assert_eq!(&bytes[0..8], expected_id);

    // Offset 16 = node_id[8] + level[4] + n_entries[4]; the dirty flag
    // follows that header.
    let dirty_byte = bytes[16];
    assert_eq!(dirty_byte, 1u8, "dirty flag must be 1");
}
7050
/// A BIN holding one entry must report a larger `log_size()` than an
/// otherwise identical BIN with no entries.
#[test]
fn test_log_size_grows_with_entries() {
    // Builds a clean, non-delta BIN that differs only in id and entries.
    fn bin_with(node_id: u64, entries: Vec<BinEntry>) -> TreeNode {
        TreeNode::Bottom(BinStub {
            node_id,
            level: BIN_LEVEL,
            entries,
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: NULL_LSN,
            last_delta_lsn: NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
        })
    }

    let empty = bin_with(1, vec![]);
    let with_entry = bin_with(
        2,
        vec![BinEntry {
            key: b"longkey_here".to_vec(),
            lsn: Lsn::new(1, 1),
            data: None,
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }],
    );

    let (empty_size, populated_size) = (empty.log_size(), with_entry.log_size());
    assert!(
        populated_size > empty_size,
        "log_size must grow when entries are added"
    );
}
7096
/// `Tree::propagate_dirty_to_root` called on a BIN must leave the BIN's
/// parent (here the root IN of a hand-built two-level tree) dirty.
#[test]
fn test_propagate_dirty_to_root() {
    // Hand-build root IN -> BIN. The BIN is created first because the
    // root's only slot needs a handle to it.
    let bin_stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        parent: None, // wired up once the root exists
        entries: vec![],
        key_prefix: Vec::new(),
        generation: 0,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(bin_stub)));

    let root_stub = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        parent: None,
        entries: vec![InEntry {
            key: vec![],
            lsn: Lsn::new(1, 1),
            child: Some(Arc::clone(&bin_arc)),
        }],
        generation: 0,
        dirty: false,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(root_stub)));

    // Back-pointer from the BIN to the root, held weakly.
    let weak_root = Arc::downgrade(&root_arc);
    bin_arc.write().set_parent(Some(weak_root));

    // Precondition: the root starts out clean.
    assert!(!root_arc.read().is_dirty());

    Tree::propagate_dirty_to_root(&bin_arc);

    let root_dirty = root_arc.read().is_dirty();
    assert!(
        root_dirty,
        "root must be dirty after propagate_dirty_to_root"
    );
}
7145
/// `collect_stats()` on a tree with no inserts equals `TreeStats::default()`.
#[test]
fn test_collect_stats_empty_tree() {
    let expected = TreeStats::default();
    let actual = Tree::new(1, 128).collect_stats();
    assert_eq!(actual, expected);
}
7153
/// `collect_stats()` after one insert: exactly one BIN, exactly one upper
/// IN, height 2, and at least one entry counted.
#[test]
fn test_collect_stats_single_insert() {
    let tree = Tree::new(1, 128);
    let (key, value) = (b"k".to_vec(), b"v".to_vec());
    tree.insert(key, value, Lsn::new(1, 1)).unwrap();

    let TreeStats {
        n_bins,
        n_ins,
        height,
        n_entries,
        ..
    } = tree.collect_stats();

    assert_eq!(n_bins, 1, "must have 1 BIN");
    assert_eq!(n_ins, 1, "must have 1 upper IN");
    assert_eq!(height, 2, "single-entry tree has height 2");
    assert!(n_entries >= 1, "must have at least 1 entry total");
}
7165
/// `collect_stats()` after many inserts: the tree is non-trivial and the
/// reported entry total covers every inserted key.
///
/// According to the original comments, `n_entries` counts slots in both INs
/// and BINs, so it is only a lower-bound check here (`n_entries >= n`), not
/// an exact match against the insert count.
///
/// The earlier version also derived a "BIN entry" estimate as
/// `n_entries - n_ins`. That subtracts a node count from an entry count,
/// could underflow, and was logically subsumed by the `n_entries >= n`
/// check, so it has been removed.
#[test]
fn test_collect_stats_many_inserts() {
    let tree = Tree::new(1, 8);
    let n = 50u32;
    for i in 0..n {
        let key = format!("sk{:04}", i).into_bytes();
        tree.insert(key, b"v".to_vec(), Lsn::new(1, i)).unwrap();
    }

    let stats = tree.collect_stats();
    assert!(stats.n_bins > 0, "must have at least one BIN");
    assert!(stats.height >= 2, "multi-entry tree has height >= 2");
    // Total entries in the tree must be >= n (BIN entries alone).
    assert!(
        stats.n_entries >= u64::from(n),
        "entry count must account for all inserts"
    );
}
7190
7191 // ========================================================================
7192 // Tests: B-tree merge / compress
7193 // ========================================================================
7194
/// After most keys have been deleted, `compress()` must shrink the number of
/// BINs while leaving the surviving keys searchable.
///
/// Strategy: build a multi-BIN tree (64 keys, fanout 8), delete all but four
/// widely spaced keys, then compare `n_bins` before and after `compress()`.
/// Exact BIN counts are deliberately not hard-coded because the preemptive
/// splitting strategy determines the exact split points.
#[test]
fn test_compress_merges_underfull_bins() {
    let tree = Tree::new(1, 8);
    let n = 64u32;
    let key_for = |i: u32| format!("cm{:04}", i).into_bytes();

    // Insert 64 sorted keys to build a multi-BIN tree.
    for i in 0..n {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i))
            .unwrap();
    }
    let bins_when_full = tree.collect_stats().n_bins;
    assert!(
        bins_when_full >= 2,
        "must have multiple BINs after 64 inserts"
    );

    // Keep every 16th key (cm0000, cm0016, cm0032, cm0048); delete the rest.
    let keep = std::collections::HashSet::<u32>::from([0, 16, 32, 48]);
    for i in (0..n).filter(|i| !keep.contains(i)) {
        tree.delete(&key_for(i));
    }
    let bins_when_sparse = tree.collect_stats().n_bins;
    assert!(
        bins_when_sparse >= 2,
        "should still have multiple BINs before compress"
    );

    // With only four keys left, compress() is required to drop BINs.
    tree.compress();
    let bins_after_compress = tree.collect_stats().n_bins;
    assert!(
        bins_after_compress < bins_when_sparse,
        "compress must reduce BIN count (was {}, now {})",
        bins_when_sparse,
        bins_after_compress
    );

    // Surviving keys must still be findable.
    for i in keep {
        let found = tree
            .search(&key_for(i))
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "key cm{:04} must survive compress", i);
    }
}
7260
/// `compress()` on a tree with no deletions must never lose an entry and
/// must not increase the BIN count.
///
/// 32 keys are inserted with fanout 8, so splits do occur (they are
/// preemptive and cannot be prevented). The test therefore does not claim
/// that compress is a strict no-op, only that it is loss-free and does not
/// grow the tree.
#[test]
fn test_compress_no_op_when_full() {
    let tree = Tree::new(1, 8);
    let n = 32u32;
    let key_for = |i: u32| format!("fn{:04}", i).into_bytes();

    for i in 0..n {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i))
            .unwrap();
    }

    let bins_before = tree.collect_stats().n_bins;
    tree.compress();
    let bins_after = tree.collect_stats().n_bins;

    // All keys still findable.
    for i in 0..n {
        let found = tree
            .search(&key_for(i))
            .is_some_and(|hit| hit.exact_parent_found);
        assert!(found, "key fn{:04} must be findable after compress", i);
    }

    // BIN count must not increase.
    assert!(
        bins_after <= bins_before,
        "compress must not increase BIN count"
    );
}
7297
/// Smoke test: `compress()` on a tree with no inserts must simply return
/// (the test has no assertion; it fails only if the call panics).
#[test]
fn test_compress_empty_tree() {
    let empty_tree = Tree::new(1, 4);
    empty_tree.compress();
}
7304
/// After every key but the last has been deleted, `compress()` must lower
/// the BIN count relative to the fully populated tree, and the one
/// surviving key must remain searchable.
#[test]
fn test_compress_removes_empty_bin_from_parent() {
    let tree = Tree::new(1, 4);
    let n = 16u32;
    let key_for = |i: u32| format!("ep{:04}", i).into_bytes();

    // Insert enough keys to generate multiple BINs.
    for i in 0..n {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i))
            .unwrap();
    }
    let bins_before = tree.collect_stats().n_bins;
    assert!(bins_before >= 2, "need multiple BINs for this test");

    // Delete everything except the very last key.
    (0..n - 1).for_each(|i| {
        tree.delete(&key_for(i));
    });

    tree.compress();

    let bins_after = tree.collect_stats().n_bins;
    assert!(
        bins_after < bins_before,
        "compress must reduce BIN count after mass deletion"
    );

    // The surviving key must still be findable.
    let survivor_found = tree
        .search(&key_for(n - 1))
        .is_some_and(|hit| hit.exact_parent_found);
    assert!(survivor_found, "last key must survive after compress");
}
7341
7342 // ========================================================================
7343 // IC-1: prune_empty_bin must NOT remove a live entry when the BIN was
7344 // repopulated between the compressor observing it empty and the prune.
7345 // (Tree corruption / lost-write regression test.)
7346 // ========================================================================
7347
7348 /// Find a BIN arc that is currently empty (0 entries) and is NOT the
7349 /// root, returning it together with the `id_key` the compressor would
7350 /// have captured (here we just use any key that routes to that BIN).
7351 fn first_empty_non_root_bin(tree: &Tree) -> Option<Arc<RwLock<TreeNode>>> {
7352 let root = tree.get_root()?;
7353 for node in tree.rebuild_in_list() {
7354 if Arc::ptr_eq(&node, &root) {
7355 continue; // skip root (single-BIN tree is never pruned)
7356 }
7357 let is_empty_bin = {
7358 let g = node.read();
7359 matches!(&*g, TreeNode::Bottom(b) if b.entries.is_empty())
7360 };
7361 if is_empty_bin {
7362 return Some(node);
7363 }
7364 }
7365 None
7366 }
7367
/// IC-1 regression (fails before the fix, passes after it).
///
/// The former prune step in `compress_bin` called `self.delete(&id_key)`,
/// which descends again by key. Had a concurrent insert put a LIVE entry
/// under that same `id_key` into the empty BIN in the meantime, the delete
/// would have silently dropped it — a lost write. `prune_empty_bin`
/// re-checks `n_entries == 0` under the parent latch and must REMOVE
/// NOTHING once the BIN is non-empty.
///
/// JE `Tree.delete` / `searchDeletableSubTree` (Tree.java ~line 755-800):
/// `bin.getNEntries() != 0` → NODE_NOT_EMPTY (abort prune).
#[test]
fn test_ic1_prune_empty_bin_aborts_when_repopulated() {
    let key_for = |i: u32| format!("ic{:04}", i).into_bytes();

    let tree = Tree::new(1, 4);
    let total_keys = 16u32;
    for i in 0..total_keys {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    let bin_count = tree.collect_stats().n_bins;
    assert!(bin_count >= 2, "need multiple BINs for this test");

    // Delete the four lowest keys (ic0000..ic0003). The scenario relies on
    // them sharing the first BIN so that it ends up with zero entries.
    for i in 0..4 {
        tree.delete(&key_for(i));
    }

    // Find the BIN that is now empty. If the layout did not leave an
    // isolated empty BIN the scenario cannot be reproduced on this build,
    // and the test passes vacuously.
    let Some(empty_bin) = first_empty_non_root_bin(&tree) else {
        return;
    };

    // SIMULATE THE RACE: a LIVE entry lands in the empty BIN *before* the
    // prune runs. It is pushed straight into the BIN arc, modelling an
    // insert that arrives after `now_empty` was read. The key was deleted
    // above and is expected to route to this BIN.
    let live_key = key_for(1);
    if let TreeNode::Bottom(bin) = &mut *empty_bin.write() {
        bin.entries.push(BinEntry {
            key: live_key.clone(),
            lsn: Lsn::new(2, 1),
            data: Some(vec![0xAB]),
            known_deleted: false,
            dirty: true,
            expiration_time: 0,
        });
    }

    // The id_key the compressor would use: the full key of slot 0.
    let id_key = match &*empty_bin.read() {
        TreeNode::Bottom(bin) => bin.get_full_key(0).unwrap(),
        _ => unreachable!(),
    };

    // The BIN is no longer empty, so the prune has to ABORT (return false)
    // and leave the live entry alone.
    let pruned = tree.prune_empty_bin(&id_key);
    assert!(!pruned, "IC-1: prune must abort when the BIN was repopulated");

    // The live entry has to be in the BIN still.
    let still_there = match &*empty_bin.read() {
        TreeNode::Bottom(bin) => {
            bin.key_prefix.is_empty()
                && bin.entries.iter().any(|slot| slot.key == live_key)
        }
        _ => false,
    };
    assert!(
        still_there,
        "IC-1: prune must not remove the repopulated live entry"
    );
}
7453
/// IC-1 companion: while a cursor is parked on the (still-empty) BIN,
/// `prune_empty_bin` has to abort. JE: `bin.nCursors() > 0` → CURSORS_EXIST.
#[test]
fn test_ic1_prune_empty_bin_aborts_with_cursor() {
    let key_for = |i: u32| format!("cu{:04}", i).into_bytes();

    let tree = Tree::new(1, 4);
    for i in 0..16u32 {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    for i in 0..4 {
        tree.delete(&key_for(i));
    }

    // Without an isolated empty BIN there is nothing to test.
    let Some(empty_bin) = first_empty_non_root_bin(&tree) else {
        return;
    };

    // Park a cursor on the empty BIN for the duration of the prune attempt.
    Tree::pin_bin(&empty_bin);

    // Any key routing to this BIN serves as id_key; the first deleted key
    // is used here.
    let id_key = key_for(0);
    let pruned = tree.prune_empty_bin(&id_key);
    assert!(
        !pruned,
        "IC-1: prune must abort when a cursor is parked on the BIN"
    );

    Tree::unpin_bin(&empty_bin);
}
7482
/// IC-1 happy path: with a BIN that really is empty, carries no cursors and
/// is not a delta, `prune_empty_bin` removes its slot from the parent.
#[test]
fn test_ic1_prune_empty_bin_succeeds_when_truly_empty() {
    let key_for = |i: u32| format!("ok{:04}", i).into_bytes();

    let tree = Tree::new(1, 4);
    for i in 0..16u32 {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    for i in 0..4 {
        tree.delete(&key_for(i));
    }

    let bins_before = tree.collect_stats().n_bins;

    // Only the existence of an empty non-root BIN matters here; the handle
    // itself is not inspected, but it stays alive until the test ends.
    let Some(_held_empty_bin) = first_empty_non_root_bin(&tree) else {
        return;
    };

    // id_key: the lowest deleted key, which falls into the leftmost BIN.
    let id_key = key_for(0);
    let pruned = tree.prune_empty_bin(&id_key);
    assert!(pruned, "IC-1: prune must succeed on a truly empty BIN");

    let bins_after = tree.collect_stats().n_bins;
    assert!(
        bins_after < bins_before,
        "IC-1: pruned BIN slot must be removed from the parent (was {}, now {})",
        bins_before,
        bins_after
    );

    // None of the keys that were never deleted may have gone missing.
    for i in 4..16u32 {
        let found = tree.search(&key_for(i));
        assert!(
            found.is_some_and(|hit| hit.exact_parent_found),
            "surviving key ok{:04} must remain after prune",
            i
        );
    }
}
7526
7527 // ========================================================================
7528 // Tests: latch-coupling validation (validate_parent_child /
7529 // search_with_coupling)
7530 // ========================================================================
7531
/// `validate_parent_child` reports `true` when the given parent slot holds
/// exactly the child Arc that is passed in.
#[test]
fn test_validate_parent_child_correct_link() {
    // An empty BIN acting as the child.
    let child = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    })));

    // A level-2 internal node whose single slot references that BIN.
    let parent = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![],
            lsn: Lsn::new(1, 1),
            child: Some(Arc::clone(&child)),
        }],
        dirty: false,
        generation: 0,
        parent: None,
    })));

    let link_ok = Tree::validate_parent_child(&parent, 0, &child);
    assert!(
        link_ok,
        "link must be valid when parent slot 0 points at bin_arc"
    );
}
7570
/// `validate_parent_child` reports `false` when the requested slot index
/// lies beyond the parent's entries.
#[test]
fn test_validate_parent_child_out_of_range() {
    // A parent without any slots: index 0 is already out of range.
    let slotless_parent = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![],
        dirty: false,
        generation: 0,
        parent: None,
    })));

    // Some BIN that the parent does not reference.
    let stray_bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    })));

    let link_ok = Tree::validate_parent_child(&slotless_parent, 0, &stray_bin);
    assert!(!link_ok, "link must be invalid when parent has no entries");
}
7603
/// `validate_parent_child` reports `false` when the slot references an Arc
/// other than the one passed in as the expected child.
#[test]
fn test_validate_parent_child_wrong_child() {
    /// Builds a fresh, empty BIN node with its own node id.
    fn fresh_empty_bin() -> Arc<RwLock<TreeNode>> {
        Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
            node_id: generate_node_id(),
            level: BIN_LEVEL,
            entries: vec![],
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: NULL_LSN,
            last_delta_lsn: NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
        })))
    }

    let linked_bin = fresh_empty_bin();
    let unrelated_bin = fresh_empty_bin();

    // The parent's only slot owns `linked_bin`.
    let parent = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![],
            lsn: Lsn::new(1, 1),
            child: Some(linked_bin),
        }],
        dirty: false,
        generation: 0,
        parent: None,
    })));

    let link_ok = Tree::validate_parent_child(&parent, 0, &unrelated_bin);
    assert!(
        !link_ok,
        "link must be invalid when parent slot points at a different Arc"
    );
}
7656
/// `search_with_coupling` locates every key that was inserted, just as
/// `search()` would.
///
/// Twenty keys are inserted into a tree with at most 8 entries per node,
/// then each one is looked up again and must come back with
/// `exact_parent_found` set.
#[test]
fn test_search_with_coupling_finds_existing_key() {
    let tree = Tree::new(1, 8);
    for i in 0u32..20 {
        let key = format!("c{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    for i in 0u32..20 {
        let key = format!("c{:04}", i).into_bytes();
        let sr = tree.search_with_coupling(&key);
        // `is_some_and` replaces the `is_some() && unwrap()` pair and matches
        // the style of the other search assertions in this module.
        assert!(
            sr.is_some_and(|r| r.exact_parent_found),
            "search_with_coupling must find c{:04}",
            i
        );
    }
}
7676
/// `search_with_coupling` must not report an exact match for a key that
/// was never inserted.
#[test]
fn test_search_with_coupling_missing_key() {
    let tree = Tree::new(1, 8);
    tree.insert(b"hello".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();

    // Acceptable outcomes: no result at all, or a result whose
    // `exact_parent_found` is false.
    let outcome = tree.search_with_coupling(b"zzz");
    let exact_hit = outcome.is_some_and(|r| r.exact_parent_found);
    assert!(
        !exact_hit,
        "search_with_coupling must not find a key that was never inserted"
    );
}
7690
/// On a tree with nothing inserted, `search_with_coupling` yields `None`.
#[test]
fn test_search_with_coupling_empty_tree() {
    let empty = Tree::new(1, 8);
    let outcome = empty.search_with_coupling(b"k");
    assert!(outcome.is_none());
}
7697
7698 // ========================================================================
7699 // Tests: BIN-delta reconstitution (apply_delta_to_bin / mutate_to_full_bin)
7700 // ========================================================================
7701
/// `apply_delta_to_bin` overwrites slots whose key already exists in the
/// base BIN and adds slots for keys the base does not have yet.
///
/// BIN.applyDelta(): a delta entry is authoritative and supersedes the
/// full-BIN entry stored under the same key.
#[test]
fn test_apply_delta_to_bin_updates_and_inserts() {
    /// Clean, live slot storing `value` under `key`.
    fn slot(key: &[u8], lsn: Lsn, value: &[u8]) -> BinEntry {
        BinEntry {
            key: key.to_vec(),
            lsn,
            data: Some(value.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    // Base BIN holding "a" and "c".
    let mut base = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![
            slot(b"a", Lsn::new(1, 1), b"old_a"),
            slot(b"c", Lsn::new(1, 3), b"old_c"),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    // The delta rewrites "a" and introduces "b".
    let delta_entries = vec![
        slot(b"a", Lsn::new(1, 10), b"new_a"),
        slot(b"b", Lsn::new(1, 20), b"new_b"),
    ];

    Tree::apply_delta_to_bin(&mut base, delta_entries);

    assert!(base.dirty, "base must be dirty after applying delta");

    // "a" now carries the delta's payload.
    let updated_a = base.entries.iter().find(|e| e.key == b"a").unwrap();
    assert_eq!(updated_a.data.as_deref(), Some(&b"new_a"[..]));

    // "b" arrived with the delta; "c" was left untouched.
    let has_key = |wanted: &[u8]| base.entries.iter().any(|e| e.key == wanted);
    assert!(has_key(b"b"));
    assert!(has_key(b"c"));

    // The slots are still ordered by key.
    let keys: Vec<&[u8]> = base.entries.iter().map(|e| e.key.as_slice()).collect();
    let sorted = {
        let mut ordered = keys.clone();
        ordered.sort();
        ordered
    };
    assert_eq!(
        keys, sorted,
        "entries must remain sorted after delta apply"
    );
}
7786
/// Applying an empty delta leaves the slots alone; only the dirty flag is
/// raised.
#[test]
fn test_apply_delta_to_bin_empty_delta() {
    let only_slot = BinEntry {
        key: b"x".to_vec(),
        lsn: Lsn::new(1, 1),
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let mut base = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![only_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    let slots_before = base.entries.len();
    Tree::apply_delta_to_bin(&mut base, Vec::new());
    let slots_after = base.entries.len();

    assert_eq!(
        slots_after, slots_before,
        "empty delta must not change entry count"
    );
    assert!(base.dirty, "dirty must be set even for empty delta apply");
}
7821
/// `mutate_to_full_bin` turns a delta back into a full BIN by merging it
/// with its base.
///
/// BIN.mutateToFullBIN(BIN fullBIN): once the mutation is done, `is_delta`
/// has to be cleared and the slots have to contain the data of both the
/// base and the delta.
#[test]
fn test_mutate_to_full_bin_merges_delta_and_base() {
    /// Clean, live slot storing `value` under `key`.
    fn slot(key: &[u8], lsn: Lsn, value: &[u8]) -> BinEntry {
        BinEntry {
            key: key.to_vec(),
            lsn,
            data: Some(value.to_vec()),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    // Full BIN with "aa" and "cc".
    let base = BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries: vec![
            slot(b"aa", Lsn::new(1, 1), b"base_aa"),
            slot(b"cc", Lsn::new(1, 3), b"base_cc"),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    // Delta for the same node: rewrites "aa" and brings in "bb".
    let mut delta = BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries: vec![
            slot(b"aa", Lsn::new(1, 10), b"delta_aa"),
            slot(b"bb", Lsn::new(1, 20), b"delta_bb"),
        ],
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    Tree::mutate_to_full_bin(&mut delta, base);

    // The node is a full BIN now, and it is dirty.
    assert!(
        !delta.is_delta,
        "is_delta must be false after mutate_to_full_bin"
    );
    assert!(delta.dirty, "must be dirty after mutation");

    // For "aa" the delta's version wins.
    let merged_aa = delta.entries.iter().find(|e| e.key == b"aa").unwrap();
    assert_eq!(merged_aa.data.as_deref(), Some(&b"delta_aa"[..]));

    // "bb" comes from the delta, "cc" from the base.
    let has_key = |wanted: &[u8]| delta.entries.iter().any(|e| e.key == wanted);
    assert!(has_key(b"bb"));
    assert!(has_key(b"cc"));

    // Exactly three slots, ordered by key.
    assert_eq!(delta.entries.len(), 3);
    let keys: Vec<&[u8]> = delta.entries.iter().map(|e| e.key.as_slice()).collect();
    let sorted = {
        let mut ordered = keys.clone();
        ordered.sort();
        ordered
    };
    assert_eq!(keys, sorted, "entries must be sorted after mutation");
}
7923
/// `bin_is_delta()` follows the BIN's `is_delta` field in both states.
#[test]
fn test_bin_is_delta_flag() {
    let mut probe = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    // Flag clear → reported as a full BIN.
    let reported_before = Tree::bin_is_delta(&probe);
    assert!(!reported_before);

    // Flag set → reported as a delta.
    probe.is_delta = true;
    let reported_after = Tree::bin_is_delta(&probe);
    assert!(reported_after);
}
7946
7947 // ========================================================================
7948 // Tests: mutate_to_full_bin_from_log
7949 // ========================================================================
7950
/// For a BIN that is already full, `mutate_to_full_bin_from_log` changes
/// nothing.
#[test]
fn test_mutate_to_full_bin_from_log_already_full() {
    // Log manager backed by a throwaway directory.
    let dir = tempfile::tempdir().unwrap();
    let file_manager =
        noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100).unwrap();
    let lm = noxu_log::LogManager::new(
        std::sync::Arc::new(file_manager),
        3,
        1024 * 1024,
        4096,
    );

    let only_slot = BinEntry {
        key: b"key1".to_vec(),
        lsn: Lsn::new(1, 10),
        data: Some(b"v1".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let mut bin = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries: vec![only_slot],
        key_prefix: Vec::new(),
        dirty: false,
        // Not a delta: this BIN is full from the start.
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    Tree::mutate_to_full_bin_from_log(&mut bin, &lm);

    // Still a full BIN with its single slot.
    assert!(!bin.is_delta);
    assert_eq!(bin.entries.len(), 1);
}
7990
/// With `last_full_lsn == NULL_LSN`, `mutate_to_full_bin_from_log` promotes
/// the delta without reading a base.
///
/// A NULL_LSN means no full version of the BIN was ever written, so the
/// function has to clear `is_delta` and keep the delta's slots unchanged
/// (they already are the authoritative full state).
#[test]
fn test_mutate_to_full_bin_from_log_null_lsn() {
    // Log manager backed by a throwaway directory.
    let dir = tempfile::tempdir().unwrap();
    let file_manager =
        noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100).unwrap();
    let lm = noxu_log::LogManager::new(
        std::sync::Arc::new(file_manager),
        3,
        1024 * 1024,
        4096,
    );

    let only_slot = BinEntry {
        key: b"a".to_vec(),
        lsn: Lsn::new(1, 5),
        data: Some(b"delta_a".to_vec()),
        known_deleted: false,
        dirty: true,
        expiration_time: 0,
    };
    let mut delta = BinStub {
        node_id: 2,
        level: BIN_LEVEL,
        entries: vec![only_slot],
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: true,
        // No full BIN has ever been logged for this node.
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    Tree::mutate_to_full_bin_from_log(&mut delta, &lm);

    // The flag is cleared and the one delta slot survives untouched.
    assert!(
        !delta.is_delta,
        "is_delta must be false after null-lsn promotion"
    );
    assert_eq!(delta.entries.len(), 1);
    let kept = &delta.entries[0];
    assert_eq!(kept.data.as_deref(), Some(&b"delta_a"[..]));
}
8038
/// `mutate_to_full_bin_from_log` fetches the full BIN from the log and
/// merges the delta into it.
///
/// Round trip: a full BIN is serialized and written through a LogManager,
/// the LSN of that write is stored in a delta's `last_full_lsn`, and the
/// delta is then mutated. Afterwards the node has to hold the base-only
/// and the delta-only slots, with the delta winning wherever both sides
/// define the same key.
#[test]
fn test_mutate_to_full_bin_from_log_reads_and_merges() {
    /// Live slot storing `value` under `key` with the given dirty flag.
    fn slot(key: &[u8], lsn: Lsn, value: &[u8], dirty: bool) -> BinEntry {
        BinEntry {
            key: key.to_vec(),
            lsn,
            data: Some(value.to_vec()),
            known_deleted: false,
            dirty,
            expiration_time: 0,
        }
    }

    // Log manager backed by a throwaway directory.
    let dir = tempfile::tempdir().unwrap();
    let file_manager =
        noxu_log::FileManager::new(dir.path(), false, 10_000_000, 100).unwrap();
    let lm = noxu_log::LogManager::new(
        std::sync::Arc::new(file_manager),
        3,
        1024 * 1024,
        4096,
    );

    // The full BIN that goes into the log.
    let full_bin = BinStub {
        node_id: 42,
        level: BIN_LEVEL,
        entries: vec![
            slot(b"base_only", Lsn::new(1, 1), b"base_val", false),
            slot(b"shared_key", Lsn::new(1, 2), b"base_shared", false),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    // Serialize it, log it, and remember where it landed.
    let payload = full_bin.serialize_full();
    let full_lsn = lm
        .log(
            noxu_log::LogEntryType::BIN,
            &payload,
            noxu_log::Provisional::No,
            true,
            false,
        )
        .expect("write full BIN to log");
    lm.flush_no_sync().expect("flush log");

    // Delta for the same node, pointing at the logged full BIN. It
    // rewrites "shared_key" and contributes "delta_only".
    let mut delta = BinStub {
        node_id: 42,
        level: BIN_LEVEL,
        entries: vec![
            slot(b"shared_key", Lsn::new(1, 20), b"delta_shared", true),
            slot(b"delta_only", Lsn::new(1, 30), b"delta_val", true),
        ],
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: true,
        last_full_lsn: full_lsn,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    Tree::mutate_to_full_bin_from_log(&mut delta, &lm);

    assert!(
        !delta.is_delta,
        "is_delta must be false after log-based mutation"
    );
    assert!(delta.dirty, "must be dirty after mutation");

    // Looks a slot up by its decompressed key and returns a copy of its data.
    let data_of = |wanted: &[u8]| -> Option<Vec<u8>> {
        delta
            .entries
            .iter()
            .find(|e| delta.decompress_key(&e.key) == wanted)
            .and_then(|e| e.data.clone())
    };

    // Each of the three distinct keys has to be there with the right value.
    assert_eq!(
        data_of(b"base_only"),
        Some(b"base_val".to_vec()),
        "base-only key must be present"
    );
    assert_eq!(
        data_of(b"shared_key"),
        Some(b"delta_shared".to_vec()),
        "delta must win on shared_key"
    );
    assert_eq!(
        data_of(b"delta_only"),
        Some(b"delta_val".to_vec()),
        "delta-only key must be present"
    );
    assert_eq!(delta.entries.len(), 3, "must have exactly 3 entries");

    // The slots are ordered by their full keys.
    let slot_count = delta.entries.len();
    let full_keys: Vec<Vec<u8>> = (0..slot_count)
        .map(|idx| delta.get_full_key(idx).unwrap())
        .collect();
    let sorted_keys = {
        let mut ordered = full_keys.clone();
        ordered.sort();
        ordered
    };
    assert_eq!(full_keys, sorted_keys, "entries must be in sorted order");
}
8178
8179 // ========================================================================
8180 // Tests: deserialize_full key prefix recomputation
8181 // ========================================================================
8182
/// `deserialize_full` re-derives the key prefix from the full keys it
/// loads.
///
/// IN.recalcKeyPrefix() is called after materializing from log: a BIN that
/// was read back from the log should be prefix-compressed again so that
/// searching it performs like searching an in-memory BIN.
#[test]
fn test_deserialize_full_recomputes_key_prefix() {
    /// Clean, live slot without data.
    fn bare_slot(key: &[u8], lsn: Lsn) -> BinEntry {
        BinEntry {
            key: key.to_vec(),
            lsn,
            data: None,
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }
    }

    // Three keys sharing the prefix "pfx:".
    let mut source = BinStub {
        node_id: 99,
        level: BIN_LEVEL,
        entries: vec![
            bare_slot(b"pfx:alpha", Lsn::new(1, 1)),
            bare_slot(b"pfx:beta", Lsn::new(1, 2)),
            bare_slot(b"pfx:gamma", Lsn::new(1, 3)),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    source.recompute_key_prefix();
    // Sanity check on the input before it is serialized.
    assert_eq!(source.key_prefix, b"pfx:");

    let payload = source.serialize_full();

    // Round trip through deserialize_full: the prefix has to be back.
    let loaded = BinStub::deserialize_full(&payload)
        .expect("deserialization must succeed");
    assert_eq!(
        loaded.key_prefix, b"pfx:",
        "key prefix must be recomputed after deserialize_full"
    );

    // Each slot must still expand to a full key carrying the prefix.
    let slot_count = loaded.entries.len();
    for i in 0..slot_count {
        let full_key = loaded.get_full_key(i).unwrap();
        assert!(
            full_key.starts_with(b"pfx:"),
            "full key {i} must start with prefix"
        );
    }
}
8255
/// A BIN with one slot comes back from `deserialize_full` with an empty
/// `key_prefix`.
///
/// With fewer than 2 entries there is no meaningful common prefix.
#[test]
fn test_deserialize_full_single_entry_no_prefix() {
    let lone_slot = BinEntry {
        key: b"solo".to_vec(),
        lsn: Lsn::new(1, 1),
        data: None,
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let source = BinStub {
        node_id: 7,
        level: BIN_LEVEL,
        entries: vec![lone_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };

    // Serialize, then read it straight back.
    let payload = source.serialize_full();
    let loaded = BinStub::deserialize_full(&payload)
        .expect("deserialization must succeed");

    assert!(
        loaded.key_prefix.is_empty(),
        "single-entry BIN must have empty prefix"
    );
    let full_key = loaded.get_full_key(0).unwrap();
    assert_eq!(full_key, b"solo");
}
8294
8295 // ========================================================================
8296 // Tests: get_next_bin / get_prev_bin
8297 // ========================================================================
8298
/// `get_next_bin` yields the entries of the BIN immediately to the right.
///
/// Tree.getNextBin() / getNextIN(forward=true).
#[test]
fn test_get_next_bin_basic() {
    let tree = Tree::new(1, 4);

    // Eight ascending keys with at most 4 entries per node.
    for i in 0u32..8 {
        tree.insert(
            format!("n{:04}", i).into_bytes(),
            vec![i as u8],
            Lsn::new(1, i),
        )
        .unwrap();
    }

    // A tree that ended up with a single BIN has no sibling to test.
    if tree.collect_stats().n_bins < 2 {
        return;
    }

    // "n0000" is the lowest key inserted, so a BIN to its right is expected.
    let next = tree.get_next_bin(b"n0000");
    assert!(
        next.is_some(),
        "must return a next BIN for a key in the leftmost BIN"
    );

    let entries = next.unwrap();
    assert!(!entries.is_empty(), "next BIN must not be empty");

    // Every key of a BIN further right has to sort strictly after "n0000".
    entries.iter().for_each(|e| {
        assert!(
            e.key.as_slice() > b"n0000" as &[u8],
            "next BIN entries must all be > the search key"
        );
    });
}
8336
/// For a key that lives in the rightmost BIN, `get_next_bin` yields `None`.
#[test]
fn test_get_next_bin_at_rightmost_returns_none() {
    let tree = Tree::new(1, 4);
    for i in 0u32..8 {
        tree.insert(
            format!("r{:04}", i).into_bytes(),
            vec![i as u8],
            Lsn::new(1, i),
        )
        .unwrap();
    }

    // "r0007" is the highest key inserted; nothing lies to its right.
    let next = tree.get_next_bin(b"r0007");
    assert!(
        next.is_none(),
        "must return None for a key in the rightmost BIN"
    );
}
8352
/// `get_prev_bin` yields the entries of the BIN immediately to the left.
///
/// Tree.getPrevBin() / getNextIN(forward=false).
#[test]
fn test_get_prev_bin_basic() {
    let tree = Tree::new(1, 4);
    for i in 0u32..8 {
        tree.insert(
            format!("p{:04}", i).into_bytes(),
            vec![i as u8],
            Lsn::new(1, i),
        )
        .unwrap();
    }

    // "p0004" is expected to sit in the second BIN, which has a left
    // neighbour.
    let prev = tree.get_prev_bin(b"p0004");
    assert!(
        prev.is_some(),
        "must return a prev BIN for a key in the second BIN"
    );

    let entries = prev.unwrap();
    assert!(!entries.is_empty(), "prev BIN must not be empty");

    // Every key of the left neighbour has to sort before b"p0004".
    entries.iter().for_each(|e| {
        assert!(
            e.key.as_slice() < b"p0004" as &[u8],
            "prev BIN entries must all be < the current BIN"
        );
    });
}
8381
/// For a key that lives in the leftmost BIN, `get_prev_bin` yields `None`.
#[test]
fn test_get_prev_bin_at_leftmost_returns_none() {
    let tree = Tree::new(1, 4);
    for i in 0u32..8 {
        tree.insert(
            format!("q{:04}", i).into_bytes(),
            vec![i as u8],
            Lsn::new(1, i),
        )
        .unwrap();
    }

    // "q0000" is the lowest key inserted; nothing lies to its left.
    let prev = tree.get_prev_bin(b"q0000");
    assert!(
        prev.is_none(),
        "must return None for a key in the leftmost BIN"
    );
}
8397
/// Stepping right with `get_next_bin` and then left with `get_prev_bin`
/// crosses the same BIN boundary in opposite directions.
#[test]
fn test_next_prev_bin_are_symmetric() {
    let tree = Tree::new(1, 4);
    for i in 0u32..8 {
        tree.insert(
            format!("s{:04}", i).into_bytes(),
            vec![i as u8],
            Lsn::new(1, i),
        )
        .unwrap();
    }

    // Step right from "s0000" and take the smallest key found there.
    let right_entries = tree.get_next_bin(b"s0000").unwrap();
    let next_first_key = right_entries
        .iter()
        .map(|e| &e.key)
        .min()
        .unwrap()
        .clone();

    // Step back left from that key and take the largest key found there.
    let left_entries = tree.get_prev_bin(&next_first_key).unwrap();
    let prev_first_key = left_entries
        .iter()
        .map(|e| &e.key)
        .max()
        .unwrap()
        .clone();

    // Whatever lies left of the boundary has to sort before the boundary key.
    assert!(
        prev_first_key < next_first_key,
        "prev BIN entries must be smaller than the boundary key"
    );
}
8425
/// On a tree with nothing inserted, `get_next_bin` yields `None`.
#[test]
fn test_get_next_bin_empty_tree() {
    let empty = Tree::new(1, 8);
    let next = empty.get_next_bin(b"any");
    assert!(next.is_none());
}
8432
/// On a tree with nothing inserted, `get_prev_bin` yields `None`.
#[test]
fn test_get_prev_bin_empty_tree() {
    let empty = Tree::new(1, 8);
    let prev = empty.get_prev_bin(b"any");
    assert!(prev.is_none());
}
8439
8440 // =========================================================================
8441 // R3 fix: get_next_bin / get_prev_bin honour the custom comparator
8442 // =========================================================================
8443
/// R3 regression test: with a custom comparator that reverses byte order
/// (descending), `get_next_bin` must use comparator order when routing
/// through internal nodes.
///
/// Pre-fix: the static `get_adjacent_bin_attempt` used raw `<=` byte order
/// for IN routing, causing it to descend to the wrong child when comparator
/// order ≠ byte order.
///
/// The tree is forced to split (max_entries = 4) so there IS an internal
/// node (IN) to route through.
///
/// Post-fix invariant: walking forward via repeated `get_next_bin` yields
/// BINs whose first keys are in COMPARATOR order (descending byte order
/// here), not in raw ascending byte order.
///
/// The walk is bounded so that a regression which keeps returning the same
/// BIN fails the test instead of hanging it.
#[test]
fn test_get_next_prev_bin_custom_comparator_order() {
    // Reverse-order comparator: larger bytes sort first.
    let reverse_cmp: KeyComparatorFn =
        Arc::new(|a: &[u8], b: &[u8]| b.cmp(a));
    // Small max_entries so the tree splits and has internal nodes.
    let mut tree = Tree::new(1, 4);
    tree.set_comparator(reverse_cmp);

    // Ascending in byte order ("a" < "b" < … < "i"), which is descending in
    // comparator order.
    let keys: &[&[u8]] =
        &[b"a", b"b", b"c", b"d", b"e", b"f", b"g", b"h", b"i"];
    for (i, k) in keys.iter().enumerate() {
        tree.insert(
            k.to_vec(),
            vec![i as u8],
            Lsn::from_u64((i + 1) as u64),
        )
        .unwrap();
    }

    // b"\xff" is larger than every inserted key in byte order, so under the
    // reverse comparator it sorts before all of them and routes to the
    // leftmost BIN in comparator order. Each hop re-anchors on the first key
    // of the BIN that was just returned.
    let mut anchor: Vec<u8> = b"\xff".to_vec();
    let mut bin_first_keys: Vec<Vec<u8>> = Vec::new();

    // There can never be more BINs than keys, so `keys.len() + 1` calls are
    // always enough for a correct implementation to reach the end.
    let max_hops = keys.len() + 1;
    let mut reached_end = false;
    for _ in 0..max_hops {
        let first_key = match tree.get_next_bin(&anchor) {
            None => {
                reached_end = true;
                break;
            }
            Some(entries) => match entries.first() {
                Some(first) => first.key.clone(),
                None => {
                    reached_end = true;
                    break;
                }
            },
        };
        bin_first_keys.push(first_key.clone());
        anchor = first_key;
    }
    assert!(
        reached_end,
        "R3: get_next_bin walk did not terminate within {} hops",
        max_hops
    );

    // We must have visited at least 2 BINs (tree was forced to split).
    assert!(
        bin_first_keys.len() >= 2,
        "R3: expected multiple BINs after split, got {}",
        bin_first_keys.len()
    );

    // With a reverse comparator, bin_first_keys must be in descending byte
    // order (each successive BIN starts at a smaller byte key).
    for window in bin_first_keys.windows(2) {
        assert!(
            window[0] > window[1],
            "R3: BIN boundary keys must be descending (comparator order); \
             got {:?} then {:?}",
            window[0],
            window[1]
        );
    }
}
8535 // ========================================================================
8536
/// A BIN only establishes a key prefix once it holds more than one key:
/// after the first insert the prefix is still empty, after the second it
/// equals the bytes the two keys share.
#[test]
fn test_binstub_prefix_established_on_insert() {
    // Start from a completely empty, clean BIN.
    let mut stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        entries: Vec::new(),
        key_prefix: Vec::new(),
        generation: 0,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    };

    let first = b"record:aaa".to_vec();
    stub.insert_with_prefix(first, Lsn::new(1, 1), None);
    assert!(stub.key_prefix.is_empty(), "single entry: no prefix yet");

    let second = b"record:bbb".to_vec();
    stub.insert_with_prefix(second, Lsn::new(1, 2), None);
    assert_eq!(
        &stub.key_prefix, b"record:",
        "common prefix 'record:' must be extracted"
    );
}
8566
/// Once a prefix is active, `get_full_key(slot)` must still hand back the
/// complete key that was inserted into that slot.
#[test]
fn test_binstub_get_full_key_roundtrip() {
    // Start from a completely empty, clean BIN.
    let mut stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        entries: Vec::new(),
        key_prefix: Vec::new(),
        generation: 0,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    };

    // Three keys sharing "pfx:", listed in ascending order so that the slot
    // index is expected to match the position in this array.
    let keys = [
        b"pfx:first".as_ref(),
        b"pfx:second".as_ref(),
        b"pfx:third".as_ref(),
    ];
    keys.iter().for_each(|k| {
        stub.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
    });

    assert!(!stub.key_prefix.is_empty(), "prefix must be set");

    for (slot, want) in keys.iter().copied().enumerate() {
        let full = stub.get_full_key(slot).expect("must return full key");
        assert_eq!(
            full.as_slice(),
            want,
            "get_full_key({}) must return full key",
            slot
        );
    }
}
8608
/// Looking up a full key through `find_entry_compressed` on a BIN with an
/// active prefix must report the right slot for a present key and "not
/// found" for an absent one.
#[test]
fn test_binstub_find_entry_compressed() {
    // Start from a completely empty, clean BIN.
    let mut stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        entries: Vec::new(),
        key_prefix: Vec::new(),
        generation: 0,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    };

    let keys = [b"db:alpha".as_ref(), b"db:beta".as_ref(), b"db:gamma".as_ref()];
    keys.iter().for_each(|k| {
        stub.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
    });

    // Present key: second of three in sorted order.
    let hit = stub.find_entry_compressed(b"db:beta");
    assert!(hit.1, "db:beta must be found");
    assert_eq!(hit.0, 1, "db:beta must be at index 1");

    // Absent key: only the found flag matters.
    let miss = stub.find_entry_compressed(b"db:zzz");
    assert!(!miss.1, "db:zzz must not be found");
}
8642
/// End-to-end check: 200 keys that share a long common prefix are inserted
/// and every one of them must afterwards be found by an exact search.
#[test]
fn test_tree_insert_search_with_prefix_compression() {
    // Every key starts with "namespace:entity:", followed by a 6-digit index.
    let key_for = |i: u32| format!("namespace:entity:{:06}", i).into_bytes();

    let tree = Tree::new(1, 8);
    let n = 200u32;

    (0..n).for_each(|i| {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    for i in 0..n {
        let key = key_for(i);
        let hit = matches!(tree.search(&key), Some(r) if r.exact_parent_found);
        assert!(hit, "key namespace:entity:{:06} must be found", i);
    }
}
8667
/// Keys sharing a prefix must all stay findable after BIN splits have
/// redistributed them between nodes.
#[test]
fn test_prefix_preserved_across_bin_split() {
    let key_for = |i: u32| format!("pfx:key:{:04}", i).into_bytes();

    // Node capacity of 4, so 20 inserts cannot fit without splitting.
    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    for i in 0u32..20 {
        let key = key_for(i);
        let hit = matches!(tree.search(&key), Some(r) if r.exact_parent_found);
        assert!(hit, "pfx:key:{:04} must be found after splits", i);
    }
}
8690
/// `compress_key` followed by `decompress_key` must reproduce the original
/// full key when the BIN has an active prefix.
#[test]
fn test_binstub_compress_decompress_roundtrip() {
    // Start from a completely empty, clean BIN.
    let mut stub = BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        parent: None,
        entries: Vec::new(),
        key_prefix: Vec::new(),
        generation: 0,
        cursor_count: 0,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        dirty: false,
        is_delta: false,
        prohibit_next_delta: false,
        expiration_in_hours: true,
    };

    // Two keys with a shared lead-in so the BIN establishes a prefix.
    let seed_keys = [b"myapp:user:1".as_ref(), b"myapp:user:2".as_ref()];
    seed_keys.iter().for_each(|k| {
        stub.insert_with_prefix(k.to_vec(), Lsn::new(1, 1), None);
    });

    assert!(!stub.key_prefix.is_empty());

    // Round-trip a third key that was never inserted into the BIN.
    let original = b"myapp:user:3";
    let compressed = stub.compress_key(original);
    let restored = stub.decompress_key(&compressed);
    assert_eq!(
        restored.as_slice(),
        original,
        "compress→decompress must be identity"
    );
}
8726
/// get_next_bin correctly navigates a 3-level tree.
///
/// Starting from the BIN that holds the smallest key, the test hops right
/// with `get_next_bin` until it reports `None`. Along the way it checks that
/// every returned BIN lies strictly to the right of the anchor it was
/// reached from, and that the walk ends in the BIN holding the largest key.
/// Hops that cross internal-node boundaries are therefore exercised, not
/// just the first hop.
#[test]
fn test_get_next_bin_three_level_tree() {
    // With fanout 4, inserting 20 keys forces a root split → 3 levels.
    let n = 20u32;
    let tree = Tree::new(1, 4);
    for i in 0..n {
        let key = format!("t{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i)).unwrap();
    }
    assert!(tree.get_root_splits() > 0, "tree must have grown to 3 levels");

    // `anchor` is always the largest key seen so far; the next hop starts
    // from the BIN containing it.
    let mut anchor: Vec<u8> = b"t0000".to_vec();
    let mut visited: Vec<Vec<u8>> = Vec::new();

    // There can never be more BINs than keys, so n + 1 calls always suffice;
    // the bound turns a non-advancing walk into a failure instead of a hang.
    let mut reached_end = false;
    for _ in 0..=n {
        let entries = match tree.get_next_bin(&anchor) {
            Some(entries) => entries,
            None => {
                reached_end = true;
                break;
            }
        };

        // Sort locally so the checks below do not depend on the order in
        // which the BIN's entries are returned.
        let mut bin_keys: Vec<Vec<u8>> =
            entries.into_iter().map(|e| e.key).collect();
        bin_keys.sort();

        assert!(
            !bin_keys.is_empty(),
            "get_next_bin must not return an empty BIN"
        );
        assert!(
            bin_keys[0] > anchor,
            "next BIN must lie strictly to the right of {:?}, got {:?}",
            anchor,
            bin_keys[0]
        );

        anchor = bin_keys[bin_keys.len() - 1].clone();
        visited.extend(bin_keys);
    }

    assert!(
        reached_end,
        "get_next_bin walk did not reach the rightmost BIN within {} hops",
        n + 1
    );
    assert!(
        !visited.is_empty(),
        "should have visited at least one key via get_next_bin in 3-level tree"
    );
    assert_eq!(
        anchor,
        b"t0019".to_vec(),
        "walk must end in the BIN that holds the largest key"
    );
}
8758
8759 // ========================================================================
8760 // ========================================================================
8761
/// Inserts four keys of different lengths one at a time and, after each
/// insert, verifies that every key inserted so far is found by an exact
/// search.
#[test]
fn test_je_simple_tree_creation() {
    let tree = Tree::new(1, 128);
    let keys: &[&[u8]] = &[b"aaaaa", b"aaaab", b"aaaa", b"aaa"];

    for (done, key) in keys.iter().copied().enumerate() {
        tree.insert(key.to_vec(), vec![done as u8], Lsn::new(1, done as u32))
            .unwrap();

        // Re-check the whole set inserted up to and including this key.
        for earlier in keys.iter().copied().take(done + 1) {
            let hit =
                matches!(tree.search(earlier), Some(r) if r.exact_parent_found);
            assert!(
                hit,
                "key {:?} must be findable after {} inserts",
                std::str::from_utf8(earlier).unwrap_or("?"),
                done + 1
            );
        }
    }
}
8785
/// Inserts 20 keys and checks they are all found, then deletes the
/// even-indexed ones and checks that exactly those are gone while the
/// odd-indexed keys are still found.
#[test]
fn test_je_insert_then_delete_then_search() {
    let tree = Tree::new(1, 8);
    let n = 20usize;
    let keys: Vec<Vec<u8>> =
        (0..n).map(|i| format!("key{:04}", i).into_bytes()).collect();

    // Phase 1: insert everything.
    for (i, k) in keys.iter().enumerate() {
        tree.insert(k.clone(), vec![i as u8], Lsn::new(1, i as u32))
            .unwrap();
    }

    // Phase 2: every key is present.
    for k in &keys {
        let hit = matches!(tree.search(k), Some(r) if r.exact_parent_found);
        assert!(
            hit,
            "key {:?} must be found after insert",
            std::str::from_utf8(k).unwrap_or("?")
        );
    }

    // Phase 3: delete indices 0, 2, 4, …
    keys.iter().step_by(2).for_each(|k| {
        tree.delete(k);
    });

    // Phase 4: presence must now match index parity.
    for (i, key) in keys.iter().enumerate() {
        let found = matches!(tree.search(key), Some(r) if r.exact_parent_found);
        let was_deleted = i % 2 == 0;
        if was_deleted {
            assert!(!found, "deleted key {:?} must not be found", i);
        } else {
            assert!(found, "kept key {:?} must still be found", i);
        }
    }
}
8829
/// Inserts N keys in reverse order, then verifies that every key is
/// directly findable and that the BIN to the right of the smallest key
/// returns its entries in ascending key order.
///
/// The BIN scan is mandatory: with 40 keys and a fanout of 4 the tree must
/// span several BINs, so a missing right-hand neighbour is a failure rather
/// than a silently skipped check.
#[test]
fn test_je_range_scan_sorted_ascending() {
    let n = 40usize;
    let tree = Tree::new(1, 4);

    // Insert in reverse order to stress the B-tree.
    for i in (0..n).rev() {
        let key = format!("scan{:04}", i).into_bytes();
        tree.insert(key, vec![i as u8], Lsn::new(1, i as u32)).unwrap();
    }

    // Every key must be individually findable.
    for i in 0..n {
        let key = format!("scan{:04}", i).into_bytes();
        let sr = tree.search(&key);
        assert!(
            sr.is_some() && sr.unwrap().exact_parent_found,
            "key {:?} must be findable",
            std::str::from_utf8(&key).unwrap_or("?")
        );
    }

    // Hop from the BIN holding the smallest key to its right neighbour and
    // verify the ordering of the entries that come back.
    let first_key = format!("scan{:04}", 0).into_bytes();
    let entries = tree
        .get_next_bin(&first_key)
        .expect("40 keys with fanout 4 must span more than one BIN");
    let entry_keys: Vec<&[u8]> =
        entries.iter().map(|e| e.key.as_slice()).collect();
    assert!(
        !entry_keys.is_empty(),
        "BIN returned by get_next_bin must contain entries"
    );
    for w in entry_keys.windows(2) {
        assert!(
            w[0] <= w[1],
            "BIN entries from get_next_bin must be in ascending order"
        );
    }
}
8884
/// Inserts 128 keys in ascending order, then checks that the reported tree
/// height stays at or below 10 and that every key is found by an exact
/// search.
#[test]
fn test_je_ascending_insert_balance() {
    let key_for = |i: usize| format!("asc{:06}", i).into_bytes();

    let n = 128usize;
    let tree = Tree::new(1, 8);
    (0..n).for_each(|i| {
        tree.insert(key_for(i), vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
            .unwrap();
    });

    let height = tree.collect_stats().height;
    assert!(
        height <= 10,
        "tree height after {} ascending inserts with fanout 8 must be <= 10, got {}",
        n,
        height
    );

    for i in 0..n {
        let key = key_for(i);
        let hit = matches!(tree.search(&key), Some(r) if r.exact_parent_found);
        assert!(
            hit,
            "key asc{:06} must be findable after ascending inserts",
            i
        );
    }
}
8917
/// Inserts 128 keys in descending order, then checks that the reported tree
/// height stays at or below 10 and that every key is found by an exact
/// search.
#[test]
fn test_je_descending_insert_balance() {
    let key_for = |i: usize| format!("dsc{:06}", i).into_bytes();

    let n = 128usize;
    let tree = Tree::new(1, 8);
    (0..n).rev().for_each(|i| {
        tree.insert(key_for(i), vec![(i & 0xFF) as u8], Lsn::new(1, i as u32))
            .unwrap();
    });

    let height = tree.collect_stats().height;
    assert!(
        height <= 10,
        "tree height after {} descending inserts with fanout 8 must be <= 10, got {}",
        n,
        height
    );

    for i in 0..n {
        let key = key_for(i);
        let hit = matches!(tree.search(&key), Some(r) if r.exact_parent_found);
        assert!(
            hit,
            "key dsc{:06} must be findable after descending inserts",
            i
        );
    }
}
8950
/// SplitTest invariant: with a node capacity of 4, inserting 20 keys must
/// not lose any of them — each one is still found by an exact search.
#[test]
fn test_je_split_no_key_lost() {
    let key_for = |i: usize| format!("sp{:04}", i).into_bytes();

    let tree = Tree::new(1, 4);
    let n = 20usize;
    (0..n).for_each(|i| {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i as u32)).unwrap();
    });

    for i in 0..n {
        let key = key_for(i);
        let hit = matches!(tree.search(&key), Some(r) if r.exact_parent_found);
        assert!(hit, "key sp{:04} must survive all splits", i);
    }
}
8973
/// SplitTest invariant: overflowing a single BIN by one key must leave at
/// least two BINs in the tree, with every inserted key still findable.
#[test]
fn test_je_split_produces_two_halves() {
    let key_for = |i: usize| format!("half{:04}", i).into_bytes();

    // Capacity 4 and five keys: one more than a single BIN can hold.
    let tree = Tree::new(1, 4);
    let n = 5usize;
    (0..n).for_each(|i| {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i as u32)).unwrap();
    });

    let n_bins = tree.collect_stats().n_bins;
    assert!(
        n_bins >= 2,
        "after splitting a full BIN there must be >= 2 BINs, got {}",
        n_bins
    );

    for i in 0..n {
        let key = key_for(i);
        let hit = matches!(tree.search(&key), Some(r) if r.exact_parent_found);
        assert!(
            hit,
            "key half{:04} must be findable in one of the two halves",
            i
        );
    }
}
9004
/// SplitTest invariant: after 20 inserts at node capacity 4 the root-split
/// counter must be non-zero, the reported height must be at least 3, and
/// every key must still be found.
#[test]
fn test_je_root_split_creates_new_root() {
    let key_for = |i: u32| format!("rs{:04}", i).into_bytes();

    let tree = Tree::new(1, 4);
    (0u32..20).for_each(|i| {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    });

    // The root must have split at least once …
    assert!(
        tree.get_root_splits() > 0,
        "expected at least one root split after 20 inserts with fanout 4"
    );

    // … and the tree must have grown taller as a result.
    let height = tree.collect_stats().height;
    assert!(
        height >= 3,
        "tree must be at least 3 levels tall after root splits, got {}",
        height
    );

    // No key may have been lost along the way.
    for i in 0u32..20 {
        let key = key_for(i);
        let hit = matches!(tree.search(&key), Some(r) if r.exact_parent_found);
        assert!(hit, "key rs{:04} must be findable after root splits", i);
    }
}
9040
9041 // ========================================================================
9042 // Tests: compress_bin / maybe_compress_bin_and_parent
9043 // INCompressor.compressBin / lazyCompress tests
9044 // ========================================================================
9045
/// compress_bin removes known-deleted slots from a BIN.
///
/// INCompressor.compressBin(): a BIN holding two live and two
/// `known_deleted` slots is compressed; afterwards only the two live slots
/// may remain, the call must report progress, and the BIN must be dirty.
#[test]
fn test_compress_bin_removes_deleted_slots() {
    let lsn = Lsn::new(1, 1);

    // Slot builders: a live slot carries data, a defunct one is flagged
    // known-deleted and carries none.
    let live = |key: &[u8], data: &[u8]| BinEntry {
        key: key.to_vec(),
        lsn,
        data: Some(data.to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let dead = |key: &[u8]| BinEntry {
        key: key.to_vec(),
        lsn,
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };

    let stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![
            live(b"a", b"live"),
            dead(b"b"),
            live(b"c", b"live2"),
            dead(b"d"),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(stub)));

    // Minimal parent IN whose single slot points at the BIN, installed as
    // the tree root.
    let parent = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![],
            lsn,
            child: Some(bin_arc.clone()),
        }],
        dirty: false,
        generation: 0,
        parent: None,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(parent)));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let removed_any = tree.compress_bin(&bin_arc);
    assert!(
        removed_any,
        "compress_bin must return true when slots were removed"
    );

    let guard = bin_arc.read();
    if let TreeNode::Bottom(b) = &*guard {
        assert_eq!(
            b.entries.len(),
            2,
            "2 live entries must remain after compress"
        );
        assert!(
            b.entries.iter().all(|e| !e.known_deleted),
            "no deleted slots must remain"
        );
        assert!(b.dirty, "BIN must be dirty after compression");
    } else {
        panic!("expected BIN");
    }
}
9146
/// compress_bin on a BIN with no deleted slots returns false.
///
/// INCompressor: a BIN holding a single live slot offers nothing to
/// remove, so compression must report that it made no progress.
#[test]
fn test_compress_bin_no_deleted_slots_returns_false() {
    let lsn = Lsn::new(1, 1);

    // The BIN's only slot is live.
    let only_slot = BinEntry {
        key: b"x".to_vec(),
        lsn,
        data: Some(b"d".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![only_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(stub)));

    // The BIN is deliberately not wired into the tree.
    let tree = Tree::new(1, 128);
    let removed_any = tree.compress_bin(&bin_arc);
    assert!(
        !removed_any,
        "compress_bin must return false when no slots were removed"
    );
}
9184
/// compress_bin on a BIN-delta is a no-op.
///
/// INCompressor.compressBin(): "if (bin.isBINDelta()) return". Even though
/// the only slot is known-deleted, the call must report no progress and the
/// slot must still be there afterwards.
#[test]
fn test_compress_bin_skips_delta() {
    let lsn = Lsn::new(1, 1);

    let deleted_slot = BinEntry {
        key: b"k".to_vec(),
        lsn,
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![deleted_slot],
        key_prefix: Vec::new(),
        dirty: false,
        // Marked as a delta: this is what must make compress_bin back off.
        is_delta: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(stub)));

    let tree = Tree::new(1, 128);
    let removed_any = tree.compress_bin(&bin_arc);
    assert!(!removed_any, "compress_bin must not compress a BIN-delta");

    // The deleted slot must have been left untouched.
    let guard = bin_arc.read();
    if let TreeNode::Bottom(b) = &*guard {
        assert_eq!(b.entries.len(), 1, "slot must not be removed from delta");
    } else {
        panic!("expected BIN");
    }
}
9229
/// compress_bin prunes an empty BIN from the tree.
///
/// INCompressor.pruneBIN(): the BIN's single slot is known-deleted, so
/// compression empties it. The test checks that the call reports progress
/// and that the BIN ends up with no slots.
///
/// NOTE(review): the parent IN's entry list is not inspected here, so the
/// removal from the parent is not verified by this test.
#[test]
fn test_compress_bin_prunes_empty_bin() {
    let lsn = Lsn::new(1, 1);

    // A BIN whose one and only slot is already known-deleted.
    let deleted_slot = BinEntry {
        key: b"only".to_vec(),
        lsn,
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![deleted_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(stub)));

    // Parent IN with the BIN as its only child, installed as the tree root.
    let parent = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![],
            lsn,
            child: Some(bin_arc.clone()),
        }],
        dirty: false,
        generation: 0,
        parent: None,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(parent)));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let progressed = tree.compress_bin(&bin_arc);
    assert!(progressed, "compress_bin must return true when pruning");

    // BIN must be empty after compression.
    let guard = bin_arc.read();
    if let TreeNode::Bottom(b) = &*guard {
        assert_eq!(b.entries.len(), 0, "all slots must be removed")
    } else {
        panic!("expected BIN");
    }
}
9293
/// maybe_compress_bin_and_parent returns false when no deleted slots exist.
///
/// INCompressor.lazyCompress(): a BIN whose only slot is live has no
/// defunct slots, so the lazy path must leave it alone and report false.
#[test]
fn test_maybe_compress_skips_clean_bin() {
    let lsn = Lsn::new(1, 1);

    let live_slot = BinEntry {
        key: b"live".to_vec(),
        lsn,
        data: Some(b"v".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![live_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(stub)));

    let tree = Tree::new(1, 128);
    let compressed = tree.maybe_compress_bin_and_parent(&bin_arc);
    assert!(
        !compressed,
        "maybe_compress must return false when no deleted slots exist"
    );
}
9330
/// maybe_compress_bin_and_parent triggers compression when deleted slots exist.
///
/// INCompressor.lazyCompress(): the BIN holds one live and one
/// known-deleted slot. The call must report true and afterwards only the
/// live slot may remain.
#[test]
fn test_maybe_compress_triggers_when_deleted_slots_exist() {
    let lsn = Lsn::new(1, 1);

    let live_slot = BinEntry {
        key: b"live".to_vec(),
        lsn,
        data: Some(b"v".to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let dead_slot = BinEntry {
        key: b"dead".to_vec(),
        lsn,
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let stub = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        // Same slot order as stored: live first, then the defunct one.
        entries: vec![live_slot, dead_slot],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(stub)));

    let tree = Tree::new(1, 128);
    let compressed = tree.maybe_compress_bin_and_parent(&bin_arc);
    assert!(
        compressed,
        "maybe_compress must return true when deleted slots were removed"
    );

    let guard = bin_arc.read();
    if let TreeNode::Bottom(b) = &*guard {
        assert_eq!(b.entries.len(), 1, "only live entry must remain");
        assert_eq!(b.entries[0].key, b"live");
    } else {
        panic!("expected BIN");
    }
}
9387
9388 // ========================================================================
9389 // Tests: INCompressorTest / EmptyBINTest ports
9390 // INCompressorTest (compress_bin semantics, prefix recompute, live-slot preservation)
9391 // EmptyBINTest (empty-BIN scan, all-deleted compress, search returns NotFound)
9392 // ========================================================================
9393
/// Compressing a BIN that still has live slots must not disturb its parent.
///
/// A BIN with two live slots and one known-deleted slot is wired, together
/// with a sibling BIN, under a root IN. After `compress_bin` the deleted
/// slot must be gone, both live slots must remain, the BIN must be dirty,
/// and the parent IN must still have both of its entries.
#[test]
fn test_incompressor_live_slots_preserved_after_compress() {
    let lsn = Lsn::new(1, 100);

    // Slot builders: a live slot carries data, a defunct one is flagged
    // known-deleted and carries none.
    let live = |key: &[u8], data: &[u8]| BinEntry {
        key: key.to_vec(),
        lsn,
        data: Some(data.to_vec()),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let dead = |key: &[u8]| BinEntry {
        key: key.to_vec(),
        lsn,
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };

    // BIN under test: two live slots followed by one known-deleted slot.
    let target = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![live(b"\x00", b"d0"), live(b"\x01", b"d1"), dead(b"\x02")],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(target)));

    // Placeholder sibling so the parent IN has a second child.
    let sibling = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![live(b"\x40", b"s")],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let sibling_arc = Arc::new(RwLock::new(TreeNode::Bottom(sibling)));

    // Root IN: slot 0 → BIN under test, slot 1 (key 0x40) → sibling.
    let parent = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![
            InEntry { key: vec![], lsn, child: Some(bin_arc.clone()) },
            InEntry {
                key: b"\x40".to_vec(),
                lsn,
                child: Some(sibling_arc.clone()),
            },
        ],
        dirty: false,
        generation: 0,
        parent: None,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(parent)));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
    sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc.clone());

    let removed_any = tree.compress_bin(&bin_arc);
    assert!(
        removed_any,
        "compress_bin must return true when a deleted slot was removed"
    );

    // Inspect the BIN in its own scope so its read guard is released before
    // the parent is examined.
    {
        let bin_guard = bin_arc.read();
        if let TreeNode::Bottom(b) = &*bin_guard {
            assert_eq!(b.entries.len(), 2, "2 live slots must remain");
            assert!(
                b.entries.iter().all(|e| !e.known_deleted),
                "no deleted slots may remain"
            );
            assert!(b.dirty, "BIN must be dirty after compression");
        } else {
            panic!("expected BIN");
        }
    }

    // The BIN was not emptied, so the parent must keep both of its entries.
    let root_guard = root_arc.read();
    if let TreeNode::Internal(n) = &*root_guard {
        assert_eq!(
            n.entries.len(),
            2,
            "parent IN must still have 2 entries"
        );
    } else {
        panic!("expected IN");
    }
}
9523
/// An emptied BIN must be detached from its parent IN by `compress()`
/// (the pruneBIN path).
///
/// The test drives `tree.compress()`, which is expected to run the
/// pruneBIN / merge logic that removes empty BINs from the parent IN.
#[test]
fn test_incompressor_empty_bin_pruned_from_parent() {
    // All keys in this test share one format; build them in one place.
    let key_for = |i: u32| format!("prune{:04}", i).into_bytes();

    // max_entries = 4 keeps nodes tiny, so a dozen inserts are enough to
    // spread the keys over several BINs.
    let tree = Tree::new(1, 4);
    for i in 0u32..12 {
        tree.insert(key_for(i), vec![i as u8], Lsn::new(1, i)).unwrap();
    }

    let bins_before = tree.collect_stats().n_bins;
    assert!(bins_before >= 2, "need multiple BINs to test pruning");

    // Remove the four lexicographically smallest keys; the intent is to
    // leave the first BIN with nothing but deleted slots.
    for i in 0u32..4 {
        tree.delete(&key_for(i));
    }

    // compress() is expected to prune the BIN that was just emptied.
    tree.compress();

    let bins_after = tree.collect_stats().n_bins;
    assert!(
        bins_after < bins_before,
        "compress must reduce BIN count after emptying a BIN (pruneBIN path)"
    );

    // Every key that was not deleted must still resolve exactly.
    for i in 4u32..12 {
        let still_there =
            tree.search(&key_for(i)).is_some_and(|r| r.exact_parent_found);
        assert!(
            still_there,
            "key prune{:04} must survive after compress",
            i
        );
    }
}
9572
/// `maybe_compress_bin_and_parent` must leave a BIN-delta untouched.
///
/// Mirrors INCompressor.lazyCompress(), which bails out early for deltas:
/// "if (in.isBINDelta()) return false".
#[test]
fn test_incompressor_maybe_compress_skips_bin_delta() {
    // A single known-deleted slot: it would be compressible in a full BIN.
    let deleted_slot = BinEntry {
        key: b"k".to_vec(),
        lsn: Lsn::new(1, 1),
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    let delta_bin = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![deleted_slot],
        key_prefix: Vec::new(),
        dirty: false,
        // The BIN is flagged as a delta, so compression has to skip it.
        is_delta: true,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(delta_bin)));

    let tree = Tree::new(1, 128);
    let compressed = tree.maybe_compress_bin_and_parent(&bin_arc);
    assert!(!compressed, "maybe_compress must return false for BIN-deltas");

    // The slot has to survive, still flagged known-deleted.
    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(
        bin.entries.len(),
        1,
        "slot must not be removed from delta BIN"
    );
    assert!(bin.entries[0].known_deleted);
}
9624
/// A BIN without any deleted slot is left uncompressed.
///
/// INCompressor.lazyCompress() skips BINs that have no defunct slots.
#[test]
fn test_incompressor_clean_bin_not_compressed() {
    let lsn = Lsn::new(1, 1);
    // Builds a live (non-deleted, clean) slot.
    let live = |key: Vec<u8>, data: Vec<u8>| BinEntry {
        key,
        lsn,
        data: Some(data),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };

    let clean_bin = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![
            live(b"\x00".to_vec(), b"a".to_vec()),
            live(b"\x01".to_vec(), b"b".to_vec()),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(clean_bin)));

    let tree = Tree::new(1, 128);
    let compressed = tree.maybe_compress_bin_and_parent(&bin_arc);
    assert!(
        !compressed,
        "maybe_compress must return false when no deleted slots exist"
    );

    // Neither slot may have been dropped.
    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 2, "no entries should be removed");
}
9679
/// Keys stay intact when a BIN with a shared key prefix is compressed.
///
/// The BIN starts with "pfx:a" (known-deleted), "pfx:b" and "pfx:c", all
/// stored raw with an empty `key_prefix`. After `compress_bin` removes the
/// deleted slot, the two survivors must still read back as their full keys.
///
/// Background: after BIN.compress() the BIN calls recalcKeyPrefix(), so a
/// smaller key set may expose a longer common prefix. The check goes
/// through `get_full_key` so it does not depend on how the prefix is stored.
#[test]
fn test_incompressor_prefix_recomputed_after_compress() {
    let lsn = Lsn::new(1, 1);
    // Slot builder: `data == None` marks the slot as known-deleted.
    let slot = |key: Vec<u8>, data: Option<Vec<u8>>| BinEntry {
        key,
        lsn,
        known_deleted: data.is_none(),
        data,
        dirty: false,
        expiration_time: 0,
    };

    let prefixed_bin = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![
            slot(b"pfx:a".to_vec(), None),
            slot(b"pfx:b".to_vec(), Some(b"B".to_vec())),
            slot(b"pfx:c".to_vec(), Some(b"C".to_vec())),
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(prefixed_bin)));

    // Give the BIN a parent IN so compress_bin sees a normal tree shape.
    let parent_in = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![],
            lsn,
            child: Some(bin_arc.clone()),
        }],
        dirty: false,
        generation: 0,
        parent: None,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(parent_in)));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let removed_any = tree.compress_bin(&bin_arc);
    assert!(
        removed_any,
        "compress_bin must return true when one slot was removed"
    );

    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(bin.entries.len(), 2, "2 live slots must remain");

    // Survivors are "pfx:b" and "pfx:c"; either slot order is accepted.
    let first = bin.get_full_key(0).expect("slot 0 must exist");
    let second = bin.get_full_key(1).expect("slot 1 must exist");
    let in_order = first == b"pfx:b" && second == b"pfx:c";
    let reversed = first == b"pfx:c" && second == b"pfx:b";
    assert!(
        in_order || reversed,
        "remaining keys must be pfx:b and pfx:c, got {:?} {:?}",
        first,
        second
    );
}
9780
/// EmptyBINTest-style lookup check on a small multi-BIN tree.
///
/// The invariant this test approaches is the EmptyBINTest one: "Tree search
/// for any deleted key returns NotFound".
///
/// NOTE: despite its name, this test does not delete or compress anything.
/// It only verifies that keys which were never inserted are reported as
/// not found, and that every inserted key remains findable.
#[test]
fn test_emptybin_search_after_all_deleted_returns_not_found() {
    let lsn = Lsn::new(1, 1);

    // max_entries = 4 matches NODE_MAX=4 from EmptyBINTest and makes the
    // eight inserts below split across BINs.
    let tree = Tree::new(1, 4);
    for byte in 0u8..8 {
        tree.insert(vec![byte], vec![byte + 100], lsn)
            .expect("insert must succeed");
    }

    // Stand-in for "deleted" keys: keys that were never inserted. A miss is
    // either no result at all or a result without an exact match.
    for key in [&b"\xF0"[..], &b"\xF1"[..], &b"\xF2"[..]] {
        let missing = tree.search(key).is_none_or(|r| !r.exact_parent_found);
        assert!(missing, "absent key {:?} must not be found", key);
    }

    // All inserted keys must still resolve exactly.
    for byte in 0u8..8 {
        let found = tree.search(&[byte]).is_some_and(|r| r.exact_parent_found);
        assert!(found, "inserted key {} must be found", byte);
    }
}
9825
/// Point lookups across a tree that is large enough to span several BINs.
///
/// The scenario this test is named after is a scan over a tree with an
/// empty BIN in the middle (all entries of one BIN deleted, followed by
/// compress_bin).
///
/// NOTE: this test does not delete any key or empty any BIN. It checks that
/// `Tree::search` finds every inserted key and reports not-found for keys
/// that were never inserted.
#[test]
fn test_emptybin_forward_scan_skips_empty_bin() {
    let lsn = Lsn::new(1, 1);

    // A very small max_entries (4) forces splits quickly, so twelve keys
    // are meant to produce at least three BINs.
    let tree = Tree::new(1, 4);
    for byte in 0u8..12 {
        tree.insert(vec![byte], vec![byte + 10], lsn)
            .expect("insert must succeed");
    }

    // Every inserted key resolves exactly.
    for byte in 0u8..12 {
        let found = tree.search(&[byte]).is_some_and(|r| r.exact_parent_found);
        assert!(found, "key {} must be found before any deletions", byte);
    }

    // Keys outside the inserted range are misses.
    for byte in 200u8..210 {
        let missing =
            tree.search(&[byte]).is_none_or(|r| !r.exact_parent_found);
        assert!(
            missing,
            "key {} was never inserted and must not be found",
            byte
        );
    }
}
9866
/// A BIN that still holds a live entry must not be pruned from its parent.
///
/// The scenario being modelled: a BIN is queued for compression after being
/// emptied, but a key is re-inserted before the compressor runs, which must
/// prevent the prune. The re-insert is simulated by compressing a BIN that
/// has one deleted and one live slot and checking the parent IN keeps both
/// of its child entries.
#[test]
fn test_incompressor_node_not_empty_prevents_prune() {
    let lsn = Lsn::new(1, 1);

    // Slot builders for this fixture.
    let live = |key: Vec<u8>, data: Vec<u8>| BinEntry {
        key,
        lsn,
        data: Some(data),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let deleted = |key: Vec<u8>| BinEntry {
        key,
        lsn,
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };
    // Wraps a slot list in a clean, full (non-delta) BIN node.
    let bin_node = |entries: Vec<BinEntry>| {
        Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
            node_id: generate_node_id(),
            level: BIN_LEVEL,
            entries,
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: NULL_LSN,
            last_delta_lsn: NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
        })))
    };

    // Target BIN: one deleted slot, one live slot.
    let bin_arc = bin_node(vec![
        deleted(b"\x00".to_vec()),
        live(b"\x01".to_vec(), b"v".to_vec()),
    ]);
    // Sibling BIN so the parent IN has two children.
    let sibling_arc = bin_node(vec![live(b"\x40".to_vec(), b"s".to_vec())]);

    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![
            InEntry { key: vec![], lsn, child: Some(bin_arc.clone()) },
            InEntry {
                key: b"\x40".to_vec(),
                lsn,
                child: Some(sibling_arc.clone()),
            },
        ],
        dirty: false,
        generation: 0,
        parent: None,
    })));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));
    sibling_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc.clone());

    let removed_any = tree.compress_bin(&bin_arc);
    assert!(
        removed_any,
        "compress_bin must return true when one slot was removed"
    );

    // Only the live slot is left; the read guard is released at block end.
    {
        let bin_guard = bin_arc.read();
        let TreeNode::Bottom(bin) = &*bin_guard else {
            panic!("expected BIN")
        };
        assert_eq!(bin.entries.len(), 1, "one live slot must remain");
        assert_eq!(bin.get_full_key(0).unwrap(), b"\x01");
    }

    // The BIN is still non-empty, so the parent must keep both children.
    let root_guard = root_arc.read();
    let TreeNode::Internal(parent) = &*root_guard else {
        panic!("expected IN")
    };
    assert_eq!(
        parent.entries.len(),
        2,
        "parent IN must still have 2 entries (BIN was not emptied)"
    );
}
9986
/// Compressing a BIN removes every defunct slot and keeps the live ones.
///
/// Background: BIN.isDefunct(i) returns true for both KNOWN_DELETED and
/// PENDING_DELETED, and compress_bin must remove all defunct slots.
///
/// NOTE: in this fixture both defunct slots are marked `known_deleted`; no
/// pending-deleted slot is constructed here. Live and deleted slots are
/// interleaved (live, deleted, live, deleted).
#[test]
fn test_incompressor_known_and_pending_deleted_removed() {
    let lsn = Lsn::new(1, 1);

    let live = |key: Vec<u8>, data: Vec<u8>| BinEntry {
        key,
        lsn,
        data: Some(data),
        known_deleted: false,
        dirty: false,
        expiration_time: 0,
    };
    let deleted = |key: Vec<u8>| BinEntry {
        key,
        lsn,
        data: None,
        known_deleted: true,
        dirty: false,
        expiration_time: 0,
    };

    let mixed_bin = BinStub {
        node_id: generate_node_id(),
        level: BIN_LEVEL,
        entries: vec![
            live(b"\x00".to_vec(), b"live".to_vec()), // slot 0
            deleted(b"\x01".to_vec()),                // slot 1
            live(b"\x02".to_vec(), b"also-live".to_vec()), // slot 2
            deleted(b"\x03".to_vec()),                // slot 3
        ],
        key_prefix: Vec::new(),
        dirty: false,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true,
        cursor_count: 0,
        prohibit_next_delta: false,
    };
    let bin_arc = Arc::new(RwLock::new(TreeNode::Bottom(mixed_bin)));

    // Single-child parent IN acting as the tree root.
    let parent_in = InNodeStub {
        node_id: generate_node_id(),
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![],
            lsn,
            child: Some(bin_arc.clone()),
        }],
        dirty: false,
        generation: 0,
        parent: None,
    };
    let root_arc = Arc::new(RwLock::new(TreeNode::Internal(parent_in)));
    bin_arc.write().set_parent(Some(Arc::downgrade(&root_arc)));

    let tree = Tree::new(1, 128);
    *tree.root.write() = Some(root_arc);

    let removed_any = tree.compress_bin(&bin_arc);
    assert!(removed_any, "compress_bin must return true");

    let guard = bin_arc.read();
    let TreeNode::Bottom(bin) = &*guard else {
        panic!("expected BIN")
    };
    assert_eq!(
        bin.entries.len(),
        2,
        "only the 2 live entries must remain"
    );
    let any_deleted_left = bin.entries.iter().any(|e| e.known_deleted);
    assert!(
        !any_deleted_left,
        "no deleted entries must remain after compression"
    );
}
10085
10086 // =========================================================================
10087 // P1: Concurrent stress tests for single-pass latch-coupling in search()
10088 // =========================================================================
10089
/// Verify that concurrent readers and a writer do not panic, deadlock, or
/// lose pre-populated keys.
///
/// 4 reader threads search all 50 pre-populated keys while 1 writer thread
/// inserts 50 additional keys. The tree sits behind a `std::sync::RwLock`
/// to match the DatabaseImpl usage pattern (DatabaseImpl holds Tree behind
/// an RwLock).
///
/// Two properties are checked:
/// - every search for a pre-populated key made *during* the inserts finds
///   the key (each reader reports the keys it missed; the list must be
///   empty), and
/// - after all threads finish, all 100 keys are present.
///
/// The writer takes the outer write lock once per insert rather than once
/// for the whole batch, so reader searches can actually interleave with the
/// inserts instead of being shut out until the writer is done.
#[test]
fn test_concurrent_search_while_inserting() {
    use std::sync::{Arc, Barrier};
    use std::thread;

    let tree = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));

    // Pre-populate with 50 entries so the tree has multiple BINs.
    {
        let t = tree.write().unwrap();
        for i in 0u32..50 {
            let key = format!("{:08}", i).into_bytes();
            t.insert(key, vec![i as u8], noxu_util::NULL_LSN).unwrap();
        }
    }

    // Barrier synchronises start: 4 readers + 1 writer.
    let barrier = Arc::new(Barrier::new(5));

    // 4 concurrent reader threads. Each returns the pre-populated keys it
    // failed to find; asserting after join keeps the failure message clean.
    let readers: Vec<thread::JoinHandle<Vec<u32>>> = (0..4)
        .map(|_| {
            let tree_clone = Arc::clone(&tree);
            let barrier_clone = Arc::clone(&barrier);
            thread::spawn(move || {
                barrier_clone.wait();
                let mut missed = Vec::new();
                for i in 0u32..50 {
                    let key = format!("{:08}", i).into_bytes();
                    let t = tree_clone.read().unwrap();
                    let found =
                        t.search(&key).is_some_and(|r| r.exact_parent_found);
                    if !found {
                        missed.push(i);
                    }
                }
                missed
            })
        })
        .collect();

    // 1 concurrent writer thread — inserts keys 50–99, re-acquiring the
    // write lock for every insert so readers get a chance in between.
    let writer = {
        let tree_clone = Arc::clone(&tree);
        let barrier_clone = Arc::clone(&barrier);
        thread::spawn(move || {
            barrier_clone.wait();
            for i in 50u32..100 {
                let key = format!("{:08}", i).into_bytes();
                tree_clone
                    .write()
                    .unwrap()
                    .insert(key, vec![i as u8], noxu_util::NULL_LSN)
                    .unwrap();
            }
        })
    };

    for reader in readers {
        let missed = reader.join().expect("thread panicked");
        assert!(
            missed.is_empty(),
            "pre-populated keys {:?} were not found during concurrent inserts",
            missed,
        );
    }
    writer.join().expect("thread panicked");

    // After all threads finish, all 100 keys must be present.
    let t = tree.read().unwrap();
    for i in 0u32..100 {
        let key = format!("{:08}", i).into_bytes();
        let result = t.search(&key);
        assert!(
            result.is_some_and(|r| r.exact_parent_found),
            "key {:08} should be found after concurrent insert",
            i,
        );
    }
}
10166
/// Eight reader threads search the same tree at once and none may panic.
///
/// Read-only concurrency should be safe with or without the single-pass
/// fix, so this test serves purely as a regression guard.
#[test]
fn test_concurrent_searches_no_panic() {
    use std::sync::Arc;
    use std::thread;

    let shared = Arc::new(std::sync::RwLock::new(Tree::new(1, 4)));

    // Populate 100 keys up front while holding the outer lock exclusively.
    {
        let guard = shared.write().unwrap();
        for i in 0u32..100 {
            guard
                .insert(
                    format!("{:08}", i).into_bytes(),
                    vec![i as u8],
                    noxu_util::NULL_LSN,
                )
                .unwrap();
        }
    }

    let mut workers = Vec::with_capacity(8);
    for _ in 0..8 {
        let tree_clone = Arc::clone(&shared);
        workers.push(thread::spawn(move || {
            for i in 0u32..100 {
                let key = format!("{:08}", i).into_bytes();
                // The outer read lock is taken per lookup, as a reader would.
                let guard = tree_clone.read().unwrap();
                let _ = guard.search(&key);
            }
        }));
    }

    for worker in workers {
        worker.join().expect("thread panicked");
    }
}
10201
10202 // ========================================================================
10203 // Tests: BIN-delta — dirty tracking, serialise, collect
10204 // ========================================================================
10205
/// A BIN built by `make_bin_for_delta_tests` starts with no dirty slots.
#[test]
fn test_dirty_count_zero_on_fresh_bin() {
    let slots = vec![
        (b"a".to_vec(), Lsn::new(1, 1), Some(b"v1".to_vec())),
        (b"b".to_vec(), Lsn::new(1, 2), Some(b"v2".to_vec())),
    ];
    let fresh = make_bin_for_delta_tests(slots);
    assert_eq!(fresh.dirty_count(), 0);
}
10214
/// Inserting a new key through `insert_with_prefix` yields a dirty slot.
#[test]
fn test_insert_marks_slot_dirty() {
    // With no slots, the helper produces an empty, clean, full BIN
    // (node_id 1) — the same starting state as a hand-written literal.
    let mut bin = make_bin_for_delta_tests(Vec::new());

    bin.insert_with_prefix(
        b"key".to_vec(),
        Lsn::new(1, 10),
        Some(b"val".to_vec()),
    );

    assert_eq!(bin.dirty_count(), 1, "new slot should be dirty");
    assert!(bin.entries[0].dirty);
}
10237
/// Re-inserting an existing key through `insert_with_prefix` dirties its
/// slot.
#[test]
fn test_update_marks_slot_dirty() {
    // Start from a clean BIN holding a single clean slot for "key".
    let mut bin = make_bin_for_delta_tests(vec![(
        b"key".to_vec(),
        Lsn::new(1, 10),
        Some(b"old".to_vec()),
    )]);
    bin.node_id = 2;

    // Same key, newer LSN and new data.
    let newer_lsn = Lsn::new(1, 20);
    bin.insert_with_prefix(b"key".to_vec(), newer_lsn, Some(b"new".to_vec()));

    assert!(bin.entries[0].dirty, "updated slot should be dirty");
    assert_eq!(bin.dirty_count(), 1);
}
10271
/// `serialize_full` writes the node id and slot count as a big-endian
/// header, and `clear_dirty_after_full_log` resets all dirty state.
#[test]
fn test_serialize_full_roundtrip() {
    // Two slots: "alpha" is live and dirty, "beta" is known-deleted and
    // clean. The BIN itself is dirty.
    let mut bin = make_bin_for_delta_tests(vec![
        (b"alpha".to_vec(), Lsn::new(1, 1), Some(b"d1".to_vec())),
        (b"beta".to_vec(), Lsn::new(1, 2), None),
    ]);
    bin.node_id = 42;
    bin.dirty = true;
    bin.entries[0].dirty = true;
    bin.entries[1].known_deleted = true;

    // Header layout checked here: bytes 0..8 = node id, 8..12 = entry count.
    let bytes = bin.serialize_full();
    let header_node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
    let header_entries = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
    assert_eq!(header_node_id, 42);
    assert_eq!(header_entries, 2);

    // Logging a full BIN clears dirtiness and records the new full LSN.
    let logged_at = Lsn::new(2, 1);
    bin.clear_dirty_after_full_log(logged_at);
    assert_eq!(bin.dirty_count(), 0);
    assert_eq!(bin.last_full_lsn, Lsn::new(2, 1));
    assert!(!bin.dirty);
}
10316
/// `serialize_delta` emits only the dirty slots, and clearing dirty state
/// after a delta log leaves `last_full_lsn` alone.
#[test]
fn test_serialize_delta_only_dirty_slots() {
    // Three live slots; only the middle one ("b", index 1) is dirty.
    let mut bin = make_bin_for_delta_tests(vec![
        (b"a".to_vec(), Lsn::new(1, 1), Some(b"v1".to_vec())),
        (b"b".to_vec(), Lsn::new(1, 2), Some(b"v2".to_vec())),
        (b"c".to_vec(), Lsn::new(1, 3), Some(b"v3".to_vec())),
    ]);
    bin.node_id = 7;
    bin.dirty = true;
    bin.entries[1].dirty = true;

    // Layout checked here: bytes 0..8 = node id, 8..12 = dirty-slot count,
    // 12..16 = index of the first serialized slot.
    let bytes = bin.serialize_delta();
    let header_node_id = u64::from_be_bytes(bytes[0..8].try_into().unwrap());
    let header_dirty = u32::from_be_bytes(bytes[8..12].try_into().unwrap());
    assert_eq!(header_node_id, 7);
    assert_eq!(header_dirty, 1);
    let first_slot = u32::from_be_bytes(bytes[12..16].try_into().unwrap());
    assert_eq!(first_slot, 1);

    bin.clear_dirty_after_delta_log();
    assert_eq!(bin.dirty_count(), 0);
    assert_eq!(
        bin.last_full_lsn, NULL_LSN,
        "last_full_lsn unchanged by delta"
    );
}
10373
/// `collect_dirty_bins` reports BINs dirtied by inserts and stops reporting
/// them once their dirty state has been cleared by a full log.
#[test]
fn test_collect_dirty_bins_returns_dirty_bins_only() {
    let tree = Tree::new(1, 256);
    let inserts = [
        (b"k1".to_vec(), b"v1".to_vec(), Lsn::new(1, 1)),
        (b"k2".to_vec(), b"v2".to_vec(), Lsn::new(1, 2)),
    ];
    for (key, value, lsn) in inserts {
        tree.insert(key, value, lsn).unwrap();
    }

    let dirty = tree.collect_dirty_bins(1);
    assert!(!dirty.is_empty(), "should have dirty BINs after inserts");

    // Pretend each collected BIN has just been logged in full.
    for (_db_id, bin_arc) in &dirty {
        let mut node = bin_arc.write();
        match &mut *node {
            TreeNode::Bottom(bin) => {
                bin.clear_dirty_after_full_log(Lsn::new(1, 100));
            }
            _ => {}
        }
    }

    let still_dirty = tree.collect_dirty_bins(1);
    assert!(still_dirty.is_empty(), "no dirty BINs after clearing");
}
10391
10392 fn make_bin_for_delta_tests(
10393 entries: Vec<(Vec<u8>, Lsn, Option<Vec<u8>>)>,
10394 ) -> BinStub {
10395 BinStub {
10396 node_id: 1,
10397 level: BIN_LEVEL,
10398 entries: entries
10399 .into_iter()
10400 .map(|(key, lsn, data)| BinEntry {
10401 key,
10402 lsn,
10403 data,
10404 known_deleted: false,
10405 dirty: false,
10406 expiration_time: 0,
10407 })
10408 .collect(),
10409 key_prefix: Vec::new(),
10410 dirty: false,
10411 is_delta: false,
10412 last_full_lsn: NULL_LSN,
10413 last_delta_lsn: NULL_LSN,
10414 generation: 0,
10415 parent: None,
10416 expiration_in_hours: true,
10417 cursor_count: 0,
10418 prohibit_next_delta: false,
10419 }
10420 }
10421
10422 // ========================================================================
10423 // T-17: BinStub::should_log_delta — faithful JE BIN.shouldLogDelta
10424 // (BIN.java:1892). These pin the COUNT-based decision against the
10425 // CONFIGURABLE percent (not a dirty-fraction-vs-hardcoded-0.25 heuristic),
10426 // plus the isBINDelta fast path, the numDeltas<=0 guard, and the
10427 // isDeltaProhibited / lastFullLsn==NULL bound.
10428 // ========================================================================
10429
10430 /// Build a full (non-delta) BIN with `n` slots, the first `dirty` of them
10431 /// marked dirty, and a non-NULL last_full_lsn (so a delta is permitted).
10432 fn bin_with_dirty(n: usize, dirty: usize) -> BinStub {
10433 let mut bin = make_bin_for_delta_tests(
10434 (0..n)
10435 .map(|i| {
10436 (
10437 format!("{:04}", i).into_bytes(),
10438 Lsn::new(1, i as u32 + 1),
10439 Some(vec![i as u8]),
10440 )
10441 })
10442 .collect(),
10443 );
10444 bin.last_full_lsn = Lsn::new(1, 1); // a prior full exists
10445 for e in bin.entries.iter_mut().take(dirty) {
10446 e.dirty = true;
10447 }
10448 bin
10449 }
10450
/// The delta decision compares a dirty-slot COUNT against a limit derived
/// from the CONFIGURABLE percent.
///
/// With percent=10 and 100 slots the limit is 100*10/100 = 10, so 10 dirty
/// slots log a delta and 11 log a full BIN.
///
/// This is the core T-17 reproduction: the OLD checkpointer decision used
/// `dirty/total <= 0.25` (hardcoded), so 11/100 = 11% ≤ 25% would (wrongly)
/// have logged a DELTA. The count-based decision against percent=10 logs a
/// FULL BIN instead.
#[test]
fn should_log_delta_is_count_based_and_configurable() {
    // Exactly at the limit → delta.
    let at_limit = bin_with_dirty(100, 10).should_log_delta(10);
    assert!(
        at_limit,
        "numDeltas(10) <= limit(100*10/100=10) must be a delta"
    );

    // One over the limit → full BIN (the old 25% heuristic said delta).
    let over_limit = bin_with_dirty(100, 11).should_log_delta(10);
    assert!(
        !over_limit,
        "numDeltas(11) > limit(10) must be a FULL BIN under percent=10"
    );

    // An identical BIN under percent=25 (limit 25) is a delta, which shows
    // the percent is honoured rather than hardcoded.
    let under_default = bin_with_dirty(100, 11).should_log_delta(25);
    assert!(
        under_default,
        "numDeltas(11) <= limit(25) must be a delta under percent=25"
    );

    // Truncating integer math, as in JE: 7 slots at percent=25 give
    // limit = 7*25/100 = 1, so 1 dirty → delta and 2 dirty → full.
    let one_dirty = bin_with_dirty(7, 1).should_log_delta(25);
    let two_dirty = bin_with_dirty(7, 2).should_log_delta(25);
    assert!(one_dirty);
    assert!(!two_dirty);
}
10481
/// isBINDelta fast path: a BIN that is already in delta form re-logs as a
/// delta no matter what (JE: `if (isBINDelta()) return true;`).
#[test]
fn should_log_delta_bin_delta_fast_path() {
    // 90 of 100 slots dirty: far beyond the limit for any small percent.
    let mut delta_bin = bin_with_dirty(100, 90);
    delta_bin.is_delta = true;

    // percent=1 would normally force a full BIN for this dirty count.
    let logs_delta = delta_bin.should_log_delta(1);
    assert!(
        logs_delta,
        "isBINDelta() must short-circuit to true regardless of percent"
    );
}
10495
/// numDeltas <= 0 guard: with no dirty slots the BIN logs in full, because
/// an empty delta is invalid.
#[test]
fn should_log_delta_zero_dirty_is_full() {
    let clean_bin = bin_with_dirty(100, 0);
    let logs_delta = clean_bin.should_log_delta(25);
    assert!(!logs_delta);
}
10502
/// isDeltaProhibited bound: a BIN is logged in full when it has never been
/// logged in full (lastFullLsn == NULL) or when `prohibit_next_delta` is
/// set.
#[test]
fn should_log_delta_prohibited_forces_full() {
    // Case 1: no prior full BIN. 5/100 dirty would otherwise be a delta.
    let mut never_logged = bin_with_dirty(100, 5);
    never_logged.last_full_lsn = NULL_LSN;
    let logs_delta = never_logged.should_log_delta(25);
    assert!(!logs_delta, "lastFullLsn==NULL must force a full BIN");

    // Case 2: prohibit_next_delta set (e.g. a dirty slot was removed by
    // compress).
    let mut prohibited = bin_with_dirty(100, 5);
    prohibited.prohibit_next_delta = true;
    let logs_delta = prohibited.should_log_delta(25);
    assert!(!logs_delta, "prohibit_next_delta must force a full BIN");
}
10523
/// Logging a full BIN clears the prohibit flag
/// (JE IN.afterLog: setProhibitNextDelta(false)), so the following log may
/// be a delta again — this is the periodic-full chain bound.
#[test]
fn full_log_clears_prohibit_next_delta() {
    let mut bin = bin_with_dirty(100, 5);
    bin.prohibit_next_delta = true;
    let delta_while_prohibited = bin.should_log_delta(25);
    assert!(!delta_while_prohibited, "prohibited → full");

    // Simulate the full log that the prohibition forced.
    bin.clear_dirty_after_full_log(Lsn::new(2, 5));
    let still_prohibited = bin.prohibit_next_delta;
    assert!(
        !still_prohibited,
        "full log must clear prohibit_next_delta"
    );

    // Dirty five slots again; a small delta is now acceptable.
    bin.entries
        .iter_mut()
        .take(5)
        .for_each(|slot| slot.dirty = true);
    let delta_after_full = bin.should_log_delta(25);
    assert!(
        delta_after_full,
        "after a full log, a small delta is allowed again"
    );
}
10546
10547 // ========================================================================
10548 // Tests: Task #82 — 8 new Tree methods
10549 // ========================================================================
10550
10551 // --- is_root_resident ---
10552
10553 #[test]
10554 fn test_is_root_resident_empty_tree() {
10555 let tree = Tree::new(1, 128);
10556 assert!(!tree.is_root_resident(), "empty tree has no resident root");
10557 }
10558
10559 #[test]
10560 fn test_is_root_resident_after_insert() {
10561 let tree = Tree::new(1, 128);
10562 tree.insert(b"k".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
10563 assert!(tree.is_root_resident(), "root must be resident after insert");
10564 }
10565
10566 // --- get_resident_root_in ---
10567
10568 #[test]
10569 fn test_get_resident_root_in_empty() {
10570 let tree = Tree::new(1, 128);
10571 assert!(tree.get_resident_root_in().is_none());
10572 }
10573
10574 #[test]
10575 fn test_get_resident_root_in_single_entry() {
10576 let tree = Tree::new(1, 128);
10577 tree.insert(b"hello".to_vec(), b"world".to_vec(), Lsn::new(1, 1))
10578 .unwrap();
10579 let root = tree.get_resident_root_in();
10580 assert!(root.is_some(), "root must be Some after insert");
10581 let root_arc = tree.get_root().unwrap();
10582 assert!(
10583 Arc::ptr_eq(&root_arc, &root.unwrap()),
10584 "get_resident_root_in must return the same Arc as get_root"
10585 );
10586 }
10587
10588 #[test]
10589 fn test_get_resident_root_in_multi_entry() {
10590 let tree = Tree::new(1, 4);
10591 for i in 0u32..20 {
10592 let k = format!("rr{:04}", i).into_bytes();
10593 tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
10594 }
10595 assert!(tree.get_resident_root_in().is_some());
10596 }
10597
10598 // --- get_parent_bin_for_child_ln ---
10599
10600 #[test]
10601 fn test_get_parent_bin_for_child_ln_empty_tree() {
10602 let tree = Tree::new(1, 128);
10603 assert!(tree.get_parent_bin_for_child_ln(b"key").is_none());
10604 }
10605
10606 #[test]
10607 fn test_get_parent_bin_for_child_ln_single_entry() {
10608 let tree = Tree::new(1, 128);
10609 tree.insert(b"alpha".to_vec(), b"val".to_vec(), Lsn::new(1, 1))
10610 .unwrap();
10611 let bin = tree.get_parent_bin_for_child_ln(b"alpha");
10612 assert!(bin.is_some(), "must return Some for a present key");
10613 assert!(bin.unwrap().read().is_bin(), "returned node must be a BIN");
10614 }
10615
10616 #[test]
10617 fn test_get_parent_bin_for_child_ln_multi_key() {
10618 let tree = Tree::new(1, 8);
10619 let keys: &[&[u8]] = &[b"aa", b"bb", b"cc", b"dd", b"ee"];
10620 for &k in keys {
10621 tree.insert(k.to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
10622 }
10623 for &k in keys {
10624 let bin = tree.get_parent_bin_for_child_ln(k);
10625 assert!(bin.is_some(), "must return Some for {:?}", k);
10626 assert!(bin.unwrap().read().is_bin());
10627 }
10628 }
10629
10630 // --- find_bin_for_insert ---
10631
10632 #[test]
10633 fn test_find_bin_for_insert_empty_tree() {
10634 let tree = Tree::new(1, 128);
10635 assert!(tree.find_bin_for_insert(b"newkey").is_none());
10636 }
10637
10638 #[test]
10639 fn test_find_bin_for_insert_returns_bin() {
10640 let tree = Tree::new(1, 128);
10641 tree.insert(b"existing".to_vec(), b"data".to_vec(), Lsn::new(1, 1))
10642 .unwrap();
10643 let bin = tree.find_bin_for_insert(b"newkey");
10644 assert!(bin.is_some());
10645 assert!(bin.unwrap().read().is_bin());
10646 }
10647
10648 #[test]
10649 fn test_find_bin_for_insert_same_as_parent_bin() {
10650 let tree = Tree::new(1, 128);
10651 tree.insert(b"foo".to_vec(), b"bar".to_vec(), Lsn::new(1, 1)).unwrap();
10652 let a = tree.get_parent_bin_for_child_ln(b"foo").unwrap();
10653 let b_arc = tree.find_bin_for_insert(b"foo").unwrap();
10654 assert!(
10655 Arc::ptr_eq(&a, &b_arc),
10656 "find_bin_for_insert must return the same BIN as get_parent_bin_for_child_ln"
10657 );
10658 }
10659
10660 // --- search_splits_allowed ---
10661
10662 #[test]
10663 fn test_search_splits_allowed_empty_tree() {
10664 let tree = Tree::new(1, 128);
10665 assert!(tree.search_splits_allowed(b"k").is_none());
10666 }
10667
10668 #[test]
10669 fn test_search_splits_allowed_finds_existing_key() {
10670 let tree = Tree::new(1, 8);
10671 for i in 0u32..10 {
10672 let k = format!("sa{:04}", i).into_bytes();
10673 tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
10674 }
10675 for i in 0u32..10 {
10676 let k = format!("sa{:04}", i).into_bytes();
10677 let sr = tree.search_splits_allowed(&k);
10678 assert!(
10679 sr.is_some() && sr.unwrap().exact_parent_found,
10680 "search_splits_allowed must find sa{:04}",
10681 i
10682 );
10683 }
10684 }
10685
10686 #[test]
10687 fn test_search_splits_allowed_missing_key() {
10688 let tree = Tree::new(1, 8);
10689 tree.insert(b"present".to_vec(), b"v".to_vec(), Lsn::new(1, 1))
10690 .unwrap();
10691 let sr = tree.search_splits_allowed(b"absent");
10692 assert!(
10693 sr.is_none_or(|r| !r.exact_parent_found),
10694 "search_splits_allowed must not find absent key"
10695 );
10696 }
10697
10698 // --- rebuild_in_list ---
10699
10700 #[test]
10701 fn test_rebuild_in_list_empty_tree() {
10702 let tree = Tree::new(1, 128);
10703 assert!(tree.rebuild_in_list().is_empty());
10704 }
10705
10706 #[test]
10707 fn test_rebuild_in_list_single_entry() {
10708 let tree = Tree::new(1, 128);
10709 tree.insert(b"one".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
10710 let list = tree.rebuild_in_list();
10711 // Expect root IN + BIN = 2 nodes.
10712 assert_eq!(
10713 list.len(),
10714 2,
10715 "single-entry tree must have exactly 2 nodes"
10716 );
10717 let has_bin = list.iter().any(|a| a.read().is_bin());
10718 let has_in = list.iter().any(|a| !a.read().is_bin());
10719 assert!(has_bin, "list must contain at least one BIN");
10720 assert!(has_in, "list must contain at least one upper IN");
10721 }
10722
10723 #[test]
10724 fn test_rebuild_in_list_multi_entry() {
10725 let tree = Tree::new(1, 4);
10726 for i in 0u32..20 {
10727 let k = format!("ri{:04}", i).into_bytes();
10728 tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
10729 }
10730 let list = tree.rebuild_in_list();
10731 let stats = tree.collect_stats();
10732 let expected_nodes = (stats.n_ins + stats.n_bins) as usize;
10733 assert_eq!(
10734 list.len(),
10735 expected_nodes,
10736 "rebuild_in_list must return all {} nodes",
10737 expected_nodes
10738 );
10739 }
10740
10741 // --- validate_in_list ---
10742
10743 #[test]
10744 fn test_validate_in_list_empty_tree() {
10745 let tree = Tree::new(1, 128);
10746 assert!(tree.validate_in_list(), "empty tree must be valid");
10747 }
10748
10749 #[test]
10750 fn test_validate_in_list_single_entry() {
10751 let tree = Tree::new(1, 128);
10752 tree.insert(b"v".to_vec(), b"data".to_vec(), Lsn::new(1, 1)).unwrap();
10753 assert!(tree.validate_in_list(), "single-entry tree must be valid");
10754 }
10755
10756 #[test]
10757 fn test_validate_in_list_multi_entry() {
10758 let tree = Tree::new(1, 4);
10759 for i in 0u32..20 {
10760 let k = format!("vl{:04}", i).into_bytes();
10761 tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
10762 }
10763 assert!(tree.validate_in_list(), "multi-entry tree must be valid");
10764 }
10765
10766 #[test]
10767 fn test_validate_in_list_empty_in_fails() {
10768 // Manually build a tree where the root IN has no entries — invalid.
10769 let root_arc = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
10770 node_id: generate_node_id(),
10771 level: MAIN_LEVEL | 2,
10772 entries: vec![], // empty — structurally invalid
10773 dirty: false,
10774 generation: 0,
10775 parent: None,
10776 })));
10777 let tree = Tree::new(1, 128);
10778 *tree.root.write() = Some(root_arc);
10779 assert!(
10780 !tree.validate_in_list(),
10781 "a tree with an empty Internal node must fail validation"
10782 );
10783 }
10784
10785 // --- get_parent_in_for_child_in ---
10786
10787 #[test]
10788 fn test_get_parent_in_for_child_in_empty_tree() {
10789 let tree = Tree::new(1, 128);
10790 assert!(tree.get_parent_in_for_child_in(999).is_none());
10791 }
10792
10793 #[test]
10794 fn test_get_parent_in_for_child_in_single_entry() {
10795 // A single-insert tree has: root IN → BIN.
10796 // The root IN is the parent of the BIN.
10797 let tree = Tree::new(1, 128);
10798 tree.insert(b"p".to_vec(), b"v".to_vec(), Lsn::new(1, 1)).unwrap();
10799
10800 let root_arc = tree.get_root().as_ref().unwrap().clone();
10801 let bin_node_id = {
10802 let g = root_arc.read();
10803 match &*g {
10804 TreeNode::Internal(n) => {
10805 let child = n.entries[0].child.as_ref().unwrap();
10806 let cg = child.read();
10807 match &*cg {
10808 TreeNode::Bottom(b) => b.node_id,
10809 _ => panic!("expected BIN"),
10810 }
10811 }
10812 _ => panic!("expected Internal root"),
10813 }
10814 };
10815
10816 let result = tree.get_parent_in_for_child_in(bin_node_id);
10817 assert!(result.is_some(), "must find parent of BIN");
10818 let (parent_arc, slot) = result.unwrap();
10819 assert!(Arc::ptr_eq(&parent_arc, &root_arc));
10820 assert_eq!(slot, 0);
10821 }
10822
10823 #[test]
10824 fn test_get_parent_in_for_child_in_not_found() {
10825 let tree = Tree::new(1, 128);
10826 tree.insert(b"x".to_vec(), b"y".to_vec(), Lsn::new(1, 1)).unwrap();
10827 assert!(tree.get_parent_in_for_child_in(u64::MAX).is_none());
10828 }
10829
10830 #[test]
10831 fn test_get_parent_in_for_child_in_multi_level() {
10832 // Build a tree with at least 3 levels so we test the recursive descent.
10833 let tree = Tree::new(1, 4);
10834 for i in 0u32..20 {
10835 let k = format!("ml{:04}", i).into_bytes();
10836 tree.insert(k, vec![i as u8], Lsn::new(1, i)).unwrap();
10837 }
10838
10839 // Collect all BIN node_ids via rebuild_in_list.
10840 let nodes = tree.rebuild_in_list();
10841 let bin_ids: Vec<u64> = nodes
10842 .iter()
10843 .filter_map(|a| {
10844 let g = a.read();
10845 if g.is_bin()
10846 && let TreeNode::Bottom(b) = &*g
10847 {
10848 return Some(b.node_id);
10849 }
10850 None
10851 })
10852 .collect();
10853
10854 for bin_id in bin_ids {
10855 let result = tree.get_parent_in_for_child_in(bin_id);
10856 assert!(
10857 result.is_some(),
10858 "every BIN (id={}) must have a parent IN",
10859 bin_id
10860 );
10861 let (parent_arc, _slot) = result.unwrap();
10862 assert!(
10863 !parent_arc.read().is_bin(),
10864 "parent of a BIN must be an Internal node"
10865 );
10866 }
10867 }
10868
10869 /// H-9 regression: BinStub::strip_lns actually drops the slot data
10870 /// (not just stats accounting).
10871 #[test]
10872 fn test_h9_strip_lns_actually_frees_data() {
10873 use crate::tree::{BinEntry, BinStub};
10874 use noxu_util::lsn::Lsn;
10875 let mut bin = BinStub {
10876 node_id: 1,
10877 level: 1,
10878 entries: Vec::new(),
10879 key_prefix: Vec::new(),
10880 dirty: false,
10881 is_delta: false,
10882 last_full_lsn: Lsn::from_u64(0),
10883 last_delta_lsn: Lsn::from_u64(0),
10884 generation: 0,
10885 parent: None,
10886 expiration_in_hours: true,
10887 cursor_count: 0,
10888 prohibit_next_delta: false,
10889 };
10890 // Two non-dirty slots with embedded data, one dirty slot.
10891 bin.entries.push(BinEntry {
10892 key: b"a".to_vec(),
10893 lsn: Lsn::from_u64(100),
10894 data: Some(vec![0u8; 64]),
10895 known_deleted: false,
10896 dirty: false,
10897 expiration_time: 0,
10898 });
10899 bin.entries.push(BinEntry {
10900 key: b"b".to_vec(),
10901 lsn: Lsn::from_u64(200),
10902 data: Some(vec![0u8; 32]),
10903 known_deleted: false,
10904 dirty: false,
10905 expiration_time: 0,
10906 });
10907 bin.entries.push(BinEntry {
10908 key: b"c".to_vec(),
10909 lsn: Lsn::from_u64(300),
10910 data: Some(vec![0u8; 16]),
10911 known_deleted: false,
10912 dirty: true, // dirty slot must be skipped
10913 expiration_time: 0,
10914 });
10915
10916 let freed = bin.strip_lns();
10917 assert_eq!(freed, 64 + 32, "freed bytes must sum non-dirty slot data");
10918 assert!(bin.entries[0].data.is_none(), "non-dirty slot data dropped");
10919 assert!(bin.entries[1].data.is_none(), "non-dirty slot data dropped");
10920 assert!(bin.entries[2].data.is_some(), "dirty slot data preserved");
10921
10922 // Cursor pin prevents stripping.
10923 bin.entries[0].data = Some(vec![0u8; 64]);
10924 bin.entries[0].dirty = false;
10925 bin.cursor_count = 1;
10926 let freed_with_cursor = bin.strip_lns();
10927 assert_eq!(
10928 freed_with_cursor, 0,
10929 "strip_lns must skip when cursor pinned"
10930 );
10931 assert!(
10932 bin.entries[0].data.is_some(),
10933 "data preserved while cursor pinned"
10934 );
10935 }
10936
10937 // St-H4: the binary upper_in_floor_index must return the same slot as a
10938 // reference linear floor scan for all probe keys (incl. before-all,
10939 // after-all, between, and exact matches).
10940 #[test]
10941 fn test_upper_in_floor_index_matches_linear_scan() {
10942 // Reference linear floor scan (the pre-St-H4 algorithm): slot 0 is the
10943 // virtual −∞ key; walk forward while entry.key ≤ key.
10944 fn linear_floor(entries: &[InEntry], key: &[u8]) -> usize {
10945 let mut idx = 0usize;
10946 for (i, entry) in entries.iter().enumerate() {
10947 if i == 0 {
10948 idx = 0;
10949 } else if entry.key.as_slice() <= key {
10950 idx = i;
10951 } else {
10952 break;
10953 }
10954 }
10955 idx
10956 }
10957
10958 let tree = Tree::new(1, 256);
10959 // Build sorted IN slot key sets of varying size; slot 0 = virtual −∞
10960 // (empty key sorts first), the rest strictly ascending.
10961 for n_slots in 1usize..40 {
10962 let mut entries: Vec<InEntry> = Vec::with_capacity(n_slots);
10963 entries.push(InEntry {
10964 key: vec![],
10965 lsn: Lsn::from_u64(0),
10966 child: None,
10967 });
10968 for i in 1..n_slots {
10969 // Strictly-ascending two-byte keys with gaps so probes can
10970 // fall between, on, before, and after them.
10971 let v = (i as u16) * 4;
10972 entries.push(InEntry {
10973 key: vec![(v >> 8) as u8, (v & 0xFF) as u8],
10974 lsn: Lsn::from_u64(0),
10975 child: None,
10976 });
10977 }
10978 for probe in 0u16..=(n_slots as u16 * 4 + 4) {
10979 let key = vec![(probe >> 8) as u8, (probe & 0xFF) as u8];
10980 assert_eq!(
10981 tree.upper_in_floor_index(&entries, &key),
10982 linear_floor(&entries, &key),
10983 "floor mismatch: n_slots={n_slots}, key={key:?}"
10984 );
10985 }
10986 }
10987 }
10988}
10989
10990// ─────────────────────────────────────────────────────────────────────────
10991// St-H6: BIN split inherits expiration_in_hours from the splitting BIN.
10992// ─────────────────────────────────────────────────────────────────────────
10993
/// Unit test for the St-H6 fix: the right-half sibling created by
/// `split_child` inherits `expiration_in_hours` from the splitting BIN.
///
/// Before the fix, the sibling was always created with
/// `expiration_in_hours = false`, causing hours-granularity TTL entries
/// (expiration_time ~495k) to be compared against `current_time_secs()`
/// (~1.78B) and treated as expired.
///
/// This test:
/// 1. Builds, by hand, a tree with max_entries = 4 whose single BIN holds
///    4 entries (bypassing `update_key_expiration`) with a non-zero
///    `expiration_time` and `expiration_in_hours = true` on the BIN.
/// 2. Triggers a split.
/// 3. Asserts that the right-half sibling has `expiration_in_hours = true`
///    (inherited, not hardcoded false).
///
/// The expiration value is derived from the current clock (now + 1000 h)
/// rather than a fixed hours-since-epoch constant, so the "not expired"
/// assertion does not start failing once a fixed date has passed.
#[test]
fn test_split_child_sibling_inherits_expiration_in_hours() {
    use crate::tree::{BIN_LEVEL, BinEntry, BinStub, MAIN_LEVEL, TreeNode};
    use noxu_util::{Lsn, NULL_LSN};
    use parking_lot::RwLock;
    use std::sync::Arc;
    use std::time::{SystemTime, UNIX_EPOCH};

    // Hours-since-epoch expiration 1000 h in the future.
    let now_hours = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system clock must not be before the Unix epoch")
        .as_secs()
        / 3600;
    let exp_hours = u32::try_from(now_hours + 1_000)
        .expect("hours since epoch must fit in u32");

    // Manually build a tree with one BIN (4 entries, expiration_in_hours=true).
    let tree = Tree::new(99, 4);

    // Pre-populate the tree root for the test.
    let entries: Vec<BinEntry> = (0u8..4u8)
        .map(|k| BinEntry {
            key: vec![k],
            lsn: Lsn::new(1, (k as u32) * 100 + 100),
            data: Some(vec![k, k]),
            known_deleted: false,
            dirty: true,
            expiration_time: exp_hours, // hours-since-epoch value, in the future
        })
        .collect();
    let bin = Arc::new(RwLock::new(TreeNode::Bottom(BinStub {
        node_id: 1,
        level: BIN_LEVEL,
        entries,
        key_prefix: Vec::new(),
        dirty: true,
        is_delta: false,
        last_full_lsn: NULL_LSN,
        last_delta_lsn: NULL_LSN,
        generation: 0,
        parent: None,
        expiration_in_hours: true, // hours-granularity entries
        cursor_count: 0,
        prohibit_next_delta: false,
    })));

    let root = Arc::new(RwLock::new(TreeNode::Internal(InNodeStub {
        node_id: 2,
        level: MAIN_LEVEL | 2,
        entries: vec![InEntry {
            key: vec![], // virtual key for slot 0 (-infinity)
            lsn: Lsn::new(1, 1),
            child: Some(Arc::clone(&bin)),
        }],
        dirty: true,
        generation: 0,
        parent: None,
    })));
    {
        // Wire the BIN back to its parent; the write guard is scoped so it is
        // released before the split runs.
        let mut b = bin.write();
        b.set_parent(Some(Arc::downgrade(&root)));
    }
    *tree.root.write() = Some(Arc::clone(&root));

    // Trigger split_child on the root.
    Tree::split_child(
        &root,
        0,
        4,
        Lsn::new(1, 500),
        SplitHint::Normal,
        &[],
        None,
        false,
    )
    .expect("split_child should succeed");

    // After the split: root has two children — left BIN and right sibling.
    let root_guard = root.read();
    let TreeNode::Internal(ref in_node) = *root_guard else {
        panic!("root should be Internal after split");
    };
    assert_eq!(
        in_node.entries.len(),
        2,
        "root should have 2 entries (children) after split"
    );

    // Right-half sibling is at slot 1.
    let sibling_arc = in_node
        .entries
        .get(1)
        .and_then(|e| e.child.clone())
        .expect("right-half sibling should exist at slot 1");
    let sibling_guard = sibling_arc.read();
    let TreeNode::Bottom(ref sibling) = *sibling_guard else {
        panic!("right sibling should be a BIN");
    };

    assert!(
        sibling.expiration_in_hours,
        "St-H6: right-half sibling expiration_in_hours must be true \
         (inherited from splitting BIN); got false"
    );

    // Verify the sibling's entries have the expected expiration_time.
    for e in &sibling.entries {
        assert_eq!(
            e.expiration_time, exp_hours,
            "sibling entry expiration_time should be preserved: got {}",
            e.expiration_time
        );
        // With in_hours=true, is_expired should return false (future).
        assert!(
            !noxu_util::ttl::is_expired(
                e.expiration_time,
                sibling.expiration_in_hours
            ),
            "St-H6: sibling TTL entry ({}) should NOT appear expired \
             with expiration_in_hours={}",
            e.expiration_time,
            sibling.expiration_in_hours
        );
    }
}
11125
/// Regression confirmation: `is_expired` with the wrong `in_hours = false`
/// flag falsely expires an hours-granularity value, while the correct
/// `in_hours = true` flag does not.
///
/// The probe value is "now + 1000 h" expressed in hours since the epoch.
/// It is computed from the current clock instead of being hard-coded, so the
/// `in_hours = true` assertion keeps holding as wall-clock time advances.
/// Read as seconds, the same number denotes an instant in early 1970 and is
/// therefore long expired.
#[test]
fn test_hours_value_is_expired_only_with_false_flag() {
    use std::time::{SystemTime, UNIX_EPOCH};

    // Hours-since-epoch value for now + 1 000 h TTL.
    let now_hours = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system clock must not be before the Unix epoch")
        .as_secs()
        / 3600;
    let exp_hours: u32 = u32::try_from(now_hours + 1_000)
        .expect("hours since epoch must fit in u32");

    // Correctly treated as hours: not expired.
    assert!(
        !noxu_util::ttl::is_expired(exp_hours, true),
        "exp_hours={exp_hours} should NOT be expired when in_hours=true"
    );
    // Incorrectly treated as seconds (pre-fix right sibling): expired.
    assert!(
        noxu_util::ttl::is_expired(exp_hours, false),
        "exp_hours={exp_hours} should be expired when in_hours=false \
         (St-H6 demonstrates the wrong-flag scenario)"
    );
}
11144
11145// =============================================================================
11146// IN-redo unit tests (DRIFT-1 / Stage 1)
11147// =============================================================================
11148
#[cfg(test)]
mod in_redo_tests {
    use super::*;

    /// Build a full (non-delta) `BinStub` with the given `node_id` and `n`
    /// entries (key = `[i as u8]`, data = `[i as u8]`, lsn = `Lsn::new(1, i)`)
    /// and return its `serialize_full()` bytes.
    ///
    /// Only the serialized bytes are returned; the caller already knows the
    /// node id it passed in. Keys are single bytes, so `n` should stay at or
    /// below 256 for the keys to be distinct.
    fn make_bin_bytes(node_id: u64, n: usize) -> Vec<u8> {
        let mut bin = BinStub {
            node_id,
            level: BIN_LEVEL,
            entries: Vec::new(),
            key_prefix: Vec::new(),
            dirty: false,
            is_delta: false,
            last_full_lsn: noxu_util::NULL_LSN,
            last_delta_lsn: noxu_util::NULL_LSN,
            generation: 0,
            parent: None,
            expiration_in_hours: true,
            cursor_count: 0,
            prohibit_next_delta: false,
        };
        bin.entries.extend((0..n).map(|i| BinEntry {
            key: vec![i as u8],
            lsn: Lsn::new(1, i as u32),
            data: Some(vec![i as u8]),
            known_deleted: false,
            dirty: false,
            expiration_time: 0,
        }));
        bin.serialize_full()
    }

    /// Verify that recover_in_redo inserts a BIN as root when the tree is empty.
    ///
    /// JE RecoveryManager.recoverRootIN: `root == null` path.
    #[test]
    fn test_recover_in_redo_root_bin_inserted_into_empty_tree() {
        let tree = Tree::new(42, 128);
        assert!(tree.is_empty());
        let bytes = make_bin_bytes(1, 3);
        let log_lsn = Lsn::new(1, 100);
        let result = tree.recover_in_redo(
            log_lsn, /*is_root=*/ true, /*is_bin=*/ true, &bytes,
        );
        assert_eq!(result, InRedoResult::Inserted, "expected Inserted");
        // Tree should now have 3 entries.
        assert_eq!(tree.count_entries(), 3);
    }

    /// Verify that recover_in_redo replaces a root BIN when the logged version is newer.
    ///
    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) < 0` path.
    #[test]
    fn test_recover_in_redo_root_bin_replaced_when_log_newer() {
        let tree = Tree::new(42, 128);
        // Install an old root (2 entries, older LSN). The setup result is
        // checked so a failed install cannot masquerade as a later failure.
        let old_bytes = make_bin_bytes(1, 2);
        let old_lsn = Lsn::new(1, 50);
        let setup = tree.recover_in_redo(old_lsn, true, true, &old_bytes);
        assert_eq!(
            setup,
            InRedoResult::Inserted,
            "setup: first root replay into an empty tree must insert"
        );
        assert_eq!(tree.count_entries(), 2);
        // Replay with newer LSN and 4 entries.
        let new_bytes = make_bin_bytes(1, 4);
        let new_lsn = Lsn::new(1, 100);
        let result = tree.recover_in_redo(new_lsn, true, true, &new_bytes);
        assert_eq!(result, InRedoResult::Replaced);
        assert_eq!(tree.count_entries(), 4);
    }

    /// Verify that an older logged BIN does NOT replace a newer in-memory root.
    ///
    /// JE RootUpdater.doWork: `DbLsn.compareTo(originalLsn, lsn) >= 0` skip path.
    #[test]
    fn test_recover_in_redo_root_bin_skipped_when_tree_newer() {
        let tree = Tree::new(42, 128);
        // Install a newer root; verify the setup actually took effect.
        let new_bytes = make_bin_bytes(1, 4);
        let new_lsn = Lsn::new(1, 200);
        let setup = tree.recover_in_redo(new_lsn, true, true, &new_bytes);
        assert_eq!(
            setup,
            InRedoResult::Inserted,
            "setup: first root replay into an empty tree must insert"
        );
        // Attempt to replay an older version.
        let old_bytes = make_bin_bytes(1, 2);
        let old_lsn = Lsn::new(1, 100);
        let result = tree.recover_in_redo(old_lsn, true, true, &old_bytes);
        assert_eq!(result, InRedoResult::Skipped);
        // Tree still holds the newer 4-entry version.
        assert_eq!(tree.count_entries(), 4);
    }

    /// deserialize_bin round-trips through serialize_full.
    #[test]
    fn test_deserialize_bin_round_trip() {
        let bytes = make_bin_bytes(99, 5);
        let bin = Tree::deserialize_bin(&bytes).expect("must deserialize");
        assert_eq!(bin.node_id, 99);
        assert_eq!(bin.entries.len(), 5);
        for (i, e) in bin.entries.iter().enumerate() {
            assert_eq!(e.key, vec![i as u8]);
        }
    }

    /// deserialize_upper_in round-trips through write_to_bytes (Internal).
    #[test]
    fn test_deserialize_upper_in_round_trip() {
        // Build an InNodeStub and serialize via write_to_bytes.
        let node = TreeNode::Internal(InNodeStub {
            node_id: 77,
            level: 0x10002,
            entries: vec![
                InEntry {
                    key: vec![1, 2, 3],
                    lsn: Lsn::new(1, 10),
                    child: None,
                },
                InEntry {
                    key: vec![4, 5, 6],
                    lsn: Lsn::new(1, 20),
                    child: None,
                },
            ],
            dirty: false,
            generation: 0,
            parent: None,
        });
        let bytes = node.write_to_bytes();
        let restored =
            Tree::deserialize_upper_in(&bytes).expect("must deserialize");
        assert_eq!(restored.node_id, 77);
        assert_eq!(restored.level, 0x10002);
        assert_eq!(restored.entries.len(), 2);
        assert_eq!(restored.entries[0].key, vec![1, 2, 3]);
        assert_eq!(restored.entries[1].key, vec![4, 5, 6]);
    }
}
11284
11285// --- Part 2 acceptance tests: key_prefixing flag (DRIFT-3) ---
11286//
11287// JE `IN.computeKeyPrefix` returns null when `databaseImpl.getKeyPrefixing()`
11288// is false, so no prefix compression is ever applied to those BINs. Noxu was
11289// always applying prefix compression. This checks that the flag is honoured.
11290//
11291// Ref: `IN.java computeKeyPrefix` ~line 2456,
11292// `DatabaseConfig.setKeyPrefixing` / `DatabaseImpl.getKeyPrefixing`.
#[cfg(test)]
mod key_prefixing_tests {
    use super::*;

    /// Helper: walk down slot 0 of each upper IN until a BIN is reached and
    /// return that (leftmost) BIN.
    fn find_first_bin(node: &Arc<RwLock<TreeNode>>) -> Arc<RwLock<TreeNode>> {
        let mut current = Arc::clone(node);
        loop {
            // The read guard lives only inside this block, so `current` can
            // be reassigned afterwards.
            let first_child = {
                let guard = current.read();
                match &*guard {
                    TreeNode::Bottom(_) => None,
                    TreeNode::Internal(upper) => Some(Arc::clone(
                        upper.entries[0].child.as_ref().expect("child"),
                    )),
                }
            };
            match first_child {
                Some(child) => current = child,
                None => return current,
            }
        }
    }

    /// Helper: the 8-byte key `record:` followed by the byte `i`.
    fn record_key(i: u8) -> Vec<u8> {
        let mut key = b"record:".to_vec();
        key.push(i);
        key
    }

    /// Helper: insert eight keys sharing the long common prefix `record:`,
    /// each with a one-byte value, all at the same LSN.
    fn insert_record_keys(tree: &Tree) {
        let lsn = noxu_util::Lsn::new(1, 10);
        for i in 0u8..8 {
            tree.insert(record_key(i), vec![i], lsn).expect("insert");
        }
    }

    /// When `key_prefixing` is false (the default), no prefix is factored
    /// out: the BIN's `key_prefix` stays empty and slots hold full keys.
    #[test]
    fn test_key_prefixing_false_stores_full_keys() {
        // A freshly constructed tree has key_prefixing = false.
        let tree = Tree::new(1, 16);
        assert!(!tree.key_prefixing, "default must be false");

        insert_record_keys(&tree);

        let root = tree.get_root().expect("root");
        let first_bin = find_first_bin(&root);
        let guard = first_bin.read();
        let TreeNode::Bottom(ref bin) = *guard else {
            panic!("must be a BIN");
        };
        assert!(
            bin.key_prefix.is_empty(),
            "key_prefix must be empty when key_prefixing=false, got {:?}",
            bin.key_prefix
        );
        assert_eq!(bin.entries.len(), 8);
        // The first slot must hold the complete, uncompressed key.
        assert_eq!(bin.entries[0].key, record_key(0));
    }

    /// When `key_prefixing` is true, keys sharing a common prefix are
    /// compressed: the BIN's `key_prefix` must be non-empty and equal to
    /// that shared prefix.
    #[test]
    fn test_key_prefixing_true_compresses_keys() {
        let mut tree = Tree::new(1, 16);
        tree.set_key_prefixing(true);

        insert_record_keys(&tree);

        let root = tree.get_root().expect("root");
        let first_bin = find_first_bin(&root);
        let guard = first_bin.read();
        let TreeNode::Bottom(ref bin) = *guard else {
            panic!("must be a BIN");
        };
        // All inserted keys start with "record:", so compression applies.
        assert!(
            !bin.key_prefix.is_empty(),
            "key_prefix must be non-empty when key_prefixing=true"
        );
        assert_eq!(
            bin.key_prefix,
            b"record:".to_vec(),
            "prefix must be the common prefix of all inserted keys"
        );
    }

    /// Custom-comparator databases (sorted-dup) bypass prefix compression
    /// even with key_prefixing enabled: `insert_cmp` leaves key_prefix alone.
    #[test]
    fn test_key_prefixing_custom_comparator_no_prefix() {
        let cmp: KeyComparatorFn = Arc::new(|a: &[u8], b: &[u8]| a.cmp(b));
        let mut tree = Tree::new_with_comparator(1, 16, cmp);
        // Turning the flag on should not matter on the insert_cmp path.
        tree.set_key_prefixing(true);

        insert_record_keys(&tree);

        let root = tree.get_root().expect("root");
        let first_bin = find_first_bin(&root);
        let guard = first_bin.read();
        let TreeNode::Bottom(ref bin) = *guard else {
            panic!("must be a BIN");
        };
        // The custom-comparator path (insert_cmp) never sets key_prefix.
        assert!(
            bin.key_prefix.is_empty(),
            "custom-comparator path must not set key_prefix"
        );
    }
}
11407
11408// --- Part 1 acceptance tests: splitSpecial heuristic (DRIFT-1) ---
11409//
11410// JE `IN.splitSpecial` / `Tree.forceSplit`: when all routing decisions during
11411// descent are leftmost (`AllLeft`) or rightmost (`AllRight`), the split index
11412// is forced to 1 or `n-1` respectively instead of `n/2`. This halves the
11413// number of splits for monotonically increasing / decreasing key workloads
11414// (sequential append / prepend) because each split leaves the BIN near-full.
11415//
11416// Ref: `IN.java splitSpecial` ~line 4129, `Tree.java forceSplit` ~line 1907.
11417#[cfg(test)]
11418mod split_special_tests {
11419 use super::*;
11420
11421 /// Test helper: descend the tree to the BIN that holds (or would hold)
11422 /// `key`, returning its arc. Mirrors the read-path descent used by
11423 /// `Tree::search`; sufficient for unit tests that need to mutate a slot.
11424 fn find_bin_arc_for_key(
11425 node_arc: &Arc<RwLock<TreeNode>>,
11426 key: &[u8],
11427 ) -> Option<Arc<RwLock<TreeNode>>> {
11428 let mut current = node_arc.clone();
11429 loop {
11430 let next = {
11431 let g = current.read();
11432 match &*g {
11433 TreeNode::Bottom(_) => return Some(current.clone()),
11434 TreeNode::Internal(n) => {
11435 if n.entries.is_empty() {
11436 return None;
11437 }
11438 let mut idx = 0usize;
11439 for (i, e) in n.entries.iter().enumerate() {
11440 if i == 0 || e.key.as_slice() <= key {
11441 idx = i;
11442 } else {
11443 break;
11444 }
11445 }
11446 n.entries.get(idx)?.child.clone()?
11447 }
11448 }
11449 };
11450 current = next;
11451 }
11452 }
11453
11454 /// Count total leaf (BIN) nodes in the tree by DFS.
11455 fn count_bins(node: &Arc<RwLock<TreeNode>>) -> usize {
11456 let g = node.read();
11457 match &*g {
11458 TreeNode::Bottom(_) => 1,
11459 TreeNode::Internal(n) => n
11460 .entries
11461 .iter()
11462 .filter_map(|e| e.child.as_ref())
11463 .map(count_bins)
11464 .sum(),
11465 }
11466 }
11467
11468 /// Return total key count across all BINs.
11469 fn count_keys(node: &Arc<RwLock<TreeNode>>) -> usize {
11470 let g = node.read();
11471 match &*g {
11472 TreeNode::Bottom(b) => b.entries.len(),
11473 TreeNode::Internal(n) => n
11474 .entries
11475 .iter()
11476 .filter_map(|e| e.child.as_ref())
11477 .map(count_keys)
11478 .sum(),
11479 }
11480 }
11481
11482 /// Returns the number of entries in the leftmost BIN.
11483 fn leftmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
11484 let g = node.read();
11485 match &*g {
11486 TreeNode::Bottom(b) => b.entries.len(),
11487 TreeNode::Internal(n) => {
11488 let first_child = n.entries[0].child.as_ref().expect("child");
11489 leftmost_bin_size(first_child)
11490 }
11491 }
11492 }
11493
11494 /// Returns the number of entries in the rightmost BIN.
11495 fn rightmost_bin_size(node: &Arc<RwLock<TreeNode>>) -> usize {
11496 let g = node.read();
11497 match &*g {
11498 TreeNode::Bottom(b) => b.entries.len(),
11499 TreeNode::Internal(n) => {
11500 let last_child = n
11501 .entries
11502 .last()
11503 .and_then(|e| e.child.as_ref())
11504 .expect("child");
11505 rightmost_bin_size(last_child)
11506 }
11507 }
11508 }
11509
/// `splitSpecial`, ascending workload.
///
/// Inserting strictly ascending keys makes every descent follow the
/// rightmost path. The JE criterion for that case is
/// `allRightSideDescent` → `splitIndex = nEntries - 1`: the left BIN keeps
/// all but one entry and the new right sibling starts with a single entry
/// that subsequent inserts fill up.
///
/// The test checks three things after inserting 200 keys with a node
/// capacity of 8:
/// * every key is stored,
/// * the BIN count is strictly below the midpoint-split bound
///   `ceil(n_keys / (max_entries / 2))`,
/// * the average number of keys per BIN exceeds `max_entries / 2`.
#[test]
fn test_split_special_ascending_fewer_bins_than_midpoint() {
    let max_entries = 8usize;
    let n_keys = 200usize;

    // Big-endian encoding keeps byte order equal to numeric order, so the
    // keys arrive strictly ascending.
    let tree_special = Tree::new(1, max_entries);
    let lsn = noxu_util::Lsn::new(1, 100);
    for key in (0u32..n_keys as u32).map(|i| i.to_be_bytes().to_vec()) {
        tree_special.insert(key, vec![0u8], lsn).expect("insert");
    }

    let root = tree_special.get_root().expect("root must exist");
    let bins_special = count_bins(&root);
    let keys_special = count_keys(&root);

    // Nothing may be lost across splits.
    assert_eq!(keys_special, n_keys, "all keys must be stored");

    // A pure midpoint split leaves roughly max_entries / 2 keys per BIN,
    // i.e. ceil(n_keys / (max_entries / 2)) BINs. The special split should
    // need strictly fewer (ideal: ceil(n_keys / (max_entries - 1))).
    let half_capacity = max_entries / 2;
    let midpoint_upper_bound = n_keys.div_ceil(half_capacity);
    assert!(
        bins_special < midpoint_upper_bound,
        "splitSpecial should produce fewer BINs than midpoint split: \
         got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
    );

    // Same property expressed as density: keys per BIN must beat the
    // half-full baseline. Individual BINs (e.g. the rightmost one) are not
    // inspected; only the overall average is.
    let midpoint_fill = half_capacity as f64;
    let avg_fill = keys_special as f64 / bins_special as f64;
    assert!(
        avg_fill > midpoint_fill,
        "average fill per BIN with splitSpecial ({avg_fill:.1}) should \
         exceed midpoint baseline ({midpoint_fill})"
    );
}
11563
/// `splitSpecial`, descending workload.
///
/// Strictly descending keys route every descent through slot 0
/// (`AllLeft`). The JE criterion for that case is
/// `allLeftSideDescent` → `splitIndex = 1`: the left node keeps a single
/// entry and the right sibling receives the rest.
///
/// After inserting 200 keys with a node capacity of 8 the test checks that
/// every key is stored and that the BIN count is strictly below the
/// midpoint-split bound `ceil(n_keys / (max_entries / 2))`.
#[test]
fn test_split_special_descending_fewer_bins_than_midpoint() {
    let max_entries = 8usize;
    let n_keys = 200usize;

    let tree_special = Tree::new(1, max_entries);
    let lsn = noxu_util::Lsn::new(1, 100);
    // Highest key first, so each insert lands to the left of all others.
    for key in (0u32..n_keys as u32)
        .rev()
        .map(|i| i.to_be_bytes().to_vec())
    {
        tree_special.insert(key, vec![0u8], lsn).expect("insert");
    }

    let root = tree_special.get_root().expect("root must exist");
    let bins_special = count_bins(&root);
    let keys_special = count_keys(&root);

    // Nothing may be lost across splits.
    assert_eq!(keys_special, n_keys, "all keys must be stored");

    let half_capacity = max_entries / 2;
    let midpoint_upper_bound = n_keys.div_ceil(half_capacity);
    assert!(
        bins_special < midpoint_upper_bound,
        "splitSpecial descending should produce fewer BINs: \
         got {bins_special}, midpoint upper bound = {midpoint_upper_bound}"
    );
}
11594
/// Random-order inserts must not be harmed by `splitSpecial`.
///
/// With shuffled keys a descent is rarely all-left or all-right, so splits
/// are expected to fall back to the midpoint. This test does not measure
/// balance directly; it verifies that all 200 shuffled keys are stored
/// and that each one is found again by an exact-match search.
#[test]
fn test_split_special_random_inserts_stay_balanced() {
    use std::collections::BTreeSet;

    let max_entries = 8usize;

    // Deterministic permutation of 0..200: Fisher-Yates driven by a
    // fixed-seed linear congruential generator.
    let mut keys: Vec<u32> = (0u32..200).collect();
    let mut state: u64 = 0xdeadbeef_cafebabe;
    for i in (1..keys.len()).rev() {
        state = state.wrapping_mul(6364136223846793005).wrapping_add(1);
        let pick = (state >> 33) as usize % (i + 1);
        keys.swap(i, pick);
    }

    let tree = Tree::new(1, max_entries);
    let lsn = noxu_util::Lsn::new(1, 100);
    for value in keys.iter() {
        tree.insert(value.to_be_bytes().to_vec(), vec![0u8], lsn)
            .expect("insert");
    }
    // Reference set of everything that went into the tree.
    let inserted: BTreeSet<u32> = keys.iter().copied().collect();

    let root = tree.get_root().expect("root");
    let total_keys = count_keys(&root);
    assert_eq!(
        total_keys,
        inserted.len(),
        "all random keys must be stored"
    );

    // Every inserted key must come back as an exact match.
    for k in &inserted {
        let probe = k.to_be_bytes().to_vec();
        let is_exact = tree
            .search(&probe)
            .map(|hit| hit.is_exact_match())
            .unwrap_or(false);
        assert!(is_exact, "random key {k} must be findable after insert");
    }
}
11640
/// TREE-F1: a BIN slot flagged `known_deleted` must look ABSENT to exact
/// lookups and must never be surfaced as a live entry by scans, matching
/// JE.
///
/// JE contract:
/// * `IN.findEntry` (IN.java:3197): an exact match landing on a
///   known-deleted slot returns -1 (ABSENT).
/// * `CursorImpl.lockAndGetCurrent` (CursorImpl.java:2062-2064): a step
///   landing on `isEntryKnownDeleted(index)` returns null, so the
///   `getNext` loop moves past it.
///
/// KD slots legitimately exist in live BINs during BIN-delta
/// reconstitution (`mutate_to_full_bin` applies delta KD slots) until the
/// compressor reclaims them. The test produces that state directly by
/// setting the flag on a slot inside the BIN, then checks the user-facing
/// read and scan paths as well as the compressor-facing collection path.
#[test]
fn test_tree_f1_known_deleted_slot_is_absent_and_skipped() {
    let tree = Tree::new(1, 8);
    // Six keys, "kd0000" through "kd0005", inserted into a capacity-8 tree.
    for i in 0..6u32 {
        tree.insert(
            format!("kd{i:04}").into_bytes(),
            vec![i as u8],
            Lsn::new(1, i),
        )
        .unwrap();
    }

    // Flag a middle key's slot as known_deleted in place, modelling a
    // delta-applied tombstone that the compressor has not reclaimed yet.
    let kd_key = b"kd0003".to_vec();
    {
        let root = tree.get_root().expect("root");
        let bin_arc = find_bin_arc_for_key(&root, &kd_key).expect("bin");
        let mut guard = bin_arc.write();
        let TreeNode::Bottom(bin) = &mut *guard else {
            panic!("expected BIN");
        };
        let slot = (0..bin.entries.len())
            .find(|&i| bin.get_full_key(i).as_deref() == Some(kd_key.as_slice()))
            .expect("kd key slot");
        bin.entries[slot].known_deleted = true;
    }

    // (a) Exact lookup through Tree::search: must NOT be an exact match.
    let kd_exact = tree
        .search(&kd_key)
        .map(|r| r.is_exact_match())
        .unwrap_or(false);
    assert!(
        !kd_exact,
        "TREE-F1: Tree::search must report a known_deleted slot as absent \
         (IN.findEntry IN.java:3197)"
    );

    // (a) Exact lookup through Tree::search_with_data: must NOT be found.
    let sf = tree.search_with_data(&kd_key).expect("slot fetch");
    assert!(
        !sf.found,
        "TREE-F1: Tree::search_with_data must report a known_deleted slot \
         as absent (IN.findEntry IN.java:3197)"
    );

    // The keys on either side of the tombstone are untouched and must
    // still resolve.
    for live in [b"kd0002".to_vec(), b"kd0004".to_vec()] {
        let live_exact = tree
            .search(&live)
            .map(|r| r.is_exact_match())
            .unwrap_or(false);
        assert!(live_exact, "live neighbour must remain findable");
    }

    // (b) The scan-facing BIN dumps (descend_to_edge_bin / get_next_bin /
    // get_prev_bin) hand back slots verbatim, flag included, so that the
    // cursor can skip them (CursorImpl.java:2062-2064). What is asserted:
    // every entry carrying the KD key also carries the known_deleted flag.
    let root = tree.get_root().expect("root");
    let edge = Tree::descend_to_edge_bin(&root, true).expect("edge bin");
    assert!(
        edge.iter().all(|e| e.key != kd_key || e.known_deleted),
        "TREE-F1: scan must not surface a known_deleted slot as live \
         (CursorImpl.java:2062-2064)"
    );
    for anchor in [b"kd0000".to_vec(), b"kd0005".to_vec()] {
        // Both directions are queried up front; absent results are skipped.
        let neighbours = [tree.get_next_bin(&anchor), tree.get_prev_bin(&anchor)];
        for entries in neighbours.into_iter().flatten() {
            assert!(
                entries.iter().all(|e| e.key != kd_key || e.known_deleted),
                "TREE-F1: get_next_bin/get_prev_bin must not surface a \
                 known_deleted slot as live"
            );
        }
    }

    // A lower-bound probe at the tombstone itself must not return it.
    if let Some((k, _, _)) = tree.first_entry_at_or_after(&kd_key) {
        assert_ne!(
            k, kd_key,
            "TREE-F1: first_entry_at_or_after must skip a known_deleted \
             slot (CursorImpl.java:2062-2064)"
        );
    }

    // The maintenance path used to reclaim KD slots must STILL report the
    // BIN: only the user-facing read predicate is expected to hide the
    // slot.
    let kd_bins = tree.collect_bins_with_known_deleted();
    assert!(
        !kd_bins.is_empty(),
        "TREE-F1: collect_bins_with_known_deleted must still observe the \
         KD slot so the compressor can reclaim it"
    );
}
11755}