masstree/
leaf_trait.rs

1//! Traits for abstracting over leaf node WIDTH variants.
2//!
3//! This module defines [`TreePermutation`] and [`TreeLeafNode`] traits that
4//! enable generic tree operations over both WIDTH=15 and WIDTH=24 leaf nodes.
5//!
6//! # Design
7//!
8//! The traits use static dispatch (generics) for zero-cost abstraction:
9//! - No vtable overhead
10//! - Full monomorphization
11//! - Compiler can inline all trait methods
12//!
13//! # Implementors
14//!
15//! - [`TreePermutation`]: `Permuter<WIDTH>`, `Permuter24`
16//! - [`TreeLeafNode`]: `LeafNode<S, WIDTH>`, `LeafNode24<S>`
17
18use std::cmp::Ordering;
19use std::fmt::Debug;
20
21use crate::key::Key;
22use crate::nodeversion::NodeVersion;
23use crate::slot::ValueSlot;
24use seize::LocalGuard;
25
26// ============================================================================
27// Re-exports from value.rs for use in generic code
28// ============================================================================
29
30pub use crate::value::InsertTarget;
31pub use crate::value::SplitPoint;
32
33// ============================================================================
34//  TreePermutation Trait
35// ============================================================================
36
37/// Trait for permutation types used in leaf nodes.
38///
39/// Abstracts over `Permuter<WIDTH>` (u64) and `Permuter24` (u128), enabling
40/// generic tree operations that work with both WIDTH=15 and WIDTH=24 nodes.
41///
42/// # Associated Types
43///
44/// - `Raw`: The underlying storage type (`u64` or `u128`)
45///
46/// # Implementors
47///
48/// - `Permuter<WIDTH>` for WIDTH in 1..=15
49/// - `Permuter24` for WIDTH=24
pub trait TreePermutation: Copy + Clone + Eq + Debug + Send + Sync + Sized + 'static {
    /// Raw storage type for atomic operations.
    ///
    /// - `Permuter<WIDTH>`: `u64`
    /// - `Permuter24`: `u128`
    type Raw: Copy + Clone + Eq + Debug + Send + Sync + 'static;

    /// Number of slots this permutation supports.
    const WIDTH: usize;

    // ========================================================================
    //  Construction
    // ========================================================================

    /// Create an empty permutation with size = 0.
    ///
    /// Slots are arranged so `back()` returns slot 0 initially.
    fn empty() -> Self;

    /// Create a sorted permutation with `n` elements in slots `0..n`.
    ///
    /// The permutation will have size `n` with logical positions `0..n`
    /// mapping to physical slots `0..n` in order.
    ///
    /// This is used when creating layer nodes during suffix conflict resolution,
    /// where we need a small number of pre-positioned entries.
    ///
    /// # Arguments
    ///
    /// * `n` - Number of elements (`0 <= n <= WIDTH`)
    ///
    /// # Panics
    ///
    /// Panics in debug mode if `n > WIDTH`.
    ///
    /// # Example
    ///
    /// ```ignore
    /// // Create a permutation with 2 sorted entries
    /// let perm = Permuter::<15>::make_sorted(2);
    /// assert_eq!(perm.size(), 2);
    ///
    /// // Position 0 -> Slot 0
    /// assert_eq!(perm.get(0), 0);
    ///
    /// // Position 1 -> Slot 1
    /// assert_eq!(perm.get(1), 1);
    /// ```
    fn make_sorted(n: usize) -> Self;

    /// Create a permutation from a raw storage value.
    ///
    /// Used when loading from atomic storage.
    ///
    /// # Safety Note
    ///
    /// The raw value should be a valid permutation encoding. Invalid values
    /// may cause debug assertions to fail but won't cause undefined behavior.
    fn from_value(raw: Self::Raw) -> Self;

    // ========================================================================
    //  Accessors
    // ========================================================================

    /// Get the raw storage value.
    ///
    /// Used for atomic store/CAS operations.
    fn value(&self) -> Self::Raw;

    /// Get the number of slots in use.
    fn size(&self) -> usize;

    /// Get the physical slot at logical position `i`.
    ///
    /// # Panics
    ///
    /// Debug-panics if `i >= WIDTH`.
    fn get(&self, i: usize) -> usize;

    /// Get the slot at the back (next free slot to allocate).
    ///
    /// Equivalent to `get(WIDTH - 1)`.
    fn back(&self) -> usize;

    /// Get the slot at `back()` with an offset into the free region.
    ///
    /// `back_at_offset(0)` == `back()`.
    ///
    /// # Panics
    ///
    /// Debug-panics if `size() + offset >= WIDTH`.
    fn back_at_offset(&self, offset: usize) -> usize;

    // ========================================================================
    //  Mutation
    // ========================================================================

    /// Allocate a slot from back and insert at position `i`.
    ///
    /// Returns the allocated physical slot index.
    ///
    /// # Panics
    ///
    /// Debug-panics if `i > size()` or `size() >= WIDTH`.
    fn insert_from_back(&mut self, i: usize) -> usize;

    /// Compute insert result without mutation (for CAS operations).
    ///
    /// Returns `(new_permutation, allocated_slot)`.
    ///
    /// Unlike [`Self::insert_from_back`], `self` is left untouched.
    ///
    /// This is used in lock-free CAS insert paths where we need to compute
    /// the new permutation value before attempting an atomic CAS.
    fn insert_from_back_immutable(&self, i: usize) -> (Self, usize);

    /// Swap two slots in the free region (positions >= size).
    ///
    /// Used to skip slot 0 when it can't be reused due to `ikey_bound` constraints.
    ///
    /// NOTE(review): presumably both `pos_i` and `pos_j` must be logical
    /// positions in the free region (`>= size()`) — confirm against
    /// implementations.
    fn swap_free_slots(&mut self, pos_i: usize, pos_j: usize);

    /// Set the size without changing slot positions.
    ///
    /// NOTE(review): no precondition is stated here; presumably `n <= WIDTH`
    /// is required — confirm against implementations.
    fn set_size(&mut self, n: usize);

    /// Remove the slot at logical position `i`.
    ///
    /// The slot is moved to the free region (back) and size is decremented.
    ///
    /// # Panics
    ///
    /// Debug-panics if `i >= size()`.
    fn remove(&mut self, i: usize);
}
181
182// ============================================================================
183//  TreeInternode Trait
184// ============================================================================
185
186/// Trait for internode types used in a `MassTree`.
187///
188/// Abstracts over `InternodeNode` for generic tree operations.
189/// Internode WIDTH is fixed at 15 (matching leaf WIDTH for optimal B+tree fanout).
190///
191/// # Note on Generic Parameter
192///
193/// Unlike leaf nodes, internodes don't store values - only `u64` keys and `*mut u8`
194/// child pointers. The internode type is therefore non-generic, reducing code
195/// monomorphization (one `InternodeNode` implementation regardless of slot type).
196///
197/// # Implementors
198///
199/// - `InternodeNode` (fixed WIDTH=15)
pub trait TreeInternode: Sized + Send + Sync + 'static {
    /// Node width (max number of children).
    const WIDTH: usize;

    // ========================================================================
    //  Construction
    // ========================================================================

    /// Create a new internode with specified height.
    fn new_boxed(height: u32) -> Box<Self>;

    /// Create a new root internode with specified height.
    fn new_root_boxed(height: u32) -> Box<Self>;

    /// Create a new internode sibling for a split operation.
    ///
    /// The new internode is created with a **split-locked** version copied from the
    /// locked parent. This prevents other threads from locking the sibling until
    /// it is installed into the tree and its parent pointer is set.
    ///
    /// # Help-Along Protocol
    ///
    /// This is the internode equivalent of leaf `NodeVersion::new_for_split()`.
    /// The caller MUST call `version().unlock_for_split()` exactly once after:
    /// 1. The sibling is inserted into its parent (grandparent or new root)
    /// 2. The sibling's parent pointer is set
    ///
    /// # C++ Reference
    ///
    /// Matches `next_child->assign_version(*p)` in `masstree_split.hh:234`.
    ///
    /// # Safety
    ///
    /// The `parent_version` must be from a locked node (the parent being split).
    fn new_boxed_for_split(parent_version: &NodeVersion, height: u32) -> Box<Self>;

    // ========================================================================
    //  Version / Locking
    // ========================================================================

    /// Get reference to node version.
    ///
    /// Used for optimistic concurrency control (OCC) and locking.
    fn version(&self) -> &NodeVersion;

    // ========================================================================
    //  Structure
    // ========================================================================

    /// Get the height of this internode.
    fn height(&self) -> u32;

    /// Check if children are leaves (height == 0).
    fn children_are_leaves(&self) -> bool;

    /// Get number of keys.
    fn nkeys(&self) -> usize;

    /// Set number of keys.
    ///
    /// Takes `u8` (the counter's storage width) while [`Self::nkeys`] returns
    /// `usize` for convenient indexing.
    fn set_nkeys(&self, n: u8);

    /// Increment nkeys by 1.
    fn inc_nkeys(&self);

    /// Check if this internode is full.
    ///
    /// NOTE(review): presumably `nkeys() >= WIDTH` — confirm against the
    /// implementation.
    fn is_full(&self) -> bool;

    // ========================================================================
    //  Keys
    // ========================================================================

    /// Get key at index (Acquire ordering).
    fn ikey(&self, idx: usize) -> u64;

    /// Get key at index using Relaxed ordering.
    ///
    /// # Safety Justification
    ///
    /// Callers must have already established ordering via `version.stable()`
    /// or equivalent fence. The Acquire fence from `stable()` synchronizes
    /// with writer's Release stores, making Relaxed loads safe.
    ///
    /// This is safe because:
    /// 1. Call sites do `stable()` which spins until `DIRTY_MASK == 0`
    /// 2. `stable()` issues `fence(Acquire)` before returning
    /// 3. The fence synchronizes with writer's `Release` stores
    /// 4. Therefore, Relaxed loads see fully-published values
    fn ikey_relaxed(&self, idx: usize) -> u64;

    /// Set key at index.
    fn set_ikey(&self, idx: usize, key: u64);

    /// Compare key at position with search key.
    ///
    /// NOTE(review): the direction of the returned `Ordering` (search key
    /// relative to stored key, or vice versa) is not visible here — confirm
    /// against the implementation before relying on it.
    fn compare_key(&self, search_ikey: u64, p: usize) -> Ordering;

    /// Find insert position for a key.
    ///
    /// Presumably returns the position that keeps keys ordered; used together
    /// with [`Self::insert_key_and_child`] — confirm against the implementation.
    fn find_insert_position(&self, insert_ikey: u64) -> usize;

    // ========================================================================
    // Children
    // ========================================================================

    /// Get child pointer at index.
    fn child(&self, idx: usize) -> *mut u8;

    /// Get child pointer with prefetch hint for the next likely child.
    ///
    /// Default implementation just calls `child()` without prefetching.
    /// Optimized implementations can prefetch the next child to hide latency.
    #[inline(always)]
    fn child_with_prefetch(&self, idx: usize, _nkeys: usize) -> *mut u8 {
        self.child(idx)
    }

    /// Get child pointer with depth-first prefetch for descent.
    ///
    /// Prefetches the target child's header and keys, optimizing
    /// for tree descent (going down) rather than sibling traversal.
    ///
    /// # Default Implementation
    ///
    /// Falls back to [`Self::child`] without prefetch.
    #[inline(always)]
    fn child_with_depth_prefetch(&self, idx: usize) -> *mut u8 {
        self.child(idx)
    }

    /// Get child pointer with full prefetch (all key cache lines).
    ///
    /// Prefetches all cache lines containing keys. Use when nodes
    /// are expected to be full.
    ///
    /// # Default Implementation
    ///
    /// Falls back to [`Self::child`] without prefetch.
    #[inline(always)]
    fn child_with_full_prefetch(&self, idx: usize) -> *mut u8 {
        self.child(idx)
    }

    /// Set child pointer at index.
    fn set_child(&self, idx: usize, child: *mut u8);

    /// Assign key and right child at position.
    ///
    /// Unlike [`Self::insert_key_and_child`], this writes in place and does
    /// not shift existing entries.
    fn assign(&self, p: usize, ikey: u64, right_child: *mut u8);

    /// Insert key and child at position, shifting existing entries.
    fn insert_key_and_child(&self, p: usize, new_ikey: u64, new_child: *mut u8);

    // ========================================================================
    // Navigation
    // ========================================================================

    /// Get parent pointer.
    fn parent(&self) -> *mut u8;

    /// Set parent pointer.
    fn set_parent(&self, parent: *mut u8);

    /// Check if this is a root node.
    fn is_root(&self) -> bool;

    // ========================================================================
    //  Split Support
    // ========================================================================

    /// Shift entries from another internode.
    ///
    /// Copies `count` entries from `src` starting at `src_pos` into `self`
    /// starting at `dst_pos`.
    fn shift_from(&self, dst_pos: usize, src: &Self, src_pos: usize, count: usize);

    /// Split this internode into a new sibling while inserting a key/child.
    ///
    /// This method performs the split AND updates all children's parent pointers
    /// in `new_right` to point to `new_right_ptr`. This is critical for correctness:
    /// parent updates must happen inside `split_into` (before returning) to prevent
    /// races where a thread sees a child with a stale parent pointer.
    ///
    /// # Arguments
    ///
    /// * `new_right` - The new right sibling (pre-allocated, mutable reference)
    /// * `new_right_ptr` - Raw pointer to `new_right` for setting parent pointers
    /// * `insert_pos` - Position where the new key/child should be inserted
    /// * `insert_ikey` - The key to insert
    /// * `insert_child` - The child pointer to insert
    ///
    /// # Returns
    ///
    /// `(popup_key, insert_went_left)` where:
    /// - `popup_key` is the key that goes to the parent
    /// - `insert_went_left` is true if the insert went to the left sibling
    ///
    /// # Safety
    ///
    /// * `new_right_ptr` must point to `new_right`
    /// * The caller must hold the lock on `self`
    #[must_use = "popup_key must be inserted into parent node to complete the split"]
    fn split_into(
        &self,
        new_right: &mut Self,
        new_right_ptr: *mut Self,
        insert_pos: usize,
        insert_ikey: u64,
        insert_child: *mut u8,
    ) -> (u64, bool);
}
402
403// ============================================================================
404//  TreeLeafNode Trait
405// ============================================================================
406
407/// Trait for leaf node types that can be used in a [`crate::MassTree`].
408///
409/// Abstracts over `LeafNode<S, WIDTH>` and `LeafNode24<S>`, enabling generic
410/// tree operations that work with both WIDTH=15 and WIDTH=24 nodes.
411///
412/// # Type Parameters
413///
414/// - `S`: The slot type (e.g., `LeafValue<V>` or `LeafValueIndex<V>`)
415///
416/// # Associated Types
417///
418/// - `Perm`: The permutation type for this leaf
419/// - `Internode`: The internode type for this tree variant
420///
421/// # Implementors
422///
423/// - `LeafNode<S, WIDTH>` for WIDTH in 1..=15
424/// - `LeafNode24<S>` for WIDTH=24
425pub trait TreeLeafNode<S: ValueSlot>: Sized + Send + Sync + 'static {
426    /// The permutation type for this leaf.
427    type Perm: TreePermutation;
428
429    /// The internode type for this tree variant.
430    type Internode: TreeInternode;
431
432    /// Node width (number of slots).
433    const WIDTH: usize;
434
435    /// Split threshold (trigger split when size >= this).
436    ///
437    /// Default is 80% of WIDTH to reduce split frequency on hot keys.
438    /// This leaves headroom for inserts before triggering a split.
439    ///
440    /// - WIDTH=15: threshold=12 (80%)
441    /// - WIDTH=24: threshold=19 (79%)
442    const SPLIT_THRESHOLD: usize;
443
444    // ========================================================================
445    //  Construction
446    // ========================================================================
447
448    /// Create a new leaf node (heap-allocated).
449    fn new_boxed() -> Box<Self>;
450
451    /// Create a new root leaf node (heap-allocated).
452    fn new_root_boxed() -> Box<Self>;
453
454    /// Create a new leaf node configured as a layer root.
455    ///
456    /// The returned node has:
457    /// - `is_root` flag set via `version.mark_root()`
458    /// - `parent` pointer set to null
459    ///
460    /// Layer roots are used when creating sublayers for keys longer than 8 bytes.
461    /// When two keys share the same 8-byte ikey but have different suffixes,
462    /// a new layer is created to distinguish them by their next 8-byte chunk.
463    fn new_layer_root_boxed() -> Box<Self>;
464
465    // ========================================================================
466    //  NodeVersion Operations
467    // ========================================================================
468
469    /// Get a reference to the node's version.
470    ///
471    /// Used for optimistic concurrency control (OCC) and locking.
472    fn version(&self) -> &NodeVersion;
473
474    // ========================================================================
475    //  Permutation Operations
476    // ========================================================================
477
478    /// Load the current permutation with Acquire ordering.
479    fn permutation(&self) -> Self::Perm;
480
481    /// Store a new permutation with Release ordering.
482    fn set_permutation(&self, perm: Self::Perm);
483
484    /// Load raw permutation value with Acquire ordering.
485    ///
486    /// Used for freeze detection without constructing a Permuter.
487    fn permutation_raw(&self) -> <Self::Perm as TreePermutation>::Raw;
488
489    // ========================================================================
490    //  Key Operations
491    // ========================================================================
492
493    /// Get ikey at physical slot using Acquire ordering.
494    ///
495    /// # Panics
496    ///
497    /// Debug-panics if `slot >= WIDTH`.
498    fn ikey(&self, slot: usize) -> u64;
499
500    /// Get ikey at physical slot using Relaxed ordering.
501    ///
502    /// # Safety Justification
503    ///
504    /// Safe to use when caller has already loaded permutation with Acquire
505    /// ordering and will validate with OCC at the end of the read operation.
506    /// Avoids redundant Acquire fences on each ikey load.
507    ///
508    /// # Panics
509    ///
510    /// Debug-panics if `slot >= WIDTH`.
511    fn ikey_relaxed(&self, slot: usize) -> u64;
512
513    /// Set ikey at physical slot.
514    ///
515    /// # Panics
516    ///
517    /// Debug-panics if `slot >= WIDTH`.
518    fn set_ikey(&self, slot: usize, ikey: u64);
519
520    /// Get ikey bound (slot 0's ikey for B-link navigation).
521    ///
522    /// The `ikey_bound` is the smallest ikey in this leaf and is used
523    /// for navigating to the correct sibling during splits.
524    fn ikey_bound(&self) -> u64;
525
526    /// Find all physical slots with matching ikey, returning a bitmask.
527    ///
528    /// Returns a `u32` where bit `i` is set if `self.ikey(i) == target_ikey`.
529    /// Used for SIMD-accelerated key search.
530    ///
531    /// The default implementation uses a scalar loop. Implementations may
532    /// override with SIMD for better performance.
533    #[inline]
534    fn find_ikey_matches(&self, target_ikey: u64) -> u32 {
535        let mut mask: u32 = 0;
536        for slot in 0..Self::WIDTH {
537            if self.ikey(slot) == target_ikey {
538                mask |= 1 << slot;
539            }
540        }
541        mask
542    }
543
    /// Get keylenx at physical slot.
    ///
    /// Values:
    /// - 0-8: inline key length
    /// - 64 (`KSUF_KEYLENX`): has suffix
    /// - >=128 (`LAYER_KEYLENX`): is layer pointer
    fn keylenx(&self, slot: usize) -> u8;
551
552    /// Set keylenx at physical slot.
553    fn set_keylenx(&self, slot: usize, keylenx: u8);
554
555    /// Check if slot contains a layer pointer.
556    ///
557    /// A layer pointer indicates this slot descends into a sublayer
558    /// for keys longer than 8 bytes at this level.
559    fn is_layer(&self, slot: usize) -> bool;
560
561    /// Check if slot has a suffix.
562    fn has_ksuf(&self, slot: usize) -> bool;
563
564    // ========================================================================
565    //  Value Operations
566    // ========================================================================
567
568    /// Load value pointer at slot.
569    ///
570    /// Returns raw pointer to either an `Arc<V>` (value mode) or
571    /// a sublayer root node (layer mode).
572    fn leaf_value_ptr(&self, slot: usize) -> *mut u8;
573
574    /// Store value pointer at slot.
575    fn set_leaf_value_ptr(&self, slot: usize, ptr: *mut u8);
576
577    /// Update an existing value in place.
578    ///
579    /// This is an optimization for updates where the slot already contains
580    /// a value. For inline storage, this skips the sentinel store since
581    /// the sentinel is already in place.
582    ///
583    /// Default implementation calls `set_leaf_value_ptr`.
584    #[inline(always)]
585    fn update_leaf_value_in_place(&self, slot: usize, ptr: *mut u8) {
586        self.set_leaf_value_ptr(slot, ptr);
587    }
588
589    /// CAS value pointer at slot.
590    ///
591    /// Used in CAS insert path to atomically claim a slot.
592    ///
593    /// # Errors
594    ///
595    /// Returns `Err(actual)` containing the actual pointer value if the CAS
596    /// failed due to a concurrent modification (the slot's current value
597    /// did not match `expected`).
598    fn cas_slot_value(
599        &self,
600        slot: usize,
601        expected: *mut u8,
602        new_value: *mut u8,
603    ) -> Result<(), *mut u8>;
604
605    // ========================================================================
606    //  Slot Clearing (for gc_layer)
607    // ========================================================================
608
609    /// Clear a slot completely, removing any value or layer pointer.
610    ///
611    /// Used by `gc_layer` when cleaning up an empty sublayer.
612    /// The parent leaf's slot that pointed to the sublayer is cleared.
613    ///
614    /// # Safety
615    ///
616    /// The caller must ensure:
617    /// - The leaf is locked
618    /// - The slot is valid (0..WIDTH)
619    /// - Any value/layer at this slot has been or will be properly retired
620    fn clear_slot(&self, slot: usize);
621
622    /// Clear a slot and update permutation.
623    ///
624    /// This is a convenience method that:
625    /// 1. Clears the slot contents
626    /// 2. Removes the slot from the permutation
627    ///
628    /// # Safety
629    ///
630    /// The caller must ensure the leaf is locked.
631    fn clear_slot_and_permutation(&self, slot: usize);
632
633    // ========================================================================
634    //  Size Operations
635    // ========================================================================
636
637    /// Get number of keys in this leaf.
638    #[inline(always)]
639    fn size(&self) -> usize {
640        self.permutation().size()
641    }
642
    /// Check if leaf is empty.
    ///
    /// Defined in terms of [`Self::size`], so an override of `size` also
    /// changes this method's result.
    #[inline(always)]
    fn is_empty(&self) -> bool {
        self.size() == 0
    }
648
    /// Check if leaf is full.
    ///
    /// Compares against the physical capacity `WIDTH`, not `SPLIT_THRESHOLD`;
    /// a leaf may trigger a split before it is physically full.
    #[inline(always)]
    fn is_full(&self) -> bool {
        self.size() >= Self::WIDTH
    }
654
655    // ========================================================================
656    //  Navigation (B-link tree pointers)
657    // ========================================================================
658
659    /// Get next leaf pointer (with mark bit cleared).
660    ///
661    /// The next pointer may be marked during splits. This method
662    /// returns the clean pointer for following the linked list.
663    fn safe_next(&self) -> *mut Self;
664
665    /// Check if next pointer is marked.
666    ///
667    /// A marked next pointer indicates a split is in progress.
668    fn next_is_marked(&self) -> bool;
669
670    /// Set next leaf pointer.
671    fn set_next(&self, next: *mut Self);
672
673    /// Mark the next pointer (during split).
674    fn mark_next(&self);
675
676    /// Unmark the next pointer.
677    fn unmark_next(&self);
678
679    /// Get previous leaf pointer.
680    fn prev(&self) -> *mut Self;
681
682    /// Set previous leaf pointer.
683    fn set_prev(&self, prev: *mut Self);
684
685    /// Unlink this leaf from the B-link doubly-linked chain.
686    ///
687    /// Used when removing an empty leaf from the tree.
688    ///
689    /// # Safety
690    ///
691    /// - Caller must hold the version lock on this leaf
692    /// - `self.prev()` must be non-null (not the leftmost leaf)
693    /// - The prev and next pointers must be valid leaves
694    unsafe fn unlink_from_chain(&self);
695
696    /// Get parent internode pointer.
697    fn parent(&self) -> *mut u8;
698
699    /// Set parent internode pointer.
700    fn set_parent(&self, parent: *mut u8);
701
702    // ========================================================================
703    //  Slot Assignment Helpers
704    // ========================================================================
705
706    /// Check if slot 0 can be reused for a new key.
707    ///
708    /// Slot 0 stores `ikey_bound()` which must be preserved if this
709    /// leaf has a predecessor (prev != null). Slot 0 can only be
710    /// reused if the new key has the same ikey as the current bound.
711    fn can_reuse_slot0(&self, new_ikey: u64) -> bool;
712
713    // ========================================================================
714    //  CAS Insert Support
715    // ========================================================================
716
717    /// Store key metadata (`ikey`, `keylenx`) for a CAS insert attempt.
718    ///
719    /// # Safety
720    ///
721    /// - The caller must have successfully claimed the slot via `cas_slot_value` and ensured
722    ///   the slot still belongs to the CAS attempt (i.e. `leaf_values[slot]` still equals the
723    ///   claimed pointer).
724    ///
725    /// Note: writing key metadata *before* claiming the slot is not safe in this design because
726    /// multiple concurrent CAS attempts can overwrite each other's metadata before publish.
727    unsafe fn store_key_data_for_cas(&self, slot: usize, ikey: u64, keylenx: u8);
728
729    /// Load the raw slot value pointer atomically.
730    ///
731    /// Used to verify slot ownership after CAS claim.
732    fn load_slot_value(&self, slot: usize) -> *mut u8;
733
734    /// Get the raw next pointer (may be marked).
735    ///
736    /// Returns the next pointer without unmarking. Use to check
737    /// if a split is in progress (marked) or get the raw value.
738    fn next_raw(&self) -> *mut Self;
739
740    /// Wait for an in-progress split to complete.
741    ///
742    /// Spins until the next pointer is unmarked and version is stable.
743    fn wait_for_split(&self);
744
745    // ========================================================================
746    //  Split Operations
747    // ========================================================================
748
749    /// Calculate the optimal split point.
750    ///
751    /// # Arguments
752    ///
753    /// * `insert_pos` - Logical position where new key will be inserted
754    /// * `insert_ikey` - The key being inserted
755    ///
756    /// # Returns
757    ///
758    /// `Some(SplitPoint)` with position and split key, or `None` if split
759    /// is not possible (e.g., empty leaf).
760    fn calculate_split_point(&self, insert_pos: usize, insert_ikey: u64) -> Option<SplitPoint>;
761
762    /// Split this leaf at `split_pos` using a pre-allocated target.
763    ///
764    /// Moves entries from `split_pos..size` to `new_leaf_ptr`.
765    /// The caller is responsible for allocating and tracking the new leaf.
766    ///
767    /// # Returns
768    ///
769    /// `(split_ikey, insert_target)` tuple where:
770    /// - `split_ikey` is the first key of the new leaf (separator for parent)
771    /// - `insert_target` indicates which leaf should receive the new key
772    ///
773    /// # Safety
774    ///
775    /// - Caller must hold the leaf lock (if concurrent)
776    /// - `new_leaf_ptr` must point to valid, initialized leaf memory
777    /// - The new leaf should be freshly allocated (empty) with split-locked version
778    /// - `guard` must be valid
779    unsafe fn split_into_preallocated(
780        &self,
781        split_pos: usize,
782        new_leaf_ptr: *mut Self,
783        guard: &seize::LocalGuard<'_>,
784    ) -> (u64, InsertTarget);
785
786    /// Move ALL entries to a new right leaf.
787    ///
788    /// Used for the edge case where `split_pos == 0` in post-insert coordinates.
789    /// The original leaf becomes empty, and all entries move to the new leaf.
790    ///
791    /// # Safety
792    ///
793    /// Same requirements as `split_into_preallocated`.
794    unsafe fn split_all_to_right_preallocated(
795        &self,
796        new_leaf_ptr: *mut Self,
797        guard: &seize::LocalGuard<'_>,
798    ) -> (u64, InsertTarget);
799
800    // ========================================================================
801    //  Sibling Link Helper (for split)
802    // ========================================================================
803
804    /// Link this leaf to a new sibling (B-link tree threading).
805    ///
806    /// Sets up the doubly-linked list: `self.next = new_sibling`,
807    /// `new_sibling.prev = self`, and if there was an old next,
808    /// updates `old_next.prev = new_sibling`.
809    ///
810    /// # Safety
811    ///
812    /// - `new_sibling` must be a valid pointer to a freshly allocated leaf
813    /// - Caller must hold the leaf lock
814    unsafe fn link_sibling(&self, new_sibling: *mut Self);
815
816    // ========================================================================
817    //  Suffix Operations (for split)
818    // ========================================================================
819
820    /// Get suffix at slot (if any).
821    ///
822    /// Returns `None` if no suffix is stored at this slot.
823    fn ksuf(&self, slot: usize) -> Option<&[u8]>;
824
825    /// Assign a suffix to a slot (normal operation).
826    ///
827    /// Uses the permutation to determine which slots are active.
828    /// This is the hot path for normal inserts/updates.
829    ///
830    /// # Safety
831    ///
832    /// - Caller must hold the leaf lock
833    /// - Slot must be valid
834    unsafe fn assign_ksuf(&self, slot: usize, suffix: &[u8], guard: &seize::LocalGuard<'_>);
835
836    /// Assign a suffix to a slot during node initialization (e.g., splits).
837    ///
838    /// Unlike `assign_ksuf`, this assumes slots `0..slot` are already filled
839    /// sequentially and doesn't rely on the permutation. Used during split
840    /// operations when the new node's permutation hasn't been set up yet.
841    ///
842    /// This matches C++ `assign_ksuf`'s `initializing=true` path in
843    /// `masstree_struct.hh:728`.
844    ///
845    /// # Safety
846    ///
847    /// - Caller must hold the leaf lock
848    /// - Slot must be valid
849    /// - Slots 0..slot must already be filled sequentially
850    unsafe fn assign_ksuf_init(&self, slot: usize, suffix: &[u8], guard: &seize::LocalGuard<'_>);
851
852    /// Clear suffix at slot.
853    ///
854    /// # Safety
855    ///
856    /// - Caller must hold the leaf lock
857    unsafe fn clear_ksuf(&self, slot: usize, guard: &seize::LocalGuard<'_>);
858
859    /// Take ownership of the value pointer at slot (for moving during split).
860    ///
861    /// Returns the pointer and clears the slot. Used when moving entries
862    /// between leaves during a split.
863    fn take_leaf_value_ptr(&self, slot: usize) -> *mut u8;
864
865    // ========================================================================
866    //  Suffix Comparison Operations
867    // ========================================================================
868
869    /// Check if a slot's suffix equals the given suffix.
870    ///
871    /// Returns `false` if:
872    /// - Slot has no suffix (`keylenx != KSUF_KEYLENX`)
873    /// - Suffix bag is null
874    /// - Suffixes don't match
875    ///
876    /// # Panics
877    ///
878    /// Panics in debug mode if `slot >= WIDTH`.
879    #[must_use]
880    fn ksuf_equals(&self, slot: usize, suffix: &[u8]) -> bool;
881
882    /// Compare a slot's suffix with the given suffix.
883    ///
884    /// Returns `None` if the slot has no suffix.
885    /// Returns `Some(Ordering)` if comparison is possible.
886    ///
887    /// # Panics
888    ///
889    /// Panics in debug mode if `slot >= WIDTH`.
890    #[must_use]
891    fn ksuf_compare(&self, slot: usize, suffix: &[u8]) -> Option<Ordering>;
892
893    /// Get the suffix for a slot, or an empty slice if none.
894    ///
895    /// Convenience wrapper around `ksuf()`.
896    ///
897    /// # Panics
898    ///
899    /// Panics in debug mode if `slot >= WIDTH`.
900    #[must_use]
901    #[inline(always)]
902    fn ksuf_or_empty(&self, slot: usize) -> &[u8] {
903        self.ksuf(slot).unwrap_or(&[])
904    }
905
    /// Check if a slot's key (ikey + suffix) matches the given full key.
    ///
    /// This compares both the 8-byte ikey and the suffix (if any) —
    /// unlike [`Self::ksuf_match_result`], which assumes the ikey was
    /// already compared by the caller.
    ///
    /// # Arguments
    ///
    /// * `slot` - Physical slot index
    /// * `ikey` - The 8-byte key to compare
    /// * `suffix` - The suffix to compare (bytes after the first 8)
    ///
    /// # Returns
    ///
    /// `true` if both ikey and suffix match.
    ///
    /// # Panics
    ///
    /// Panics in debug mode if `slot >= WIDTH`.
    #[must_use]
    fn ksuf_matches(&self, slot: usize, ikey: u64, suffix: &[u8]) -> bool;

    /// Check if a slot matches the given key parameters, with layer detection.
    ///
    /// This is the layer-aware version of `ksuf_matches` that returns detailed
    /// match information needed for layer traversal.
    ///
    /// # Arguments
    ///
    /// * `slot` - Physical slot index
    /// * `keylenx` - The keylenx of the search key (0-8 for inline, `KSUF_KEYLENX` for suffix)
    /// * `suffix` - The suffix bytes to match (empty if inline key)
    ///
    /// # Returns
    ///
    /// * `1` - Exact match (ikey, keylenx, and suffix all match)
    /// * `0` - Same ikey but different key (keylenx or suffix mismatch)
    /// * `-8` - Slot is a layer pointer; caller should shift key by 8 bytes and descend
    ///
    /// NOTE(review): `-8` presumably encodes the negative of the number of
    /// key bytes consumed when descending a layer, mirroring the C++
    /// masstree convention — confirm with the implementation.
    ///
    /// # Note
    ///
    /// The ikey is assumed to already match (caller should check `leaf.ikey(slot) == ikey`
    /// before calling this method).
    ///
    /// # Panics
    ///
    /// Panics in debug mode if `slot >= WIDTH`.
    #[must_use]
    fn ksuf_match_result(&self, slot: usize, keylenx: u8, suffix: &[u8]) -> i32;
953
954    // ========================================================================
955    //  Cache Optimization
956    // ========================================================================
957
958    /// Prefetch the leaf node's data into cache.
959    ///
960    /// Brings the node's key arrays (`ikey0`, `keylenx`) and value pointers
961    /// (`leaf_values`) into CPU cache before they're accessed, reducing memory
962    /// latency during sequential scanning.
963    ///
964    /// # C++ Reference
965    ///
966    /// Matches C++ `leaf::prefetch()` pattern from `masstree_scan.hh:195, 299`.
967    fn prefetch(&self);
968
969    /// Prefetch the ikey at the given slot into CPU cache.
970    ///
971    /// This is used during linear search to hide memory latency by
972    /// prefetching future ikeys while processing current ones.
973    ///
974    /// # Arguments
975    ///
976    /// * `slot` - Physical slot index (0..WIDTH)
977    ///
978    /// # Default Implementation
979    ///
980    /// No-op. Implementations may override with actual prefetch.
981    #[inline(always)]
982    fn prefetch_ikey(&self, _slot: usize) {
983        // Default no-op; implementations may override
984    }
985
986    /// Prefetch permutation and ikeys for point lookups.
987    ///
988    /// Lighter-weight than [`Self::prefetch`] - only fetches cache lines
989    /// needed for the search loop. Call before `version.stable()` to
990    /// hide memory latency during version spin-wait.
991    ///
992    /// # Default Implementation
993    ///
994    /// No-op. Implementations may override with actual prefetch.
995    #[inline(always)]
996    fn prefetch_for_search(&self) {
997        // Default no-op; implementations may override
998    }
999
1000    /// Prefetch for search, adapting to node size.
1001    ///
1002    /// Avoids unnecessary prefetch instructions for small nodes by
1003    /// only prefetching cache lines that contain valid keys.
1004    ///
1005    /// # Arguments
1006    ///
1007    /// * `size` - Number of keys in the node
1008    ///
1009    /// # Default Implementation
1010    ///
1011    /// Falls back to unconditional [`Self::prefetch_for_search`].
1012    #[inline(always)]
1013    fn prefetch_for_search_adaptive(&self, size: usize) {
1014        let _ = size;
1015        self.prefetch_for_search();
1016    }
1017
1018    // ========================================================================
1019    //  Modification State (modstate) Operations
1020    // ========================================================================
1021
1022    /// Get the modification state.
1023    ///
1024    /// Returns one of:
1025    /// - `0` (`MODSTATE_INSERT`): Normal insert mode
1026    /// - `1` (`MODSTATE_REMOVE`): Node is being removed
1027    /// - `2` (`MODSTATE_DELETED_LAYER`): Layer has been garbage collected
1028    ///
1029    /// # C++ Reference
1030    ///
1031    /// Matches `leaf::modstate_` in `masstree_struct.hh:264-270`.
1032    fn modstate(&self) -> u8;
1033
1034    /// Set the modification state.
1035    fn set_modstate(&self, state: u8);
1036
1037    /// Check if this layer has been deleted (garbage collected).
1038    ///
1039    /// This is distinct from `version.is_deleted()`:
1040    /// - `is_deleted()` means the node itself is removed from the tree
1041    /// - `deleted_layer()` means the sublayer this node was root of has been gc'd
1042    ///
1043    /// When `deleted_layer()` is true, readers should reset their key position
1044    /// and retry from the main tree root.
1045    ///
1046    /// # C++ Reference
1047    ///
1048    /// Matches `leaf::deleted_layer()` in `masstree_struct.hh:456-458`.
1049    fn deleted_layer(&self) -> bool;
1050
1051    /// Mark this layer as deleted (for `gc_layer`).
1052    ///
1053    /// Called when garbage collecting an empty sublayer.
1054    fn mark_deleted_layer(&self);
1055
1056    /// Mark this node as being in remove mode.
1057    ///
1058    /// Called at the start of a remove operation.
1059    fn mark_remove(&self);
1060
1061    /// Check if this node is in remove mode.
1062    fn is_removing(&self) -> bool;
1063
1064    // =========================================================================
1065    //  Empty State (for lazy coalescing)
1066    // =========================================================================
1067
1068    /// Check if this leaf is in empty state (modstate == `MODSTATE_EMPTY`).
1069    ///
1070    /// Empty state means all keys were removed and the leaf is available
1071    /// for reuse by insert or cleanup by background coalescing.
1072    ///
1073    /// Note: Use `is_empty()` (inherited from trait) to check if permutation
1074    /// size is 0. Use `is_empty_state()` to check the modstate flag.
1075    fn is_empty_state(&self) -> bool;
1076
1077    /// Mark this leaf as empty (all keys removed).
1078    ///
1079    /// Called when the last key is removed from a leaf.
1080    fn mark_empty(&self);
1081
1082    /// Clear empty state, returning to normal insert mode.
1083    ///
1084    /// Called when an empty leaf is being reused for a new insert.
1085    fn clear_empty_state(&self);
1086}
1087
1088// =============================================================================
1089// LayerCapableLeaf Trait
1090// =============================================================================
1091
1092/// Extension trait for Arc-mode leaves that support layer creation.
1093///
1094/// This trait adds layer-specific operations needed for handling suffix conflicts
1095/// in keys longer than 8 bytes. It is separate from [`TreeLeafNode`] because the
1096/// methods are specific to `LeafValue<V>` mode (Arc-wrapped values).
1097///
1098/// # When Layer Creation Occurs
1099///
1100/// Layer creation is triggered when:
1101/// 1. Two keys share the same 8-byte ikey
1102/// 2. Both have suffixes (bytes beyond the first 8)
1103/// 3. The suffixes differ
1104/// 4. Neither slot is already a layer pointer
1105///
1106/// This is the "Conflict" case in `InsertSearchResultGeneric`.
1107///
1108/// # Implementors
1109///
1110/// - `LeafNode24<LeafValue<V>>`
1111pub trait LayerCapableLeaf<S: ValueSlot>: TreeLeafNode<S>
1112where
1113    S::Value: Send + Sync + 'static,
1114    S::Output: Send + Sync,
1115{
1116    /// Try to clone the Arc value from a slot.
1117    ///
1118    /// Returns `None` if:
1119    /// - Slot is empty (null pointer)
1120    /// - Slot contains a layer pointer (`keylenx >= LAYER_KEYLENX`)
1121    ///
1122    /// # Safety Considerations
1123    ///
1124    /// This method is safe to call, but the caller should:
1125    /// - Hold the node lock (for write operations), OR
1126    /// - Have validated the version (for read operations)
1127    ///
1128    /// The returned `Arc<V>` is a new strong reference; the original
1129    /// slot's reference count is incremented.
1130    ///
1131    /// # Panics
1132    ///
1133    /// Panics in debug mode if `slot >= WIDTH`.
1134    fn try_clone_output(&self, slot: usize) -> Option<S::Output>;
1135
1136    /// Assign a slot from a Key iterator with an Arc value.
1137    ///
1138    /// This method sets up a slot with:
1139    /// - `ikey` from `key.ikey()`
1140    /// - `keylenx` computed from `key.has_suffix()`:
1141    ///   - If `key.has_suffix()`: `KSUF_KEYLENX` (64)
1142    ///   - Otherwise: `key.current_len().min(8)` (0-8)
1143    /// - Value pointer from `Arc::into_raw(value)`
1144    /// - Suffix data via `assign_ksuf()` if `key.has_suffix()`
1145    ///
1146    /// # Arguments
1147    ///
1148    /// * `slot` - Physical slot index (0..WIDTH)
1149    /// * `key` - The key containing ikey and suffix information
1150    /// * `value` - The Arc-wrapped value. Must be `Some`; `None` will panic.
1151    /// * `guard` - Seize guard for deferred suffix bag retirement
1152    ///
1153    /// # Safety
1154    ///
1155    /// - Caller must hold the node lock
1156    /// - `guard` must come from this tree's collector
1157    /// - Slot must be unoccupied or caller must handle cleanup of old value
1158    ///
1159    /// # Panics
1160    ///
1161    /// - Panics if `value` is `None` (use layer pointer setup methods instead)
1162    /// - Panics in debug mode if `slot >= WIDTH`
1163    unsafe fn assign_from_key_arc(
1164        &self,
1165        slot: usize,
1166        key: &Key<'_>,
1167        value: Option<S::Output>,
1168        guard: &LocalGuard<'_>,
1169    );
1170}
1171
1172// ============================================================================
1173//  Tests
1174// ============================================================================
1175
1176#[cfg(test)]
1177mod unit_tests;