use std::cell::Cell;
use std::ptr;

// SeqLock intentionally performs concurrent read/write on data, validating via
// seq counter afterward. loom::cell::UnsafeCell would flag this as a causality
// violation, so SeqLock data fields always use std::cell::UnsafeCell.
use std::cell::UnsafeCell;

#[cfg(feature = "loom")]
use loom::sync::atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering, fence};
#[cfg(not(feature = "loom"))]
use std::sync::atomic::{AtomicPtr, AtomicU64, AtomicUsize, Ordering, fence};

use crate::Key;
use crate::disk_loc::DiskLoc;
use crate::key::Location;

pub const MAX_HEIGHT: usize = 16;

/// Trait abstracting over the skiplist node types (`ConstNode`, `VarNode`, `TypedNode`).
pub trait SkipNode: Sized + Send + Sync {
    fn key_bytes(&self) -> &[u8];
    fn height(&self) -> u8;
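    /// Whether the node is logically deleted (the low bit of its level-0 tower pointer is set).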
    fn is_marked(&self) -> bool;
    /// Mark the node as logically deleted. Returns `true` if this call performed the marking.
    fn mark(&self) -> bool;
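    /// Successor pointer at `level` (0-based); level 0 carries the deletion tag in its low bit.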
    fn tower(&self, level: usize) -> &AtomicPtr<Self>;
    /// Allocate a sentinel head node with `MAX_HEIGHT` and a zeroed key.
    fn alloc_head() -> *mut Self;
    /// Deallocate a VLA-allocated node (drop inner data + dealloc memory).
    /// # Safety
    /// `ptr` must have been allocated by the corresponding `alloc` or `alloc_head`.
    unsafe fn dealloc_node(ptr: *mut Self);
    /// Seize-compatible reclaimer for VLA-allocated nodes.
    /// # Safety
    /// Same requirements as `dealloc_node`.
    unsafe fn reclaim(ptr: *mut Self, _collector: &seize::Collector);
}

// ---------------------------------------------------------------------------
// SeqLock helpers (loom-compatible)
// ---------------------------------------------------------------------------

/// Read a `Copy` value from an `UnsafeCell` (SeqLock data access).
#[inline(always)]
unsafe fn read_cell<T: Copy>(cell: &UnsafeCell<T>) -> T {
    unsafe { cell.get().read() }
}

/// Write a value into an `UnsafeCell` (SeqLock data access).
#[inline(always)]
unsafe fn write_cell<T>(cell: &UnsafeCell<T>, val: T) {
    unsafe { cell.get().write(val) }
}

/// Yield to loom scheduler or hint spin loop.
#[inline(always)]
fn spin_or_yield() {
    #[cfg(feature = "loom")]
    loom::thread::yield_now();
    #[cfg(not(feature = "loom"))]
    std::hint::spin_loop();
}

// ---------------------------------------------------------------------------

/// SkipList node for `ConstTree` — value and disk location are inlined with SeqLock.
///
/// Generic over `L: Location`: `DiskLoc` (12B) for Bitcask, `u32` (4B) for FixedStore.
///
/// Readers spin-read `seq` (even = stable). Writers (under shard Mutex)
/// bump `seq` to odd, write data, bump to even. No heap allocation per write,
/// no seize retirement for data.
///
/// Tower pointers are stored inline immediately after the struct fields
/// via a variable-length allocation (VLA). `#[repr(C)]` ensures a
/// deterministic layout for the pointer arithmetic in `tower()`.
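///
/// A minimal usage sketch (not compiled; `MyKey` is a hypothetical `Key`
/// implementation, and the caller is assumed to uphold the single-writer rule):
///
/// ```ignore
/// let node = ConstNode::<MyKey, 8>::alloc(key, [0u8; 8], loc, random_height());
/// unsafe {
///     // Writer path, under the shard mutex: seq goes even -> odd -> even.
///     (*node).write_data(new_loc, &new_value);
///     // Reader path, lock-free: spins only while a write is in flight.
///     let (cur_loc, cur_value) = (*node).read_data();
///     // Nodes are reclaimed via `dealloc_node` / `reclaim` (through seize).
/// }
/// ```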
#[repr(C)]
pub struct ConstNode<K: Key, const V: usize, L: Location = DiskLoc> {
    pub key: K,
    seq: AtomicU64,
    loc: UnsafeCell<L>,
    value: UnsafeCell<[u8; V]>,
    height: u8,
    /// Raw pointer to the tower array (follows the struct in the same allocation).
    /// Stored explicitly so that `tower()` can access it with full allocation provenance
    /// (going through `&self` would limit provenance to `size_of::<Self>()` under Stacked Borrows).
    tower_ptr: *const AtomicPtr<Self>,
}

impl<K: Key, const V: usize, L: Location> ConstNode<K, V, L> {
    /// Size of the fixed fields, rounded up to pointer alignment so the
    /// trailing tower array is properly aligned.
    const fn base_size() -> usize {
        let raw = std::mem::size_of::<Self>();
        let align = std::mem::align_of::<AtomicPtr<Self>>();
        (raw + align - 1) & !(align - 1)
    }

    fn layout_for(height: u8) -> std::alloc::Layout {
        let tower_bytes = height as usize * std::mem::size_of::<AtomicPtr<Self>>();
        let total = Self::base_size() + tower_bytes;
        let align = std::mem::align_of::<Self>().max(std::mem::align_of::<AtomicPtr<Self>>());
        std::alloc::Layout::from_size_align(total, align).expect("invalid layout")
    }

    pub fn alloc(key: K, value: [u8; V], loc: L, height: u8) -> *mut Self {
        let layout = Self::layout_for(height);

        unsafe {
            let ptr = std::alloc::alloc(layout) as *mut Self;
            assert!(!ptr.is_null(), "allocation failed");

            // Compute tower base from the allocation pointer (preserves provenance)
            let tower = (ptr as *mut u8).add(Self::base_size()) as *mut AtomicPtr<Self>;

            // Initialize fixed fields
            std::ptr::write(&raw mut (*ptr).key, key);
            std::ptr::write(&raw mut (*ptr).seq, AtomicU64::new(0));
            std::ptr::write(&raw mut (*ptr).loc, UnsafeCell::new(loc));
            std::ptr::write(&raw mut (*ptr).value, UnsafeCell::new(value));
            std::ptr::write(&raw mut (*ptr).height, height);
            std::ptr::write(&raw mut (*ptr).tower_ptr, tower as *const AtomicPtr<Self>);

            // Initialize tower pointers to null
            for i in 0..height as usize {
                std::ptr::write(tower.add(i), AtomicPtr::new(ptr::null_mut()));
            }

            ptr
        }
    }

    /// Lock-free read of location and value.
    /// Spins if a write is in progress (rare: writes are brief and serialized by the shard mutex).
    #[inline]
    #[allow(dead_code)]
    pub fn read_data(&self) -> (L, [u8; V]) {
        loop {
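            // An odd sequence number means a writer is mid-update; back off and retry.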
            let s1 = self.seq.load(Ordering::Acquire);
            if s1 & 1 != 0 {
                spin_or_yield();
                continue;
            }
            let loc = unsafe { read_cell(&self.loc) };
            let value = unsafe { read_cell(&self.value) };
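            // Validate: if seq changed while we were reading, the snapshot may be torn.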
            fence(Ordering::Acquire);
            let s2 = self.seq.load(Ordering::Relaxed);
            if s1 == s2 {
                return (loc, value);
            }
            spin_or_yield();
        }
    }

    /// Read only the location (for compaction).
    #[inline]
    pub fn read_loc(&self) -> L {
        loop {
            let s1 = self.seq.load(Ordering::Acquire);
            if s1 & 1 != 0 {
                spin_or_yield();
                continue;
            }
            let loc = unsafe { read_cell(&self.loc) };
            fence(Ordering::Acquire);
            let s2 = self.seq.load(Ordering::Relaxed);
            if s1 == s2 {
                return loc;
            }
            spin_or_yield();
        }
    }

    /// Read only the value (most common in get()).
    #[inline]
    pub fn read_value(&self) -> [u8; V] {
        loop {
            let s1 = self.seq.load(Ordering::Acquire);
            if s1 & 1 != 0 {
                spin_or_yield();
                continue;
            }
            let value = unsafe { read_cell(&self.value) };
            fence(Ordering::Acquire);
            let s2 = self.seq.load(Ordering::Relaxed);
            if s1 == s2 {
                return value;
            }
            spin_or_yield();
        }
    }

    /// Write location and value. MUST be called under shard mutex (single writer).
    #[inline]
    pub fn write_data(&self, loc: L, value: &[u8; V]) {
        // Mark writing (odd) -- fence ensures this is visible before data writes
        self.seq.fetch_add(1, Ordering::Relaxed);
        fence(Ordering::Release);
        unsafe {
            write_cell(&self.loc, loc);
            write_cell(&self.value, *value);
        }
        // Mark done (even) -- Release ensures data writes visible before this
        self.seq.fetch_add(1, Ordering::Release);
    }

    /// Write only the location (for compaction update_if_match).
    #[inline]
    pub fn write_loc(&self, loc: L) {
        self.seq.fetch_add(1, Ordering::Relaxed);
        fence(Ordering::Release);
        unsafe {
            write_cell(&self.loc, loc);
        }
        self.seq.fetch_add(1, Ordering::Release);
    }
}

impl<K: Key, const V: usize, L: Location> SkipNode for ConstNode<K, V, L> {
    #[inline]
    fn key_bytes(&self) -> &[u8] {
        self.key.as_bytes()
    }

    #[inline]
    fn height(&self) -> u8 {
        self.height
    }

    #[inline]
    fn is_marked(&self) -> bool {
        self.tower(0).load(Ordering::Acquire) as usize & 1 != 0
    }

    fn mark(&self) -> bool {
        // AtomicPtr lacks fetch_or, so reinterpret as AtomicUsize.
        // SAFETY: AtomicPtr<T> and AtomicUsize have identical layout.
        let atomic_usize: &AtomicUsize =
            unsafe { &*(self.tower(0) as *const AtomicPtr<Self> as *const AtomicUsize) };
        let old = atomic_usize.fetch_or(1, Ordering::AcqRel);
        old & 1 == 0
    }

    #[inline]
    fn tower(&self, level: usize) -> &AtomicPtr<Self> {
        debug_assert!(level < self.height as usize);
        // Use tower_ptr (has full allocation provenance) instead of computing
        // from &self (which under Stacked Borrows only covers size_of::<Self>()).
        unsafe { &*self.tower_ptr.add(level) }
    }

    fn alloc_head() -> *mut Self {
        Self::alloc(K::zeroed(), [0u8; V], L::zeroed(), MAX_HEIGHT as u8)
    }

    unsafe fn dealloc_node(ptr: *mut Self) {
        unsafe {
            let height = (*ptr).height;
            // Drop the key in place (no-op for Copy types)
            std::ptr::drop_in_place(&mut (*ptr).key);
            let layout = Self::layout_for(height);
            std::alloc::dealloc(ptr as *mut u8, layout);
        }
    }

    unsafe fn reclaim(ptr: *mut Self, _collector: &seize::Collector) {
        unsafe { Self::dealloc_node(ptr) }
    }
}

// SAFETY: Mutable state is behind atomics or SeqLock-protected UnsafeCells
// (single writer under the shard mutex). Tower is part of the same allocation.
unsafe impl<K: Key, const V: usize, L: Location> Send for ConstNode<K, V, L> {}
unsafe impl<K: Key, const V: usize, L: Location> Sync for ConstNode<K, V, L> {}

#[cfg(feature = "var-collections")]
/// SkipList node for `VarTree` — only stores disk location via RCU (AtomicPtr).
///
/// Tower pointers are stored inline via VLA (see `ConstNode` docs).
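///
/// Illustrative update sketch (not compiled; assumes the caller holds the shard
/// mutex and retires the old pointer through the seize collector):
///
/// ```ignore
/// let fresh = Box::into_raw(Box::new(new_disk_loc));
/// let old = node.swap_disk(fresh);   // publish the new location (RCU)
/// // `old` may still be referenced by readers holding a guard; retire it via
/// // the collector rather than freeing it immediately.
/// ```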
#[repr(C)]
pub struct VarNode<K: Key> {
    pub key: K,
    pub disk: AtomicPtr<DiskLoc>,
    height: u8,
    tower_ptr: *const AtomicPtr<Self>,
}

#[cfg(feature = "var-collections")]
impl<K: Key> VarNode<K> {
    const fn base_size() -> usize {
        let raw = std::mem::size_of::<Self>();
        let align = std::mem::align_of::<AtomicPtr<Self>>();
        (raw + align - 1) & !(align - 1)
    }

    fn layout_for(height: u8) -> std::alloc::Layout {
        let tower_bytes = height as usize * std::mem::size_of::<AtomicPtr<Self>>();
        let total = Self::base_size() + tower_bytes;
        let align = std::mem::align_of::<Self>().max(std::mem::align_of::<AtomicPtr<Self>>());
        std::alloc::Layout::from_size_align(total, align).expect("invalid layout")
    }

    pub fn alloc(key: K, disk: DiskLoc, height: u8) -> *mut Self {
        let layout = Self::layout_for(height);

        unsafe {
            let ptr = std::alloc::alloc(layout) as *mut Self;
            assert!(!ptr.is_null(), "allocation failed");

            let tower = (ptr as *mut u8).add(Self::base_size()) as *mut AtomicPtr<Self>;

            std::ptr::write(&raw mut (*ptr).key, key);
            std::ptr::write(
                &raw mut (*ptr).disk,
                AtomicPtr::new(Box::into_raw(Box::new(disk))),
            );
            std::ptr::write(&raw mut (*ptr).height, height);
            std::ptr::write(&raw mut (*ptr).tower_ptr, tower as *const AtomicPtr<Self>);

            for i in 0..height as usize {
                std::ptr::write(tower.add(i), AtomicPtr::new(ptr::null_mut()));
            }

            ptr
        }
    }

    /// Read the current DiskLoc. Caller must hold a seize Guard.
    #[inline]
    pub fn load_disk(&self) -> &DiskLoc {
        let ptr = self.disk.load(Ordering::Acquire);
        unsafe { &*ptr }
    }

    /// Load the raw DiskLoc pointer (for compare_exchange_disk).
    #[inline]
    pub fn load_disk_ptr(&self) -> *mut DiskLoc {
        self.disk.load(Ordering::Acquire)
    }

    /// Swap in a new DiskLoc, returning the old pointer for retirement.
    /// Caller must hold the shard Mutex.
    #[inline]
    pub fn swap_disk(&self, new_disk: *mut DiskLoc) -> *mut DiskLoc {
        self.disk.swap(new_disk, Ordering::AcqRel)
    }

    /// CAS on the DiskLoc pointer. Used by compaction to avoid TOCTOU race
    /// with concurrent `put` — only succeeds if the pointer hasn't changed
    /// since it was loaded.
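    ///
    /// Illustrative compaction-side sketch (not compiled; names are hypothetical):
    ///
    /// ```ignore
    /// let current = node.load_disk_ptr();
    /// // ... rewrite the record into the new segment, producing `moved_loc` ...
    /// let moved = Box::into_raw(Box::new(moved_loc));
    /// match node.compare_exchange_disk(current, moved) {
    ///     Ok(old) => { /* retire `old` through the seize collector */ }
    ///     Err(_) => unsafe { drop(Box::from_raw(moved)) }, // a concurrent put() won the race
    /// }
    /// ```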
    #[inline]
    pub fn compare_exchange_disk(
        &self,
        expected: *mut DiskLoc,
        new_disk: *mut DiskLoc,
    ) -> Result<*mut DiskLoc, *mut DiskLoc> {
        self.disk
            .compare_exchange(expected, new_disk, Ordering::AcqRel, Ordering::Acquire)
    }
}

#[cfg(feature = "var-collections")]
impl<K: Key> SkipNode for VarNode<K> {
    #[inline]
    fn key_bytes(&self) -> &[u8] {
        self.key.as_bytes()
    }

    #[inline]
    fn height(&self) -> u8 {
        self.height
    }

    #[inline]
    fn is_marked(&self) -> bool {
        self.tower(0).load(Ordering::Acquire) as usize & 1 != 0
    }

    fn mark(&self) -> bool {
        let atomic_usize: &AtomicUsize =
            unsafe { &*(self.tower(0) as *const AtomicPtr<Self> as *const AtomicUsize) };
        let old = atomic_usize.fetch_or(1, Ordering::AcqRel);
        old & 1 == 0
    }

    #[inline]
    fn tower(&self, level: usize) -> &AtomicPtr<Self> {
        debug_assert!(level < self.height as usize);
        unsafe { &*self.tower_ptr.add(level) }
    }

    fn alloc_head() -> *mut Self {
        Self::alloc(K::zeroed(), DiskLoc::new(0, 0, 0, 0), MAX_HEIGHT as u8)
    }

    unsafe fn dealloc_node(ptr: *mut Self) {
        unsafe {
            let height = (*ptr).height;

            #[cfg(not(feature = "loom"))]
            let disk = *(*ptr).disk.get_mut();
            #[cfg(feature = "loom")]
            let disk = (*ptr).disk.unsync_load();
            if !disk.is_null() {
                drop(Box::from_raw(disk));
            }

            std::ptr::drop_in_place(&mut (*ptr).key);

            let layout = Self::layout_for(height);
            std::alloc::dealloc(ptr as *mut u8, layout);
        }
    }

    unsafe fn reclaim(ptr: *mut Self, _collector: &seize::Collector) {
        unsafe { Self::dealloc_node(ptr) }
    }
}

// SAFETY: All mutable state is behind atomics. Tower is part of the same allocation.
#[cfg(feature = "var-collections")]
unsafe impl<K: Key> Send for VarNode<K> {}
#[cfg(feature = "var-collections")]
unsafe impl<K: Key> Sync for VarNode<K> {}

// --- TypedNode (feature: typed-tree) ---

#[cfg(feature = "typed-tree")]
pub struct TypedData<T> {
    pub disk: DiskLoc,
    pub value: T,
}

/// SkipList node for `TypedTree` — stores typed value `T` via RCU (AtomicPtr).
///
/// Same RCU pattern as `VarNode`: readers load `data` with `Acquire`,
/// writers swap with `AcqRel` under shard Mutex, old data retired via `seize`.
///
/// Tower pointers are stored inline via VLA (see `ConstNode` docs).
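///
/// Illustrative sketch (not compiled; assumes a seize guard for reads and the
/// shard mutex for writes):
///
/// ```ignore
/// let current = node.load_data(); // &TypedData<T>, reader under a guard
/// let next = Box::into_raw(Box::new(TypedData { disk: new_loc, value: new_value }));
/// let old = node.swap_data(next); // writer, AcqRel
/// // retire `old` through the seize collector once no reader can still see it
/// ```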
#[cfg(feature = "typed-tree")]
#[repr(C)]
pub struct TypedNode<K: Key, T> {
    pub key: K,
    pub data: AtomicPtr<TypedData<T>>,
    height: u8,
    tower_ptr: *const AtomicPtr<Self>,
}

#[cfg(feature = "typed-tree")]
impl<K: Key, T> TypedNode<K, T> {
    const fn base_size() -> usize {
        let raw = std::mem::size_of::<Self>();
        let align = std::mem::align_of::<AtomicPtr<Self>>();
        (raw + align - 1) & !(align - 1)
    }

    fn layout_for(height: u8) -> std::alloc::Layout {
        let tower_bytes = height as usize * std::mem::size_of::<AtomicPtr<Self>>();
        let total = Self::base_size() + tower_bytes;
        let align = std::mem::align_of::<Self>().max(std::mem::align_of::<AtomicPtr<Self>>());
        std::alloc::Layout::from_size_align(total, align).expect("invalid layout")
    }

    pub fn alloc(key: K, value: T, disk: DiskLoc, height: u8) -> *mut Self {
        let layout = Self::layout_for(height);

        unsafe {
            let ptr = std::alloc::alloc(layout) as *mut Self;
            assert!(!ptr.is_null(), "allocation failed");

            let tower = (ptr as *mut u8).add(Self::base_size()) as *mut AtomicPtr<Self>;

            std::ptr::write(&raw mut (*ptr).key, key);
            std::ptr::write(
                &raw mut (*ptr).data,
                AtomicPtr::new(Box::into_raw(Box::new(TypedData { disk, value }))),
            );
            std::ptr::write(&raw mut (*ptr).height, height);
            std::ptr::write(&raw mut (*ptr).tower_ptr, tower as *const AtomicPtr<Self>);

            for i in 0..height as usize {
                std::ptr::write(tower.add(i), AtomicPtr::new(ptr::null_mut()));
            }

            ptr
        }
    }

    #[inline]
    pub fn load_data(&self) -> &TypedData<T> {
        let ptr = self.data.load(Ordering::Acquire);
        unsafe { &*ptr }
    }

    #[inline]
    pub fn swap_data(&self, new_data: *mut TypedData<T>) -> *mut TypedData<T> {
        self.data.swap(new_data, Ordering::AcqRel)
    }
}

#[cfg(feature = "typed-tree")]
impl<K: Key, T: Send + Sync> SkipNode for TypedNode<K, T> {
    #[inline]
    fn key_bytes(&self) -> &[u8] {
        self.key.as_bytes()
    }

    #[inline]
    fn height(&self) -> u8 {
        self.height
    }

    #[inline]
    fn is_marked(&self) -> bool {
        self.tower(0).load(Ordering::Acquire) as usize & 1 != 0
    }

    fn mark(&self) -> bool {
        let atomic_usize: &AtomicUsize =
            unsafe { &*(self.tower(0) as *const AtomicPtr<Self> as *const AtomicUsize) };
        let old = atomic_usize.fetch_or(1, Ordering::AcqRel);
        old & 1 == 0
    }

    #[inline]
    fn tower(&self, level: usize) -> &AtomicPtr<Self> {
        debug_assert!(level < self.height as usize);
        unsafe { &*self.tower_ptr.add(level) }
    }

    fn alloc_head() -> *mut Self {
        let layout = Self::layout_for(MAX_HEIGHT as u8);

        unsafe {
            let ptr = std::alloc::alloc(layout) as *mut Self;
            assert!(!ptr.is_null(), "allocation failed");

            let tower = (ptr as *mut u8).add(Self::base_size()) as *mut AtomicPtr<Self>;

            std::ptr::write(&raw mut (*ptr).key, K::zeroed());
            std::ptr::write(&raw mut (*ptr).data, AtomicPtr::new(ptr::null_mut()));
            std::ptr::write(&raw mut (*ptr).height, MAX_HEIGHT as u8);
            std::ptr::write(&raw mut (*ptr).tower_ptr, tower as *const AtomicPtr<Self>);

            for i in 0..MAX_HEIGHT {
                std::ptr::write(tower.add(i), AtomicPtr::new(ptr::null_mut()));
            }

            ptr
        }
    }

    unsafe fn dealloc_node(ptr: *mut Self) {
        unsafe {
            let height = (*ptr).height;

            #[cfg(not(feature = "loom"))]
            let data = *(*ptr).data.get_mut();
            #[cfg(feature = "loom")]
            let data = (*ptr).data.unsync_load();
            if !data.is_null() {
                drop(Box::from_raw(data));
            }

            std::ptr::drop_in_place(&mut (*ptr).key);

            let layout = Self::layout_for(height);
            std::alloc::dealloc(ptr as *mut u8, layout);
        }
    }

    unsafe fn reclaim(ptr: *mut Self, _collector: &seize::Collector) {
        unsafe { Self::dealloc_node(ptr) }
    }
}

#[cfg(feature = "typed-tree")]
unsafe impl<K: Key, T: Send + Sync> Send for TypedNode<K, T> {}
#[cfg(feature = "typed-tree")]
unsafe impl<K: Key, T: Send + Sync> Sync for TypedNode<K, T> {}

// --- Random height generation ---

thread_local! {
    static RNG_STATE: Cell<u64> = const { Cell::new(0) };
}

fn xorshift_seed() -> u64 {
    // Derive a per-thread seed from the address of a transient heap allocation.
    let boxed = Box::new(0u8);
    let addr = &*boxed as *const u8 as u64;
    drop(boxed);
    addr.wrapping_mul(6364136223846793005)
        .wrapping_add(1442695040888963407)
}

/// Generate a random height for a new SkipList node.
/// Probability 0.25 for each additional level.
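/// With continuation probability 1/4, P(height = h) = (3/4) * (1/4)^(h-1)
/// (capped at MAX_HEIGHT), giving an expected height of about 4/3 ≈ 1.33 levels.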
pub fn random_height() -> u8 {
    let mut height = 1u8;
    RNG_STATE.with(|state| {
        let mut s = state.get();
        if s == 0 {
            s = xorshift_seed();
        }
        // xorshift64
        s ^= s << 13;
        s ^= s >> 7;
        s ^= s << 17;
        state.set(s);

        let mut r = s as u32;
        while height < MAX_HEIGHT as u8 && (r & 3) == 0 {
            height += 1;
            r >>= 2;
        }
    });
    height
}