// coordinode-lsm-tree 5.7.0

// SPDX-License-Identifier: Apache-2.0
// Copyright (c) 2024-present, fjall-rs
// Copyright (c) 2026-present, Structured World Foundation

pub mod arena;
pub mod interval_tree;
pub mod skiplist;
pub mod value_store;

use crate::comparator::SharedComparator;
use crate::key::InternalKey;
use crate::range_tombstone::RangeTombstone;
use crate::{
    UserKey, ValueType,
    value::{InternalValue, SeqNo},
};
#[cfg(not(feature = "std"))]
use alloc::vec::Vec;
use core::ops::RangeBounds;
use core::sync::atomic::AtomicBool;
use portable_atomic::AtomicU64;
// `parking_lot::RwLock` (std: small, userspace fast-path, no poisoning) /
// `spin::RwLock` (no_std). Neither poisons on a panicked holder, so the read/
// write guards are taken without a `LockResult` unwrap.
#[cfg(feature = "std")]
use parking_lot::RwLock;
#[cfg(not(feature = "std"))]
use spin::RwLock;

pub use crate::tree::inner::MemtableId;

/// The memtable serves as an intermediary, ephemeral, sorted storage for new items.
///
/// When the Memtable exceeds some size, it should be flushed to a table.
pub struct Memtable {
    /// Identifier of this memtable, as returned by [`Memtable::id`].
    #[doc(hidden)]
    pub id: MemtableId,

    /// The user key comparator used for ordering entries.
    pub(crate) comparator: SharedComparator,

    /// The actual content, stored in an arena-based skiplist with lock-free traversal.
    ///
    /// Nodes are allocated from a contiguous byte arena for cache locality
    /// and O(1) bulk deallocation when the memtable is dropped.  Traversal of
    /// the skiplist index uses atomic loads and CAS for inserts.
    pub(crate) items: skiplist::SkipMap,

    /// Range tombstones stored in an interval tree.
    ///
    /// Protected by `RwLock` — read-heavy suppression queries (`query_suppression`,
    /// `range_tombstones_sorted`) take a shared read lock, while `insert_range_tombstone`
    /// takes an exclusive write lock. After a rotation has been requested via
    /// `requested_rotation`, the interval tree is treated as read-only by convention,
    /// and only readers are expected to access this field (the `RwLock` is still used
    /// for synchronization, but there should be no further writes).
    ///
    /// The lock is `parking_lot::RwLock` with the `std` feature and `spin::RwLock`
    /// without it. Whatever the fairness policy of the selected implementation,
    /// writer starvation is not a concern here: range deletes are rare, the
    /// write-side critical section is O(log n) with n typically small, and the
    /// memtable rotates (becoming read-only) well before contention could accumulate.
    pub(crate) range_tombstones: RwLock<interval_tree::IntervalTree>,

    /// Approximate active memtable size.
    ///
    /// If this grows too large, a flush is triggered.
    pub(crate) approximate_size: AtomicU64,

    /// Highest encountered sequence number.
    ///
    /// This is used so that `get_highest_seqno` has O(1) complexity.
    pub(crate) highest_seqno: AtomicU64,

    /// Whether rotation of this memtable has been requested.
    ///
    /// Set to `true` by [`Self::flag_rotated`] and read by
    /// [`Self::is_flagged_for_rotation`].
    pub(crate) requested_rotation: AtomicBool,

    /// Whether any insert-time per-KV digest (`KvChecksumComputePoint::AtInsert`)
    /// has been stored in this memtable. Set on the first digest-bearing insert
    /// and read once at flush by [`Self::verify_kv_residence`] to skip walking
    /// the nodes entirely when there is nothing to verify (the default `Off` /
    /// `AtBlockCompile` path). The per-node digest carries its own algorithm, so
    /// no memtable-wide algorithm is tracked here.
    has_at_insert_digests: AtomicBool,
}

impl Memtable {
    /// Returns the memtable ID.
    pub fn id(&self) -> MemtableId {
        self.id
    }

    /// Reports whether rotation has already been requested for this memtable
    /// (see [`Self::flag_rotated`]).
    pub fn is_flagged_for_rotation(&self) -> bool {
        use core::sync::atomic::Ordering;

        self.requested_rotation.load(Ordering::Relaxed)
    }

    /// Marks this memtable as having a rotation requested.
    ///
    /// The flag is only ever set here; nothing in this type clears it again.
    pub fn flag_rotated(&self) {
        use core::sync::atomic::Ordering;

        self.requested_rotation.store(true, Ordering::Relaxed);
    }

    // `pub` + `#[doc(hidden)]`: used by the host crate (fjall) to construct
    // ephemeral memtables. Not part of the semver-stable API.
    // The comparator stays by-value for hidden-public API compatibility while
    // still requiring callers to pass the tree comparator explicitly. The
    // argument itself is moved into the `comparator` field, so only the
    // skiplist and the interval tree need their own clone and no
    // `needless_pass_by_value` suppression is required.
    /// Creates an empty memtable with the given `id`, ordered by `comparator`.
    ///
    /// The same comparator is handed to the skiplist, the range tombstone
    /// interval tree and the memtable itself. The size counter, the highest
    /// seqno and both flags start at their default values.
    #[doc(hidden)]
    #[must_use]
    pub fn new(id: MemtableId, comparator: SharedComparator) -> Self {
        // Build the two comparator-sharing containers first (same order as the
        // fields are declared), then move the argument into the struct.
        let items = skiplist::SkipMap::new(comparator.clone());
        let range_tombstones = RwLock::new(interval_tree::IntervalTree::new_with_comparator(
            comparator.clone(),
        ));

        Self {
            id,
            comparator,
            items,
            range_tombstones,
            approximate_size: AtomicU64::default(),
            highest_seqno: AtomicU64::default(),
            requested_rotation: AtomicBool::default(),
            has_at_insert_digests: AtomicBool::default(),
        }
    }

    /// Returns a double-ended iterator that yields every entry of the
    /// skiplist as an owned [`InternalValue`].
    pub fn iter(&self) -> impl DoubleEndedIterator<Item = InternalValue> + '_ {
        let entries = self.items.iter();

        entries.map(|node| {
            let key = node.key();
            let value = node.value();
            InternalValue { key, value }
        })
    }

    /// Returns a double-ended iterator over the entries that fall inside `range`.
    ///
    /// The bounds are expressed as [`InternalKey`]s; each yielded entry is
    /// materialized as an owned [`InternalValue`].
    pub(crate) fn range_internal<'a, R: RangeBounds<InternalKey> + 'a>(
        &'a self,
        range: R,
    ) -> impl DoubleEndedIterator<Item = InternalValue> + 'a {
        let entries = self.items.range(range);

        entries.map(|node| {
            let key = node.key();
            let value = node.value();
            InternalValue { key, value }
        })
    }

    /// Looks up `key` as seen by a reader at `seqno`.
    ///
    /// Yields the version of `key` with the highest seqno that is strictly
    /// less than the given `seqno`, or `None` if there is no such version
    /// (which is always the case for `seqno == 0`).
    /// Pass [`MAX_SEQNO`](crate::MAX_SEQNO) to retrieve the latest version.
    #[doc(hidden)]
    pub fn get(&self, key: &[u8], seqno: SeqNo) -> Option<InternalValue> {
        // A reader at seqno 0 can see nothing; `checked_sub` also yields the
        // highest seqno that is still visible to the reader.
        let newest_visible = seqno.checked_sub(1)?;

        // Why seeking works: InternalKeys are ordered by user key first and by
        // *descending* seqno second (the value type does not really matter).
        // Seeking to (key, newest_visible) therefore lands on the first entry
        // that is >= the user key and whose seqno is at most the one we want.
        //
        // Example: looking up "abc"
        //
        // key -> seqno
        //
        // a      -> 7
        // abc    -> 5 <<< first hit when reading with seqno=MAX
        // abc    -> 4
        // abc    -> 3 <<< first hit when reading with seqno=4
        // abcdef -> 6
        // abcdef -> 5
        //
        let seek_from = InternalKey::new(key, newest_visible, ValueType::Value);
        let user_cmp = self.comparator.as_ref();

        // Only the first entry at or after the seek position can match; if it
        // belongs to a different user key, the key has no visible version.
        let first_candidate = self.items.range(seek_from..).next();

        first_candidate
            .filter(|entry| {
                user_cmp.compare(entry.user_key_bytes(), key) == core::cmp::Ordering::Equal
            })
            .map(|entry| InternalValue {
                key: entry.key(),
                value: entry.value(),
            })
    }

    /// Returns the current value of the approximate size counter, in bytes.
    pub fn size(&self) -> u64 {
        use core::sync::atomic::Ordering;

        self.approximate_size.load(Ordering::Acquire)
    }

    /// Counts the number of items in the memtable.
    pub fn len(&self) -> usize {
        self.items.len()
    }

    /// Returns `true` only when the memtable holds neither KV items nor range
    /// tombstones.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        // Any KV item settles the answer without touching the tombstone lock.
        if !self.items.is_empty() {
            return false;
        }

        self.range_tombstone_count() == 0
    }

    /// Bulk-inserts `items` without attaching any insert-time per-KV digest.
    ///
    /// Cheaper than repeated [`Memtable::insert`] calls: the size counter is
    /// bumped with a single `fetch_add` for the whole batch and the highest
    /// seqno with a single `fetch_max`.
    ///
    /// Returns `(total_bytes_added, new_memtable_size)`.
    #[doc(hidden)]
    pub fn insert_batch(&self, items: Vec<InternalValue>) -> (u64, u64) {
        // `None` selects the plain path: no digest is computed or stored.
        let no_kv_algo = None;
        self.insert_batch_with_kv_algo(items, no_kv_algo)
    }

    /// Bulk-inserts `items`, optionally attaching an insert-time per-KV digest
    /// to every entry (`KvChecksumComputePoint::AtInsert`).
    ///
    /// With `kv_algo = Some(algo)` (a 4-byte algorithm) each entry's digest is
    /// computed at insert and stored next to it for the flush-time residence
    /// check; with `None` this is the plain bulk path. Accounting is the same
    /// as for [`Self::insert_batch`]: one `fetch_add` for the batch size and
    /// one `fetch_max` for the highest seqno.
    ///
    /// Returns `(total_bytes_added, new_memtable_size)`; an empty batch adds
    /// nothing and reports the current size.
    #[doc(hidden)]
    pub fn insert_batch_with_kv_algo(
        &self,
        items: Vec<InternalValue>,
        kv_algo: Option<crate::runtime_config::ChecksumAlgorithm>,
    ) -> (u64, u64) {
        use core::sync::atomic::Ordering;

        if items.is_empty() {
            return (0, self.approximate_size.load(Ordering::Acquire));
        }

        let per_item_overhead =
            core::mem::size_of::<InternalValue>() + core::mem::size_of::<SharedComparator>();

        // One pass over the batch yields both the byte total and the largest
        // seqno. The byte total is bounded by the in-memory data size, so a
        // plain add cannot overflow u64.
        let (batch_bytes, batch_max_seqno) =
            items
                .iter()
                .fold((0_u64, 0_u64), |(bytes_so_far, top_seqno), item| {
                    #[expect(
                        clippy::expect_used,
                        reason = "keys are limited to 16-bit length + values are limited to 32-bit length"
                    )]
                    let entry_bytes: u64 =
                        (item.key.user_key.len() + item.value.len() + per_item_overhead)
                            .try_into()
                            .expect("should fit into u64");

                    (bytes_so_far + entry_bytes, top_seqno.max(item.key.seqno))
                });

        let size_before = self
            .approximate_size
            .fetch_add(batch_bytes, Ordering::AcqRel);

        if kv_algo.is_some() {
            // Remember that this memtable carries residence digests so the
            // flush-time verify walks the nodes. The algorithm lives per node.
            self.has_at_insert_digests.store(true, Ordering::Relaxed);
        }

        // Digest (low 32 bits) plus its algorithm for one entry, or `None` when
        // no algorithm was requested or no digest is produced for the entry.
        let digest_for =
            |item: &InternalValue| -> Option<(u32, crate::runtime_config::ChecksumAlgorithm)> {
                let algo = kv_algo?;
                let full = crate::table::block::kv_checksum::kv_digest(item, algo)?;
                #[expect(
                    clippy::cast_possible_truncation,
                    reason = "AtInsert is config-validated to a 4-byte algorithm; the digest fits u32"
                )]
                let low_bits = full as u32;
                Some((low_bits, algo))
            };

        for item in items {
            let digest = digest_for(&item);
            let key = InternalKey::new(item.key.user_key, item.key.seqno, item.key.value_type);
            self.items.insert_with_kv_digest(&key, &item.value, digest);
        }

        self.highest_seqno
            .fetch_max(batch_max_seqno, Ordering::AcqRel);

        // `fetch_add` hands back the size from BEFORE the add, so adding the
        // batch total gives the size right after it. Same pattern as
        // Memtable::insert().
        (batch_bytes, size_before + batch_bytes)
    }

    /// Inserts a single item into the memtable.
    ///
    /// Returns `(bytes_added, new_memtable_size)`, where the new size is the
    /// size counter right after this item was accounted for.
    #[doc(hidden)]
    pub fn insert(&self, item: InternalValue) -> (u64, u64) {
        use core::sync::atomic::Ordering;

        // Account for MemtableKey overhead (InternalKey + Arc<dyn UserComparator>)
        let fixed_overhead =
            core::mem::size_of::<InternalValue>() + core::mem::size_of::<SharedComparator>();
        let payload_len = item.key.user_key.len() + item.value.len();

        #[expect(
            clippy::expect_used,
            reason = "keys are limited to 16-bit length + values are limited to 32-bit length"
        )]
        let item_size: u64 = (payload_len + fixed_overhead)
            .try_into()
            .expect("should fit into u64");

        // The size is accounted for before the entry is linked into the skiplist.
        let size_before = self.approximate_size.fetch_add(item_size, Ordering::AcqRel);

        let seqno = item.key.seqno;
        let key = InternalKey::new(item.key.user_key, seqno, item.key.value_type);
        self.items.insert(&key, &item.value);

        self.highest_seqno.fetch_max(seqno, Ordering::AcqRel);

        (item_size, size_before + item_size)
    }

    /// Inserts a single item together with an optional, caller-computed
    /// insert-time per-KV digest (`KvChecksumComputePoint::AtInsert`).
    ///
    /// `kv_digest` is `Some((digest, algo))` when the caller computed the
    /// entry's 4-byte logical-content digest at insert (under `AtInsert` with a
    /// 4-byte algorithm), and `None` on the plain path. A present digest is
    /// stored, with its algorithm, in the skiplist node itself (per node, so a
    /// later config change cannot misverify it), and the memtable records that
    /// it holds at least one digest so [`Self::verify_kv_residence`] knows to
    /// walk the nodes at flush. Mixing digest-bearing and plain inserts is
    /// supported for the `Off` -> `AtInsert` live toggle.
    ///
    /// Returns `(bytes_added, new_memtable_size)`.
    #[doc(hidden)]
    pub fn insert_with_kv_digest(
        &self,
        item: InternalValue,
        kv_digest: Option<(u32, crate::runtime_config::ChecksumAlgorithm)>,
    ) -> (u64, u64) {
        use core::sync::atomic::Ordering;

        let fixed_overhead =
            core::mem::size_of::<InternalValue>() + core::mem::size_of::<SharedComparator>();
        let payload_len = item.key.user_key.len() + item.value.len();

        #[expect(
            clippy::expect_used,
            reason = "keys are limited to 16-bit length + values are limited to 32-bit length"
        )]
        let item_size: u64 = (payload_len + fixed_overhead)
            .try_into()
            .expect("should fit into u64");

        let size_before = self.approximate_size.fetch_add(item_size, Ordering::AcqRel);

        if kv_digest.is_some() {
            // At least one residence digest now lives in this memtable, so the
            // flush-time verify has to walk the nodes. The algorithm is kept
            // per node, not here.
            self.has_at_insert_digests.store(true, Ordering::Relaxed);
        }

        let seqno = item.key.seqno;
        let key = InternalKey::new(item.key.user_key, seqno, item.key.value_type);
        self.items
            .insert_with_kv_digest(&key, &item.value, kv_digest);

        self.highest_seqno.fetch_max(seqno, Ordering::AcqRel);

        (item_size, size_before + item_size)
    }

    /// Runs the
    /// [`KvChecksumComputePoint::AtInsert`](crate::runtime_config::KvChecksumComputePoint::AtInsert)
    /// residence check: every insert-time per-KV digest stored in this memtable
    /// is compared against a recompute over the entry's current bytes. Called
    /// once at flush.
    ///
    /// When no digest-bearing insert ever happened the nodes are not walked at
    /// all and `Ok` is returned right away, so the default path pays nothing.
    ///
    /// # Errors
    ///
    /// - [`crate::Error::MemtableKvChecksumMismatch`] when an entry's stored
    ///   digest diverges from the recompute (a RAM bit-flip during residence).
    /// - [`crate::Error::FeatureUnsupported`] when a node's algorithm is not
    ///   compiled into this build.
    pub fn verify_kv_residence(&self) -> crate::Result<()> {
        use core::sync::atomic::Ordering;

        let carries_digests = self.has_at_insert_digests.load(Ordering::Relaxed);

        if carries_digests {
            self.items.verify_kv_digests()
        } else {
            Ok(())
        }
    }

    /// Inserts a range tombstone covering `[start, end)` at the given seqno.
    ///
    /// On success the tombstone is added to the interval tree, the approximate
    /// size counter grows by the returned amount and the highest seqno is
    /// raised to at least `seqno`.
    ///
    /// Returns the approximate size added to the memtable.
    ///
    /// Returns 0 (and inserts nothing) if `start >= end` under the memtable's
    /// comparator, or if either bound exceeds `u16::MAX` bytes; the latter case
    /// additionally logs a warning.
    ///
    /// # Panics
    ///
    /// In debug builds, panics if the memtable was already flagged for
    /// rotation. Taking the write lock itself cannot panic: the `RwLock` used
    /// here (`parking_lot` with `std`, `spin` without) has no lock poisoning.
    #[must_use]
    pub fn insert_range_tombstone(&self, start: UserKey, end: UserKey, seqno: SeqNo) -> u64 {
        // flag_rotated() (which sets requested_rotation) is called by the host
        // crate (fjall) before rotation; this crate never sets it directly.
        // The assert catches misuse by callers
        // in debug builds — intentionally debug-only because post-rotation writes
        // are structurally prevented by the host (sealed memtables are behind Arc
        // with no write path exposed), and an atomic load here would add overhead
        // on the hot insert path in release builds for no practical benefit.
        debug_assert!(
            !self.is_flagged_for_rotation(),
            "insert_range_tombstone called after memtable was flagged for rotation"
        );

        // Reject invalid intervals in release builds (debug_assert is not enough)
        if self.comparator.compare(&start, &end) != core::cmp::Ordering::Less {
            return 0;
        }

        // On-disk RT format writes key lengths as u16, enforce at insertion time.
        // Emit a warning when rejecting an oversized bound so this failure is diagnosable.
        if u16::try_from(start.len()).is_err() || u16::try_from(end.len()).is_err() {
            log::warn!(
                "insert_range_tombstone: rejecting oversized range tombstone \
                 bounds (start_len = {}, end_len = {}, max = {})",
                start.len(),
                end.len(),
                u16::MAX,
            );
            return 0;
        }

        // Measured before the bounds are moved into the tombstone below.
        let size = (start.len() + end.len() + core::mem::size_of::<RangeTombstone>()) as u64;

        // The write guard is a temporary, so the exclusive lock is released as
        // soon as the insert statement finishes.
        self.range_tombstones
            .write()
            .insert(RangeTombstone::new(start, end, seqno));

        self.approximate_size
            .fetch_add(size, core::sync::atomic::Ordering::AcqRel);

        self.highest_seqno
            .fetch_max(seqno, core::sync::atomic::Ordering::AcqRel);

        size
    }

    /// Asks the interval tree whether the version of `key` written at
    /// `key_seqno` is suppressed by a range tombstone that is visible to a
    /// reader at `read_seqno`.
    pub(crate) fn is_key_suppressed_by_range_tombstone(
        &self,
        key: &[u8],
        key_seqno: SeqNo,
        read_seqno: SeqNo,
    ) -> bool {
        // Shared lock: suppression queries only read the tree.
        let tombstones = self.range_tombstones.read();
        tombstones.query_suppression(key, key_seqno, read_seqno)
    }

    /// Collects every range tombstone of this memtable in sorted order (used
    /// at flush).
    pub(crate) fn range_tombstones_sorted(&self) -> Vec<RangeTombstone> {
        let tombstones = self.range_tombstones.read();
        tombstones.iter_sorted()
    }

    /// Returns how many range tombstones the interval tree currently holds.
    #[must_use]
    pub fn range_tombstone_count(&self) -> usize {
        let tombstones = self.range_tombstones.read();
        tombstones.len()
    }

    /// Returns the highest sequence number recorded by this memtable, or
    /// `None` while it holds neither KV items nor range tombstones.
    pub fn get_highest_seqno(&self) -> Option<SeqNo> {
        use core::sync::atomic::Ordering;

        if self.is_empty() {
            return None;
        }

        let highest = self.highest_seqno.load(Ordering::Acquire);
        Some(highest)
    }
}

#[cfg(test)]
mod tests;