coordinode-lsm-tree 5.7.0

Embedded LSM-tree storage engine: BuRR filters, zstd dictionary compression, MVCC, range tombstones, merge operators, K/V separation, AES-256-GCM at rest.
Documentation
// SPDX-License-Identifier: Apache-2.0
// Copyright (c) 2024-present, fjall-rs
// Copyright (c) 2026-present, Structured World Foundation

#[cfg(not(feature = "std"))]
use alloc::vec::Vec;

use crate::{SeqNo, UserKey, comparator::UserComparator};
use core::cmp::Reverse;

/// A range tombstone that deletes all keys in `[start, end)` at a given sequence number.
///
/// Half-open interval: `start` is inclusive, `end` is exclusive.
/// A key `k` is covered iff `start <= k < end`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct RangeTombstone {
    /// Inclusive start bound
    pub start: UserKey,
    /// Exclusive end bound
    pub end: UserKey,
    /// Sequence number at which this tombstone was written
    pub seqno: SeqNo,
}

impl RangeTombstone {
    /// Creates a new range tombstone for `[start, end)` at the given seqno.
    ///
    /// # Panics (debug only)
    ///
    /// Debug-asserts that `start < end`. Callers must validate untrusted input
    /// before constructing a `RangeTombstone`.
    // No debug_assert on start < end here: with custom comparators,
    // lexicographic order may differ from comparator order. Callers
    // (decode_range_tombstones, insert_range_tombstone) validate using
    // the appropriate comparator or lexicographic order at their level.
    #[must_use]
    pub fn new(start: UserKey, end: UserKey, seqno: SeqNo) -> Self {
        Self { start, end, seqno }
    }

    /// Returns `true` if `key` is within `[start, end)`.
    #[must_use]
    pub fn contains_key(&self, key: &[u8]) -> bool {
        self.contains_key_with(key, &crate::comparator::DefaultUserComparator)
    }

    /// Returns `true` if `key` is within `[start, end)` using the tree comparator.
    #[must_use]
    pub fn contains_key_with(&self, key: &[u8], comparator: &dyn UserComparator) -> bool {
        comparator.compare(&self.start, key) != core::cmp::Ordering::Greater
            && comparator.compare(key, &self.end) == core::cmp::Ordering::Less
    }

    /// Returns `true` if this tombstone is visible at the given read seqno.
    ///
    /// Uses exclusive boundary (`self.seqno < read_seqno`) consistent with
    /// the codebase convention where `seqno` is an exclusive snapshot boundary.
    #[must_use]
    pub fn visible_at(&self, read_seqno: SeqNo) -> bool {
        self.seqno < read_seqno
    }

    /// Returns `true` if this tombstone should suppress a KV with the given seqno
    /// at the given read snapshot.
    ///
    /// Suppress iff: `kv_seqno < self.seqno AND self.contains_key(key) AND self.visible_at(read_seqno)`
    #[must_use]
    pub fn should_suppress(&self, key: &[u8], kv_seqno: SeqNo, read_seqno: SeqNo) -> bool {
        self.should_suppress_with(
            key,
            kv_seqno,
            read_seqno,
            &crate::comparator::DefaultUserComparator,
        )
    }

    /// Comparator-aware variant of [`RangeTombstone::should_suppress`].
    #[must_use]
    pub fn should_suppress_with(
        &self,
        key: &[u8],
        kv_seqno: SeqNo,
        read_seqno: SeqNo,
        comparator: &dyn UserComparator,
    ) -> bool {
        self.visible_at(read_seqno)
            && self.contains_key_with(key, comparator)
            && kv_seqno < self.seqno
    }

    /// Returns the intersection of this tombstone with `[min, max)`, or `None`
    /// if the ranges do not overlap.
    ///
    /// The resulting tombstone has the same seqno as `self`.
    #[must_use]
    pub fn intersect_opt(&self, min: &[u8], max: &[u8]) -> Option<Self> {
        self.intersect_opt_with(min, max, &crate::comparator::DefaultUserComparator)
    }

    /// Comparator-aware variant of [`RangeTombstone::intersect_opt`].
    #[must_use]
    pub fn intersect_opt_with(
        &self,
        min: &[u8],
        max: &[u8],
        comparator: &dyn UserComparator,
    ) -> Option<Self> {
        let new_start_ref = if comparator.compare(&self.start, min) == core::cmp::Ordering::Greater
        {
            self.start.as_ref()
        } else {
            min
        };
        let new_end_ref = if comparator.compare(&self.end, max) == core::cmp::Ordering::Less {
            self.end.as_ref()
        } else {
            max
        };

        if comparator.compare(new_start_ref, new_end_ref) == core::cmp::Ordering::Less {
            Some(Self {
                start: UserKey::from(new_start_ref),
                end: UserKey::from(new_end_ref),
                seqno: self.seqno,
            })
        } else {
            None
        }
    }

    /// Returns `true` if this tombstone fully covers the key range `[min, max]`.
    ///
    /// "Fully covers" means `self.start <= min` AND `max < self.end`.
    /// This uses the half-open convention: the inclusive `max` must be
    /// strictly less than the exclusive `end`.
    #[must_use]
    pub fn fully_covers(&self, min: &[u8], max: &[u8]) -> bool {
        self.fully_covers_with(min, max, &crate::comparator::DefaultUserComparator)
    }

    /// Comparator-aware variant of [`RangeTombstone::fully_covers`].
    #[must_use]
    pub fn fully_covers_with(
        &self,
        min: &[u8],
        max: &[u8],
        comparator: &dyn UserComparator,
    ) -> bool {
        comparator.compare(&self.start, min) != core::cmp::Ordering::Greater
            && comparator.compare(max, &self.end) == core::cmp::Ordering::Less
    }

    /// Comparator-aware ordering for range tombstone processing.
    ///
    /// Ordered by `(start asc, seqno desc, end asc)` using the user comparator
    /// for user-key components.
    #[must_use]
    pub fn cmp_with_comparator(
        &self,
        other: &Self,
        comparator: &dyn UserComparator,
    ) -> core::cmp::Ordering {
        comparator
            .compare(&self.start, &other.start)
            .then_with(|| other.seqno.cmp(&self.seqno))
            .then_with(|| comparator.compare(&self.end, &other.end))
    }
}

/// Ordered by `(start asc, seqno desc, end asc)`.
///
/// The `end` tiebreaker ensures deterministic ordering for debug output
/// and property tests.
impl Ord for RangeTombstone {
    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
        (&self.start, Reverse(self.seqno), &self.end).cmp(&(
            &other.start,
            Reverse(other.seqno),
            &other.end,
        ))
    }
}

impl PartialOrd for RangeTombstone {
    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

/// Information about a covering range tombstone, used for table-skip decisions.
///
/// A covering tombstone fully covers a table's key range and has a seqno
/// greater than the table's max seqno, meaning the entire table can be skipped.
#[derive(Clone, Debug)]
pub struct CoveringRt {
    /// The start key of the covering tombstone (inclusive)
    pub start: UserKey,
    /// The end key of the covering tombstone (exclusive)
    pub end: UserKey,
    /// The seqno of the covering tombstone
    pub seqno: SeqNo,
}

impl CoveringRt {
    /// Returns `true` if this covering tombstone fully covers the given
    /// key range `[min, max]` and has a higher seqno than the table's max.
    #[must_use]
    #[cfg_attr(
        not(test),
        expect(dead_code, reason = "wired up in table-skip optimization")
    )]
    pub fn covers_table(&self, table_min: &[u8], table_max: &[u8], table_max_seqno: SeqNo) -> bool {
        self.covers_table_with(
            table_min,
            table_max,
            table_max_seqno,
            &crate::comparator::DefaultUserComparator,
        )
    }

    pub fn covers_table_with(
        &self,
        table_min: &[u8],
        table_max: &[u8],
        table_max_seqno: SeqNo,
        comparator: &dyn UserComparator,
    ) -> bool {
        comparator.compare(&self.start, table_min) != core::cmp::Ordering::Greater
            && comparator.compare(table_max, &self.end) == core::cmp::Ordering::Less
            && self.seqno > table_max_seqno
    }
}

impl From<&RangeTombstone> for CoveringRt {
    fn from(rt: &RangeTombstone) -> Self {
        Self {
            start: rt.start.clone(),
            end: rt.end.clone(),
            seqno: rt.seqno,
        }
    }
}

/// Computes the upper bound exclusive key for use in range queries.
///
/// Given a key, returns the next key in lexicographic order by appending `0x00`.
/// This is useful for converting inclusive upper bounds to exclusive ones
/// in range-cover queries.
///
/// Returns `None` only when the key is already the lexicographically largest
/// encodable user key in this bounded key domain.
#[must_use]
pub fn upper_bound_exclusive(key: &[u8]) -> Option<UserKey> {
    // The codebase enforces that user keys fit in a u16 length
    // (see `InternalKey::new`). For shorter keys, appending `0x00`
    // yields the immediate strict upper bound while preserving prefix order.
    if key.len() < usize::from(u16::MAX) {
        let mut result = Vec::with_capacity(key.len() + 1);
        result.extend_from_slice(key);
        result.push(0x00);
        return Some(UserKey::from(result));
    }

    // At max length we cannot append, but a strict upper bound still exists
    // unless the key is already the absolute maximum (all `0xFF`).
    let mut result = key.to_vec();

    for (idx, byte) in result.iter_mut().enumerate().rev() {
        if *byte < 0xFF {
            *byte += 1;
            result.truncate(idx + 1);
            return Some(UserKey::from(result));
        }
    }

    None
}

#[cfg(test)]
#[expect(
    clippy::unwrap_used,
    clippy::expect_used,
    reason = "tests use unwrap/expect on controlled fixtures for brevity"
)]
mod tests;