Skip to main content

lsm_tree/
comparator.rs

1// Copyright (c) 2024-present, fjall-rs
2// This source code is licensed under both the Apache 2.0 and MIT License
3// (found in the LICENSE-* files in the repository)
4
5use std::sync::Arc;
6
7/// Trait for custom user key comparison.
8///
9/// Comparators must be safe across unwind boundaries since they are stored
10/// in tree structures that may be referenced inside `catch_unwind` blocks.
11///
12/// Implementations must define a **strict total order** suitable for use in
13/// sorted data structures (memtable skip list, SST block index, merge heap).
14/// Specifically:
15///
16/// - **Totality**: for all `a`, `b`, exactly one of `Less`, `Equal`, `Greater` holds
17/// - **Transitivity**: `a < b` and `b < c` implies `a < c`
18/// - **Antisymmetry**: `compare(a, b) == Less` iff `compare(b, a) == Greater`
19/// - **Reflexivity**: `compare(a, a) == Equal`
20///
21/// - **Bytewise equality**: `compare(a, b) == Equal` **must** imply `a == b`
22///   byte-for-byte. Bloom filters and hash indexes operate on raw bytes;
23///   if two byte-different keys compare as equal, hash-based lookups will
24///   produce false negatives.
25///
26/// Violating these invariants corrupts the sort order and produces incorrect
27/// query results.
28///
29/// # Important
30///
31/// Once a tree is created with a comparator, it must always be opened with the
32/// same comparator. The comparator's [`name`](UserComparator::name) is
33/// persisted and checked on every subsequent open — a mismatch causes the open
34/// to fail with [`Error::ComparatorMismatch`](crate::Error::ComparatorMismatch).
35///
36/// # Examples
37///
38/// ```
39/// use lsm_tree::UserComparator;
40/// use std::cmp::Ordering;
41///
42/// /// Comparator that orders u64 keys stored as big-endian bytes.
43/// struct U64Comparator;
44///
45/// impl UserComparator for U64Comparator {
46///     fn name(&self) -> &'static str {
47///         "u64-big-endian"
48///     }
49///
50///     fn compare(&self, a: &[u8], b: &[u8]) -> Ordering {
51///         if a.len() == 8 && b.len() == 8 {
52///             // Length checked, conversion cannot fail.
53///             let a_u64 = u64::from_be_bytes(a.try_into().unwrap());
54///             let b_u64 = u64::from_be_bytes(b.try_into().unwrap());
55///             a_u64.cmp(&b_u64)
56///         } else {
57///             // Non-8-byte keys: fall back to lexicographic ordering
58///             // to preserve the bytewise-equality invariant.
59///             a.cmp(b)
60///         }
61///     }
62/// }
63/// ```
64pub trait UserComparator: Send + Sync + std::panic::RefUnwindSafe + 'static {
65    /// Returns a stable identifier for this comparator.
66    ///
67    /// The name is persisted when a tree is first created. On subsequent
68    /// opens the stored name is compared against the caller-supplied
69    /// comparator's name — a mismatch causes the open to fail, preventing
70    /// silent data corruption from using an incompatible ordering.
71    ///
72    /// Choose a name that uniquely identifies the ordering logic and will
73    /// not change across releases (e.g. `"u64-big-endian"`, `"reverse-lexicographic"`).
74    //
75    // Intentionally required (no default impl): a shared fallback name would
76    // let two distinct comparators pass the mismatch check, silently producing
77    // corrupt reads on reopen — the exact scenario this method prevents.
78    fn name(&self) -> &'static str;
79
80    /// Compares two user keys, returning their ordering.
81    fn compare(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering;
82
83    /// Returns `true` if this comparator is lexicographic byte ordering.
84    ///
85    /// When `true`, internal optimizations can avoid allocations in
86    /// prefix-compressed block comparisons. Override only if your
87    /// comparator is truly equivalent to `a.cmp(b)` on raw bytes.
88    fn is_lexicographic(&self) -> bool {
89        false
90    }
91}
92
93/// Default comparator using lexicographic byte ordering.
94///
95/// This is the comparator used when no custom comparator is configured,
96/// preserving backward compatibility with existing trees.
97#[derive(Clone, Debug)]
98pub struct DefaultUserComparator;
99
100impl UserComparator for DefaultUserComparator {
101    fn name(&self) -> &'static str {
102        "default"
103    }
104
105    #[inline]
106    fn compare(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering {
107        a.cmp(b)
108    }
109
110    #[inline]
111    fn is_lexicographic(&self) -> bool {
112        true
113    }
114}
115
116/// Shared reference to a [`UserComparator`].
117pub type SharedComparator = Arc<dyn UserComparator>;
118
119/// Maximum byte length for a comparator name.
120///
121/// Enforced on write (`persist_version`) and read (`Manifest::decode_from`).
122pub const MAX_COMPARATOR_NAME_BYTES: usize = 256;
123
124/// Returns the default comparator (lexicographic byte ordering).
125///
126/// Uses a shared static instance to avoid repeated allocations.
127#[must_use]
128pub fn default_comparator() -> SharedComparator {
129    // LazyLock creates the Arc once; subsequent calls just clone the Arc (ref-count bump).
130    static DEFAULT: std::sync::LazyLock<SharedComparator> =
131        std::sync::LazyLock::new(|| Arc::new(DefaultUserComparator));
132    DEFAULT.clone()
133}
134
135#[cfg(test)]
136mod tests {
137    use super::*;
138
139    #[test]
140    fn default_comparator_name() {
141        assert_eq!(DefaultUserComparator.name(), "default");
142        assert_eq!(default_comparator().name(), "default");
143    }
144
145    #[test]
146    fn default_comparator_is_lexicographic() {
147        assert!(DefaultUserComparator.is_lexicographic());
148    }
149}