lsm_tree/comparator.rs
1// Copyright (c) 2024-present, fjall-rs
2// This source code is licensed under both the Apache 2.0 and MIT License
3// (found in the LICENSE-* files in the repository)
4
5use std::sync::Arc;
6
7/// Trait for custom user key comparison.
8///
9/// Comparators must be safe across unwind boundaries since they are stored
10/// in tree structures that may be referenced inside `catch_unwind` blocks.
11///
12/// Implementations must define a **strict total order** suitable for use in
13/// sorted data structures (memtable skip list, SST block index, merge heap).
14/// Specifically:
15///
16/// - **Totality**: for all `a`, `b`, exactly one of `Less`, `Equal`, `Greater` holds
17/// - **Transitivity**: `a < b` and `b < c` implies `a < c`
18/// - **Antisymmetry**: `compare(a, b) == Less` iff `compare(b, a) == Greater`
19/// - **Reflexivity**: `compare(a, a) == Equal`
20///
21/// - **Bytewise equality**: `compare(a, b) == Equal` **must** imply `a == b`
22/// byte-for-byte. Bloom filters and hash indexes operate on raw bytes;
23/// if two byte-different keys compare as equal, hash-based lookups will
24/// produce false negatives.
25///
26/// Violating these invariants corrupts the sort order and produces incorrect
27/// query results.
28///
29/// # Important
30///
31/// Once a tree is created with a comparator, it must always be opened with the
32/// same comparator. The comparator's [`name`](UserComparator::name) is
33/// persisted and checked on every subsequent open — a mismatch causes the open
34/// to fail with [`Error::ComparatorMismatch`](crate::Error::ComparatorMismatch).
35///
36/// # Examples
37///
38/// ```
39/// use lsm_tree::UserComparator;
40/// use std::cmp::Ordering;
41///
42/// /// Comparator that orders u64 keys stored as big-endian bytes.
43/// struct U64Comparator;
44///
45/// impl UserComparator for U64Comparator {
46/// fn name(&self) -> &'static str {
47/// "u64-big-endian"
48/// }
49///
50/// fn compare(&self, a: &[u8], b: &[u8]) -> Ordering {
51/// if a.len() == 8 && b.len() == 8 {
52/// // Length checked, conversion cannot fail.
53/// let a_u64 = u64::from_be_bytes(a.try_into().unwrap());
54/// let b_u64 = u64::from_be_bytes(b.try_into().unwrap());
55/// a_u64.cmp(&b_u64)
56/// } else {
57/// // Non-8-byte keys: fall back to lexicographic ordering
58/// // to preserve the bytewise-equality invariant.
59/// a.cmp(b)
60/// }
61/// }
62/// }
63/// ```
64pub trait UserComparator: Send + Sync + std::panic::RefUnwindSafe + 'static {
65 /// Returns a stable identifier for this comparator.
66 ///
67 /// The name is persisted when a tree is first created. On subsequent
68 /// opens the stored name is compared against the caller-supplied
69 /// comparator's name — a mismatch causes the open to fail, preventing
70 /// silent data corruption from using an incompatible ordering.
71 ///
72 /// Choose a name that uniquely identifies the ordering logic and will
73 /// not change across releases (e.g. `"u64-big-endian"`, `"reverse-lexicographic"`).
74 //
75 // Intentionally required (no default impl): a shared fallback name would
76 // let two distinct comparators pass the mismatch check, silently producing
77 // corrupt reads on reopen — the exact scenario this method prevents.
78 fn name(&self) -> &'static str;
79
80 /// Compares two user keys, returning their ordering.
81 fn compare(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering;
82
83 /// Returns `true` if this comparator is lexicographic byte ordering.
84 ///
85 /// When `true`, internal optimizations can avoid allocations in
86 /// prefix-compressed block comparisons. Override only if your
87 /// comparator is truly equivalent to `a.cmp(b)` on raw bytes.
88 fn is_lexicographic(&self) -> bool {
89 false
90 }
91}
92
93/// Default comparator using lexicographic byte ordering.
94///
95/// This is the comparator used when no custom comparator is configured,
96/// preserving backward compatibility with existing trees.
97#[derive(Clone, Debug)]
98pub struct DefaultUserComparator;
99
100impl UserComparator for DefaultUserComparator {
101 fn name(&self) -> &'static str {
102 "default"
103 }
104
105 #[inline]
106 fn compare(&self, a: &[u8], b: &[u8]) -> std::cmp::Ordering {
107 a.cmp(b)
108 }
109
110 #[inline]
111 fn is_lexicographic(&self) -> bool {
112 true
113 }
114}
115
116/// Shared reference to a [`UserComparator`].
117pub type SharedComparator = Arc<dyn UserComparator>;
118
119/// Maximum byte length for a comparator name.
120///
121/// Enforced on write (`persist_version`) and read (`Manifest::decode_from`).
122pub const MAX_COMPARATOR_NAME_BYTES: usize = 256;
123
124/// Returns the default comparator (lexicographic byte ordering).
125///
126/// Uses a shared static instance to avoid repeated allocations.
127#[must_use]
128pub fn default_comparator() -> SharedComparator {
129 // LazyLock creates the Arc once; subsequent calls just clone the Arc (ref-count bump).
130 static DEFAULT: std::sync::LazyLock<SharedComparator> =
131 std::sync::LazyLock::new(|| Arc::new(DefaultUserComparator));
132 DEFAULT.clone()
133}
134
135#[cfg(test)]
136mod tests {
137 use super::*;
138
139 #[test]
140 fn default_comparator_name() {
141 assert_eq!(DefaultUserComparator.name(), "default");
142 assert_eq!(default_comparator().name(), "default");
143 }
144
145 #[test]
146 fn default_comparator_is_lexicographic() {
147 assert!(DefaultUserComparator.is_lexicographic());
148 }
149}