Skip to main content

lsm_tree/
prefix.rs

1// Copyright (c) 2025-present, fjall-rs
2// This source code is licensed under both the Apache 2.0 and MIT License
3// (found in the LICENSE-* files in the repository)
4
5/// Extracts prefixes from keys for prefix bloom filter indexing.
6///
7/// When a `PrefixExtractor` is configured on a tree, the bloom filter indexes
8/// not only full keys but also the prefixes returned by [`PrefixExtractor::prefixes`].
9/// This allows prefix scans to skip entire segments that contain no keys with a
10/// matching prefix, dramatically reducing I/O for prefix-heavy workloads (e.g.,
11/// graph adjacency lists, time-series buckets).
12///
13/// # Thread Safety
14///
15/// Implementations must be `Send + Sync + UnwindSafe + RefUnwindSafe`.
16/// The extractor is shared across flush, compaction, and read threads via `Arc`,
17/// and may be accessed across panic boundaries (e.g., `catch_unwind` in tests).
18///
19/// # Example
20///
21/// ```
22/// use lsm_tree::PrefixExtractor;
23///
24/// /// Extracts prefixes at each ':' separator boundary.
25/// ///
26/// /// For key `adj:out:42:KNOWS`, yields:
27/// ///   `adj:`, `adj:out:`, `adj:out:42:`
28/// struct ColonSeparatedPrefix;
29///
30/// impl PrefixExtractor for ColonSeparatedPrefix {
31///     fn prefixes<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
32///         Box::new(
33///             key.iter()
34///                 .enumerate()
35///                 .filter(|(_, b)| **b == b':')
36///                 .map(move |(i, _)| &key[..=i]),
37///         )
38///     }
39/// }
40/// ```
41pub trait PrefixExtractor:
42    Send + Sync + std::panic::UnwindSafe + std::panic::RefUnwindSafe
43{
44    /// Returns an iterator of prefixes to index for the given key.
45    ///
46    /// Each yielded prefix will be hashed and inserted into the segment's
47    /// bloom filter. During a prefix scan, the scan prefix is hashed and
48    /// checked against the bloom — segments without a match are skipped.
49    ///
50    /// Implementations should return prefixes from shortest to longest.
51    /// The full key itself is always indexed separately by the standard bloom
52    /// path; including it in the returned prefixes is allowed but redundant
53    /// and generally unnecessary.
54    ///
55    /// The returned iterator must be finite and yield a small number of
56    /// prefixes per key (typically 1–5). It is called on the write path
57    /// for every key during flush and compaction.
58    ///
59    /// # Performance note
60    ///
61    /// Returns `Box<dyn Iterator>` for object safety (`Arc<dyn PrefixExtractor>`).
62    /// Most extractors yield 1–5 prefixes per key, so the allocation is negligible
63    /// compared to the bloom hash + I/O cost. A callback-based `for_each_prefix`
64    /// alternative could avoid this allocation but would expand the trait API
65    /// surface; consider adding it if profiling shows measurable overhead.
66    fn prefixes<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a>;
67
68    // NOTE: Renamed from `is_valid_prefix_boundary` (added in PR #43, never
69    // released). No deprecated shim needed — no downstream consumers exist.
70
71    /// Returns `true` if `prefix` is a valid scan boundary for this extractor.
72    ///
73    /// A scan boundary is valid when **every key** that the tree would consider
74    /// a match for this prefix in a prefix scan had `prefix` indexed via
75    /// [`prefixes`](Self::prefixes) at write time. This is the contract that
76    /// makes bloom-based table skipping safe: if the bloom filter says "no
77    /// match", we can skip the table because every matching key would have
78    /// produced the prefix hash during flush/compaction.
79    ///
80    /// # Default implementation
81    ///
82    /// Checks whether `prefixes(prefix)` emits `prefix` itself — i.e.,
83    /// whether the extractor considers this byte sequence a boundary.
84    /// This is correct for well-behaved extractors whose `prefixes()` returns
85    /// sub-slices of the input key.
86    ///
87    /// # When to override
88    ///
89    /// Override this method when the default self-referential check is either:
90    /// - **Too expensive** — e.g., the extractor can check a sentinel byte in
91    ///   O(1) instead of iterating all prefixes.
92    /// - **Incorrect** — e.g., the extractor produces prefixes that are *not*
93    ///   sub-slices of the input, so the default `any(|p| p == prefix)` check
94    ///   would never match even for valid boundaries.
95    fn is_valid_scan_boundary(&self, prefix: &[u8]) -> bool {
96        !prefix.is_empty() && self.prefixes(prefix).any(|p| p == prefix)
97    }
98}
99
100/// Computes the prefix hash for bloom-filter-based table skipping.
101///
102/// Returns `Some(hash)` only when the scan prefix is non-empty and is a valid
103/// boundary for the configured extractor. Returns `None` otherwise (no bloom
104/// skip will be attempted).
105///
106/// Used by both `Tree::create_prefix` and `BlobTree::prefix` to avoid
107/// duplicating the boundary-check + hashing logic.
108pub fn compute_prefix_hash(
109    extractor: Option<&std::sync::Arc<dyn PrefixExtractor>>,
110    prefix_bytes: &[u8],
111) -> Option<u64> {
112    use crate::table::filter::standard_bloom::Builder;
113
114    if prefix_bytes.is_empty() {
115        return None;
116    }
117
118    extractor
119        .filter(|e| e.is_valid_scan_boundary(prefix_bytes))
120        .map(|_| Builder::get_hash(prefix_bytes))
121}
122
123#[cfg(test)]
124mod tests {
125    use super::*;
126    // Shadows std's #[test] with test_log's version for structured logging.
127    // This IS used — #[test] on each function below resolves to this import.
128    use test_log::test;
129
130    struct ColonSeparatedPrefix;
131
132    impl PrefixExtractor for ColonSeparatedPrefix {
133        fn prefixes<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
134            Box::new(
135                key.iter()
136                    .enumerate()
137                    .filter(|(_, b)| **b == b':')
138                    .map(move |(i, _)| &key[..=i]),
139            )
140        }
141    }
142
143    #[test]
144    fn colon_separated_prefixes() {
145        let extractor = ColonSeparatedPrefix;
146        let key = b"adj:out:42:KNOWS";
147        let prefixes: Vec<&[u8]> = extractor.prefixes(key).collect();
148        assert_eq!(
149            prefixes,
150            vec![
151                b"adj:" as &[u8],
152                b"adj:out:" as &[u8],
153                b"adj:out:42:" as &[u8],
154            ]
155        );
156    }
157
158    #[test]
159    fn no_separator() {
160        let extractor = ColonSeparatedPrefix;
161        let key = b"noseparator";
162        let prefixes: Vec<&[u8]> = extractor.prefixes(key).collect();
163        assert!(prefixes.is_empty());
164    }
165
166    #[test]
167    fn single_separator_at_end() {
168        let extractor = ColonSeparatedPrefix;
169        let key = b"prefix:";
170        let prefixes: Vec<&[u8]> = extractor.prefixes(key).collect();
171        assert_eq!(prefixes, vec![b"prefix:" as &[u8]]);
172    }
173
174    #[test]
175    fn empty_key() {
176        let extractor = ColonSeparatedPrefix;
177        let prefixes: Vec<&[u8]> = extractor.prefixes(b"").collect();
178        assert!(prefixes.is_empty());
179    }
180
181    #[test]
182    fn is_valid_scan_boundary_colon_terminated() {
183        let extractor = ColonSeparatedPrefix;
184        // "adj:" is a valid boundary — extractor emits it for "adj:" input
185        assert!(extractor.is_valid_scan_boundary(b"adj:"));
186        assert!(extractor.is_valid_scan_boundary(b"adj:out:"));
187        assert!(extractor.is_valid_scan_boundary(b"adj:out:42:"));
188    }
189
190    #[test]
191    fn is_valid_scan_boundary_non_boundary() {
192        let extractor = ColonSeparatedPrefix;
193        // "adj" (no trailing colon) is NOT a valid boundary
194        assert!(!extractor.is_valid_scan_boundary(b"adj"));
195        assert!(!extractor.is_valid_scan_boundary(b"adj:out"));
196        assert!(!extractor.is_valid_scan_boundary(b"noseparator"));
197    }
198
199    #[test]
200    fn is_valid_scan_boundary_empty() {
201        let extractor = ColonSeparatedPrefix;
202        assert!(!extractor.is_valid_scan_boundary(b""));
203    }
204
205    /// Extractor that overrides `is_valid_scan_boundary` with an O(1) length
206    /// check instead of iterating all prefixes via the default implementation.
207    struct FixedLengthPrefix;
208
209    impl PrefixExtractor for FixedLengthPrefix {
210        fn prefixes<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
211            if let Some(prefix) = key.get(..4) {
212                Box::new(std::iter::once(prefix))
213            } else {
214                Box::new(std::iter::empty())
215            }
216        }
217
218        fn is_valid_scan_boundary(&self, prefix: &[u8]) -> bool {
219            prefix.len() == 4
220        }
221    }
222
223    #[test]
224    fn fixed_length_prefixes() {
225        let extractor = FixedLengthPrefix;
226        // Key longer than 4 bytes yields a single 4-byte prefix
227        let prefixes: Vec<&[u8]> = extractor.prefixes(b"usr:data").collect();
228        assert_eq!(prefixes, vec![b"usr:" as &[u8]]);
229
230        // Key shorter than 4 bytes yields nothing
231        let prefixes: Vec<&[u8]> = extractor.prefixes(b"ab").collect();
232        assert!(prefixes.is_empty());
233
234        // Key exactly 4 bytes yields itself
235        let prefixes: Vec<&[u8]> = extractor.prefixes(b"abcd").collect();
236        assert_eq!(prefixes, vec![b"abcd" as &[u8]]);
237    }
238
239    #[test]
240    fn custom_scan_boundary_valid() {
241        let extractor = FixedLengthPrefix;
242        assert!(extractor.is_valid_scan_boundary(b"usr:"));
243        assert!(extractor.is_valid_scan_boundary(b"abcd"));
244    }
245
246    #[test]
247    fn custom_scan_boundary_invalid() {
248        let extractor = FixedLengthPrefix;
249        assert!(!extractor.is_valid_scan_boundary(b"ab"));
250        assert!(!extractor.is_valid_scan_boundary(b"toolong"));
251        assert!(!extractor.is_valid_scan_boundary(b""));
252    }
253}