lsm_tree/prefix.rs
1// Copyright (c) 2025-present, fjall-rs
2// This source code is licensed under both the Apache 2.0 and MIT License
3// (found in the LICENSE-* files in the repository)
4
5/// Extracts prefixes from keys for prefix bloom filter indexing.
6///
7/// When a `PrefixExtractor` is configured on a tree, the bloom filter indexes
8/// not only full keys but also the prefixes returned by [`PrefixExtractor::prefixes`].
9/// This allows prefix scans to skip entire segments that contain no keys with a
10/// matching prefix, dramatically reducing I/O for prefix-heavy workloads (e.g.,
11/// graph adjacency lists, time-series buckets).
12///
13/// # Thread Safety
14///
15/// Implementations must be `Send + Sync + UnwindSafe + RefUnwindSafe`.
16/// The extractor is shared across flush, compaction, and read threads via `Arc`,
17/// and may be accessed across panic boundaries (e.g., `catch_unwind` in tests).
18///
19/// # Example
20///
21/// ```
22/// use lsm_tree::PrefixExtractor;
23///
24/// /// Extracts prefixes at each ':' separator boundary.
25/// ///
26/// /// For key `adj:out:42:KNOWS`, yields:
27/// /// `adj:`, `adj:out:`, `adj:out:42:`
28/// struct ColonSeparatedPrefix;
29///
30/// impl PrefixExtractor for ColonSeparatedPrefix {
31/// fn prefixes<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
32/// Box::new(
33/// key.iter()
34/// .enumerate()
35/// .filter(|(_, b)| **b == b':')
36/// .map(move |(i, _)| &key[..=i]),
37/// )
38/// }
39/// }
40/// ```
41pub trait PrefixExtractor:
42 Send + Sync + std::panic::UnwindSafe + std::panic::RefUnwindSafe
43{
44 /// Returns an iterator of prefixes to index for the given key.
45 ///
46 /// Each yielded prefix will be hashed and inserted into the segment's
47 /// bloom filter. During a prefix scan, the scan prefix is hashed and
48 /// checked against the bloom — segments without a match are skipped.
49 ///
50 /// Implementations should return prefixes from shortest to longest.
51 /// The full key itself is always indexed separately by the standard bloom
52 /// path; including it in the returned prefixes is allowed but redundant
53 /// and generally unnecessary.
54 ///
55 /// The returned iterator must be finite and yield a small number of
56 /// prefixes per key (typically 1–5). It is called on the write path
57 /// for every key during flush and compaction.
58 ///
59 /// # Performance note
60 ///
61 /// Returns `Box<dyn Iterator>` for object safety (`Arc<dyn PrefixExtractor>`).
62 /// Most extractors yield 1–5 prefixes per key, so the allocation is negligible
63 /// compared to the bloom hash + I/O cost. A callback-based `for_each_prefix`
64 /// alternative could avoid this allocation but would expand the trait API
65 /// surface; consider adding it if profiling shows measurable overhead.
66 fn prefixes<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a>;
67
68 // NOTE: Renamed from `is_valid_prefix_boundary` (added in PR #43, never
69 // released). No deprecated shim needed — no downstream consumers exist.
70
71 /// Returns `true` if `prefix` is a valid scan boundary for this extractor.
72 ///
73 /// A scan boundary is valid when **every key** that the tree would consider
74 /// a match for this prefix in a prefix scan had `prefix` indexed via
75 /// [`prefixes`](Self::prefixes) at write time. This is the contract that
76 /// makes bloom-based table skipping safe: if the bloom filter says "no
77 /// match", we can skip the table because every matching key would have
78 /// produced the prefix hash during flush/compaction.
79 ///
80 /// # Default implementation
81 ///
82 /// Checks whether `prefixes(prefix)` emits `prefix` itself — i.e.,
83 /// whether the extractor considers this byte sequence a boundary.
84 /// This is correct for well-behaved extractors whose `prefixes()` returns
85 /// sub-slices of the input key.
86 ///
87 /// # When to override
88 ///
89 /// Override this method when the default self-referential check is either:
90 /// - **Too expensive** — e.g., the extractor can check a sentinel byte in
91 /// O(1) instead of iterating all prefixes.
92 /// - **Incorrect** — e.g., the extractor produces prefixes that are *not*
93 /// sub-slices of the input, so the default `any(|p| p == prefix)` check
94 /// would never match even for valid boundaries.
95 fn is_valid_scan_boundary(&self, prefix: &[u8]) -> bool {
96 !prefix.is_empty() && self.prefixes(prefix).any(|p| p == prefix)
97 }
98}
99
100/// Computes the prefix hash for bloom-filter-based table skipping.
101///
102/// Returns `Some(hash)` only when the scan prefix is non-empty and is a valid
103/// boundary for the configured extractor. Returns `None` otherwise (no bloom
104/// skip will be attempted).
105///
106/// Used by both `Tree::create_prefix` and `BlobTree::prefix` to avoid
107/// duplicating the boundary-check + hashing logic.
108pub fn compute_prefix_hash(
109 extractor: Option<&std::sync::Arc<dyn PrefixExtractor>>,
110 prefix_bytes: &[u8],
111) -> Option<u64> {
112 use crate::table::filter::standard_bloom::Builder;
113
114 if prefix_bytes.is_empty() {
115 return None;
116 }
117
118 extractor
119 .filter(|e| e.is_valid_scan_boundary(prefix_bytes))
120 .map(|_| Builder::get_hash(prefix_bytes))
121}
122
123#[cfg(test)]
124mod tests {
125 use super::*;
126 // Shadows std's #[test] with test_log's version for structured logging.
127 // This IS used — #[test] on each function below resolves to this import.
128 use test_log::test;
129
130 struct ColonSeparatedPrefix;
131
132 impl PrefixExtractor for ColonSeparatedPrefix {
133 fn prefixes<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
134 Box::new(
135 key.iter()
136 .enumerate()
137 .filter(|(_, b)| **b == b':')
138 .map(move |(i, _)| &key[..=i]),
139 )
140 }
141 }
142
143 #[test]
144 fn colon_separated_prefixes() {
145 let extractor = ColonSeparatedPrefix;
146 let key = b"adj:out:42:KNOWS";
147 let prefixes: Vec<&[u8]> = extractor.prefixes(key).collect();
148 assert_eq!(
149 prefixes,
150 vec![
151 b"adj:" as &[u8],
152 b"adj:out:" as &[u8],
153 b"adj:out:42:" as &[u8],
154 ]
155 );
156 }
157
158 #[test]
159 fn no_separator() {
160 let extractor = ColonSeparatedPrefix;
161 let key = b"noseparator";
162 let prefixes: Vec<&[u8]> = extractor.prefixes(key).collect();
163 assert!(prefixes.is_empty());
164 }
165
166 #[test]
167 fn single_separator_at_end() {
168 let extractor = ColonSeparatedPrefix;
169 let key = b"prefix:";
170 let prefixes: Vec<&[u8]> = extractor.prefixes(key).collect();
171 assert_eq!(prefixes, vec![b"prefix:" as &[u8]]);
172 }
173
174 #[test]
175 fn empty_key() {
176 let extractor = ColonSeparatedPrefix;
177 let prefixes: Vec<&[u8]> = extractor.prefixes(b"").collect();
178 assert!(prefixes.is_empty());
179 }
180
181 #[test]
182 fn is_valid_scan_boundary_colon_terminated() {
183 let extractor = ColonSeparatedPrefix;
184 // "adj:" is a valid boundary — extractor emits it for "adj:" input
185 assert!(extractor.is_valid_scan_boundary(b"adj:"));
186 assert!(extractor.is_valid_scan_boundary(b"adj:out:"));
187 assert!(extractor.is_valid_scan_boundary(b"adj:out:42:"));
188 }
189
190 #[test]
191 fn is_valid_scan_boundary_non_boundary() {
192 let extractor = ColonSeparatedPrefix;
193 // "adj" (no trailing colon) is NOT a valid boundary
194 assert!(!extractor.is_valid_scan_boundary(b"adj"));
195 assert!(!extractor.is_valid_scan_boundary(b"adj:out"));
196 assert!(!extractor.is_valid_scan_boundary(b"noseparator"));
197 }
198
199 #[test]
200 fn is_valid_scan_boundary_empty() {
201 let extractor = ColonSeparatedPrefix;
202 assert!(!extractor.is_valid_scan_boundary(b""));
203 }
204
205 /// Extractor that overrides `is_valid_scan_boundary` with an O(1) length
206 /// check instead of iterating all prefixes via the default implementation.
207 struct FixedLengthPrefix;
208
209 impl PrefixExtractor for FixedLengthPrefix {
210 fn prefixes<'a>(&self, key: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
211 if let Some(prefix) = key.get(..4) {
212 Box::new(std::iter::once(prefix))
213 } else {
214 Box::new(std::iter::empty())
215 }
216 }
217
218 fn is_valid_scan_boundary(&self, prefix: &[u8]) -> bool {
219 prefix.len() == 4
220 }
221 }
222
223 #[test]
224 fn fixed_length_prefixes() {
225 let extractor = FixedLengthPrefix;
226 // Key longer than 4 bytes yields a single 4-byte prefix
227 let prefixes: Vec<&[u8]> = extractor.prefixes(b"usr:data").collect();
228 assert_eq!(prefixes, vec![b"usr:" as &[u8]]);
229
230 // Key shorter than 4 bytes yields nothing
231 let prefixes: Vec<&[u8]> = extractor.prefixes(b"ab").collect();
232 assert!(prefixes.is_empty());
233
234 // Key exactly 4 bytes yields itself
235 let prefixes: Vec<&[u8]> = extractor.prefixes(b"abcd").collect();
236 assert_eq!(prefixes, vec![b"abcd" as &[u8]]);
237 }
238
239 #[test]
240 fn custom_scan_boundary_valid() {
241 let extractor = FixedLengthPrefix;
242 assert!(extractor.is_valid_scan_boundary(b"usr:"));
243 assert!(extractor.is_valid_scan_boundary(b"abcd"));
244 }
245
246 #[test]
247 fn custom_scan_boundary_invalid() {
248 let extractor = FixedLengthPrefix;
249 assert!(!extractor.is_valid_scan_boundary(b"ab"));
250 assert!(!extractor.is_valid_scan_boundary(b"toolong"));
251 assert!(!extractor.is_valid_scan_boundary(b""));
252 }
253}