Skip to main content

luci/inverted/
term_dict.rs

1//! FST-based term dictionary with automaton search.
2//!
3//! Maps terms (strings) to posting list offsets using a finite state
4//! transducer. Supports O(term_length) exact lookup, prefix iteration,
5//! and automaton-based search for wildcard/fuzzy/regex queries.
6//!
7//! The FST compresses shared prefixes and suffixes, and enables
8//! DFA intersection that prunes entire subtrees of non-matching terms.
9//!
10//! See [[inverted-index]], [[optimization-wildcard-fuzzy-queries]],
11//! and [[architecture-overview#Step 2]].
12
13/// Builds a term dictionary. Terms must be added in lexicographic order.
14pub struct TermDictBuilder {
15    fst_builder: fst::MapBuilder<Vec<u8>>,
16}
17
18impl TermDictBuilder {
19    pub fn new() -> Self {
20        Self {
21            fst_builder: fst::MapBuilder::memory(),
22        }
23    }
24
25    /// Add a term → offset mapping. Terms must be added in sorted order.
26    pub fn add(&mut self, term: &str, offset: u64) {
27        self.fst_builder
28            .insert(term.as_bytes(), offset)
29            .expect("terms must be added in sorted order");
30    }
31
32    /// Finalize and return the encoded term dictionary bytes.
33    pub fn finish(self) -> Vec<u8> {
34        self.fst_builder
35            .into_inner()
36            .expect("FST builder finish failed")
37    }
38}
39
40impl Default for TermDictBuilder {
41    fn default() -> Self {
42        Self::new()
43    }
44}
45
46/// Read-only term dictionary backed by an FST.
47pub struct TermDict<'a> {
48    fst: fst::Map<&'a [u8]>,
49}
50
51impl<'a> TermDict<'a> {
52    /// Open a term dictionary from encoded bytes.
53    pub fn open(data: &'a [u8]) -> Self {
54        let fst = if data.is_empty() {
55            // Empty FST — build a minimal one
56            let builder = fst::MapBuilder::memory();
57            let bytes = builder.into_inner().unwrap();
58            // Leak the bytes so we can get a 'a reference
59            // This is safe because empty FSTs are tiny (8 bytes)
60            let leaked: &'a [u8] = Box::leak(bytes.into_boxed_slice());
61            fst::Map::new(leaked).unwrap()
62        } else {
63            fst::Map::new(data).expect("invalid FST data in term dictionary")
64        };
65        Self { fst }
66    }
67
68    /// Number of terms in the dictionary.
69    pub fn len(&self) -> u32 {
70        self.fst.len() as u32
71    }
72
73    /// Whether the dictionary is empty.
74    pub fn is_empty(&self) -> bool {
75        self.fst.is_empty()
76    }
77
78    /// Look up a term and return its posting list offset, or None.
79    pub fn get(&self, term: &str) -> Option<u64> {
80        self.fst.get(term.as_bytes())
81    }
82
83    /// Collect all terms starting with `prefix`, returning (term, offset) pairs.
84    pub fn prefix_iter(&self, prefix: &str) -> Vec<(String, u64)> {
85        collect_prefix(&self.fst, prefix)
86    }
87
88    /// Search the dictionary with an automaton. Returns all matching
89    /// (term, offset) pairs where the automaton accepts the term.
90    ///
91    /// This is the key optimization for wildcard and fuzzy queries —
92    /// the FST is walked in lockstep with the automaton DFA, pruning
93    /// entire subtrees where `can_match()` returns false.
94    pub fn automaton_search<A: fst::Automaton>(&self, aut: A) -> Vec<(String, u64)> {
95        collect_automaton(&self.fst, aut)
96    }
97}
98
99/// Collect all terms matching a prefix into a Vec.
100/// The FST stream API uses lifetimes that don't compose well with
101/// Iterator, so we collect eagerly.
102fn collect_prefix(fst: &fst::Map<&[u8]>, prefix: &str) -> Vec<(String, u64)> {
103    use fst::IntoStreamer;
104    use fst::Streamer;
105
106    let mut stream = fst.range().ge(prefix.as_bytes()).into_stream();
107    let mut results = Vec::new();
108    while let Some((key, value)) = stream.next() {
109        let term = std::str::from_utf8(key).expect("invalid UTF-8 in term dictionary");
110        if !term.starts_with(prefix) {
111            break;
112        }
113        results.push((term.to_string(), value));
114    }
115    results
116}
117
118/// Collect all terms matching an automaton into a Vec.
119fn collect_automaton<A: fst::Automaton>(fst: &fst::Map<&[u8]>, aut: A) -> Vec<(String, u64)> {
120    use fst::IntoStreamer;
121    use fst::Streamer;
122
123    let mut stream = fst.search(aut).into_stream();
124    let mut results = Vec::new();
125    while let Some((key, value)) = stream.next() {
126        let term = std::str::from_utf8(key).expect("invalid UTF-8 in term dictionary");
127        results.push((term.to_string(), value));
128    }
129    results
130}
131
132#[cfg(test)]
133mod tests {
134    use super::*;
135    use fst::Automaton;
136
137    #[test]
138    fn empty_dict() {
139        let builder = TermDictBuilder::new();
140        let data = builder.finish();
141        let dict = TermDict::open(&data);
142        assert_eq!(dict.len(), 0);
143        assert!(dict.is_empty());
144        assert_eq!(dict.get("anything"), None);
145    }
146
147    #[test]
148    fn single_term() {
149        let mut builder = TermDictBuilder::new();
150        builder.add("hello", 42);
151        let data = builder.finish();
152
153        let dict = TermDict::open(&data);
154        assert_eq!(dict.len(), 1);
155        assert_eq!(dict.get("hello"), Some(42));
156        assert_eq!(dict.get("world"), None);
157    }
158
159    #[test]
160    fn multiple_terms() {
161        let mut builder = TermDictBuilder::new();
162        builder.add("apple", 100);
163        builder.add("banana", 200);
164        builder.add("cherry", 300);
165        let data = builder.finish();
166
167        let dict = TermDict::open(&data);
168        assert_eq!(dict.len(), 3);
169        assert_eq!(dict.get("apple"), Some(100));
170        assert_eq!(dict.get("banana"), Some(200));
171        assert_eq!(dict.get("cherry"), Some(300));
172    }
173
174    #[test]
175    fn lookup_miss() {
176        let mut builder = TermDictBuilder::new();
177        builder.add("apple", 100);
178        builder.add("cherry", 300);
179        let data = builder.finish();
180
181        let dict = TermDict::open(&data);
182        assert_eq!(dict.get("banana"), None);
183        assert_eq!(dict.get("aaa"), None);
184        assert_eq!(dict.get("zzz"), None);
185    }
186
187    #[test]
188    fn binary_search_first_term() {
189        let mut builder = TermDictBuilder::new();
190        builder.add("aaa", 1);
191        builder.add("bbb", 2);
192        builder.add("ccc", 3);
193        builder.add("ddd", 4);
194        builder.add("eee", 5);
195        let data = builder.finish();
196
197        let dict = TermDict::open(&data);
198        assert_eq!(dict.get("aaa"), Some(1));
199    }
200
201    #[test]
202    fn binary_search_last_term() {
203        let mut builder = TermDictBuilder::new();
204        builder.add("aaa", 1);
205        builder.add("bbb", 2);
206        builder.add("ccc", 3);
207        builder.add("ddd", 4);
208        builder.add("eee", 5);
209        let data = builder.finish();
210
211        let dict = TermDict::open(&data);
212        assert_eq!(dict.get("eee"), Some(5));
213    }
214
215    #[test]
216    fn binary_search_middle_term() {
217        let mut builder = TermDictBuilder::new();
218        builder.add("aaa", 1);
219        builder.add("bbb", 2);
220        builder.add("ccc", 3);
221        builder.add("ddd", 4);
222        builder.add("eee", 5);
223        let data = builder.finish();
224
225        let dict = TermDict::open(&data);
226        assert_eq!(dict.get("ccc"), Some(3));
227    }
228
229    #[test]
230    fn unicode_terms() {
231        let mut builder = TermDictBuilder::new();
232        builder.add("cafe", 1);
233        builder.add("caf\u{00e9}", 2);
234        builder.add("na\u{00ef}ve", 3);
235        builder.add("\u{00fc}ber", 4);
236        let data = builder.finish();
237
238        let dict = TermDict::open(&data);
239        assert_eq!(dict.get("cafe"), Some(1));
240        assert_eq!(dict.get("caf\u{00e9}"), Some(2));
241        assert_eq!(dict.get("na\u{00ef}ve"), Some(3));
242        assert_eq!(dict.get("\u{00fc}ber"), Some(4));
243    }
244
245    #[test]
246    fn large_dict() {
247        let count = 10_000;
248        let mut builder = TermDictBuilder::new();
249        let mut terms: Vec<String> = (0..count).map(|i| format!("term_{i:06}")).collect();
250        terms.sort();
251        for (i, term) in terms.iter().enumerate() {
252            builder.add(term, i as u64);
253        }
254        let data = builder.finish();
255
256        let dict = TermDict::open(&data);
257        assert_eq!(dict.len(), count as u32);
258
259        for (i, term) in terms.iter().enumerate() {
260            assert_eq!(dict.get(term), Some(i as u64));
261        }
262    }
263
264    #[test]
265    fn large_offsets() {
266        let mut builder = TermDictBuilder::new();
267        builder.add("a", u64::MAX - 1);
268        builder.add("b", u64::MAX);
269        let data = builder.finish();
270
271        let dict = TermDict::open(&data);
272        assert_eq!(dict.get("a"), Some(u64::MAX - 1));
273        assert_eq!(dict.get("b"), Some(u64::MAX));
274    }
275
276    #[test]
277    fn empty_string_term() {
278        let mut builder = TermDictBuilder::new();
279        builder.add("", 0);
280        builder.add("a", 1);
281        let data = builder.finish();
282
283        let dict = TermDict::open(&data);
284        assert_eq!(dict.get(""), Some(0));
285        assert_eq!(dict.get("a"), Some(1));
286    }
287
288    // --- Prefix iteration tests ---
289
290    #[test]
291    fn prefix_multiple_matches() {
292        let mut builder = TermDictBuilder::new();
293        builder.add("apple", 1);
294        builder.add("application", 2);
295        builder.add("apply", 3);
296        builder.add("banana", 4);
297        builder.add("cherry", 5);
298        let data = builder.finish();
299
300        let dict = TermDict::open(&data);
301        let results: Vec<_> = dict.prefix_iter("app");
302        assert_eq!(results.len(), 3);
303        assert_eq!(results[0], ("apple".to_string(), 1));
304        assert_eq!(results[1], ("application".to_string(), 2));
305        assert_eq!(results[2], ("apply".to_string(), 3));
306    }
307
308    #[test]
309    fn prefix_no_matches() {
310        let mut builder = TermDictBuilder::new();
311        builder.add("apple", 1);
312        builder.add("banana", 2);
313        let data = builder.finish();
314
315        let dict = TermDict::open(&data);
316        let results: Vec<_> = dict.prefix_iter("xyz");
317        assert!(results.is_empty());
318    }
319
320    #[test]
321    fn prefix_empty_yields_all() {
322        let mut builder = TermDictBuilder::new();
323        builder.add("a", 1);
324        builder.add("b", 2);
325        builder.add("c", 3);
326        let data = builder.finish();
327
328        let dict = TermDict::open(&data);
329        let results: Vec<_> = dict.prefix_iter("");
330        assert_eq!(results.len(), 3);
331    }
332
333    #[test]
334    fn prefix_single_char() {
335        let mut builder = TermDictBuilder::new();
336        builder.add("ax", 1);
337        builder.add("ay", 2);
338        builder.add("bx", 3);
339        let data = builder.finish();
340
341        let dict = TermDict::open(&data);
342        let results: Vec<_> = dict.prefix_iter("a");
343        assert_eq!(results.len(), 2);
344        assert_eq!(results[0].0, "ax");
345        assert_eq!(results[1].0, "ay");
346    }
347
348    #[test]
349    fn prefix_exact_term_match() {
350        let mut builder = TermDictBuilder::new();
351        builder.add("hello", 1);
352        builder.add("helloworld", 2);
353        builder.add("help", 3);
354        let data = builder.finish();
355
356        let dict = TermDict::open(&data);
357        let results: Vec<_> = dict.prefix_iter("hello");
358        assert_eq!(results.len(), 2);
359        assert_eq!(results[0].0, "hello");
360        assert_eq!(results[1].0, "helloworld");
361    }
362
363    #[test]
364    fn prefix_unicode() {
365        let mut builder = TermDictBuilder::new();
366        builder.add("cafe", 1);
367        builder.add("caf\u{00e9}", 2);
368        builder.add("car", 3);
369        let data = builder.finish();
370
371        let dict = TermDict::open(&data);
372        let results: Vec<_> = dict.prefix_iter("caf");
373        assert_eq!(results.len(), 2);
374    }
375
376    #[test]
377    fn prefix_on_empty_dict() {
378        let builder = TermDictBuilder::new();
379        let data = builder.finish();
380        let dict = TermDict::open(&data);
381        let results: Vec<_> = dict.prefix_iter("any");
382        assert!(results.is_empty());
383    }
384
385    #[test]
386    fn prefix_at_end_of_dict() {
387        let mut builder = TermDictBuilder::new();
388        builder.add("aaa", 1);
389        builder.add("zzz", 2);
390        let data = builder.finish();
391
392        let dict = TermDict::open(&data);
393        let results: Vec<_> = dict.prefix_iter("zz");
394        assert_eq!(results.len(), 1);
395        assert_eq!(results[0].0, "zzz");
396    }
397
398    // --- Automaton search tests ---
399
400    #[test]
401    fn automaton_search_with_prefix() {
402        let mut builder = TermDictBuilder::new();
403        builder.add("apple", 1);
404        builder.add("application", 2);
405        builder.add("apply", 3);
406        builder.add("banana", 4);
407        builder.add("cherry", 5);
408        let data = builder.finish();
409
410        let dict = TermDict::open(&data);
411        // StartsWith automaton matches all terms starting with "app"
412        let aut = fst::automaton::Str::new("app").starts_with();
413        let results = dict.automaton_search(aut);
414        assert_eq!(results.len(), 3);
415        assert_eq!(results[0].0, "apple");
416        assert_eq!(results[1].0, "application");
417        assert_eq!(results[2].0, "apply");
418    }
419
420    #[test]
421    fn automaton_search_exact_match() {
422        let mut builder = TermDictBuilder::new();
423        builder.add("apple", 1);
424        builder.add("banana", 2);
425        builder.add("cherry", 3);
426        let data = builder.finish();
427
428        let dict = TermDict::open(&data);
429        let aut = fst::automaton::Str::new("banana");
430        let results = dict.automaton_search(aut);
431        assert_eq!(results.len(), 1);
432        assert_eq!(results[0].0, "banana");
433    }
434}