1pub struct TermDictBuilder {
15 fst_builder: fst::MapBuilder<Vec<u8>>,
16}
17
18impl TermDictBuilder {
19 pub fn new() -> Self {
20 Self {
21 fst_builder: fst::MapBuilder::memory(),
22 }
23 }
24
25 pub fn add(&mut self, term: &str, offset: u64) {
27 self.fst_builder
28 .insert(term.as_bytes(), offset)
29 .expect("terms must be added in sorted order");
30 }
31
32 pub fn finish(self) -> Vec<u8> {
34 self.fst_builder
35 .into_inner()
36 .expect("FST builder finish failed")
37 }
38}
39
40impl Default for TermDictBuilder {
41 fn default() -> Self {
42 Self::new()
43 }
44}
45
46pub struct TermDict<'a> {
48 fst: fst::Map<&'a [u8]>,
49}
50
51impl<'a> TermDict<'a> {
52 pub fn open(data: &'a [u8]) -> Self {
54 let fst = if data.is_empty() {
55 let builder = fst::MapBuilder::memory();
57 let bytes = builder.into_inner().unwrap();
58 let leaked: &'a [u8] = Box::leak(bytes.into_boxed_slice());
61 fst::Map::new(leaked).unwrap()
62 } else {
63 fst::Map::new(data).expect("invalid FST data in term dictionary")
64 };
65 Self { fst }
66 }
67
68 pub fn len(&self) -> u32 {
70 self.fst.len() as u32
71 }
72
73 pub fn is_empty(&self) -> bool {
75 self.fst.is_empty()
76 }
77
78 pub fn get(&self, term: &str) -> Option<u64> {
80 self.fst.get(term.as_bytes())
81 }
82
83 pub fn prefix_iter(&self, prefix: &str) -> Vec<(String, u64)> {
85 collect_prefix(&self.fst, prefix)
86 }
87
88 pub fn automaton_search<A: fst::Automaton>(&self, aut: A) -> Vec<(String, u64)> {
95 collect_automaton(&self.fst, aut)
96 }
97}
98
99fn collect_prefix(fst: &fst::Map<&[u8]>, prefix: &str) -> Vec<(String, u64)> {
103 use fst::IntoStreamer;
104 use fst::Streamer;
105
106 let mut stream = fst.range().ge(prefix.as_bytes()).into_stream();
107 let mut results = Vec::new();
108 while let Some((key, value)) = stream.next() {
109 let term = std::str::from_utf8(key).expect("invalid UTF-8 in term dictionary");
110 if !term.starts_with(prefix) {
111 break;
112 }
113 results.push((term.to_string(), value));
114 }
115 results
116}
117
118fn collect_automaton<A: fst::Automaton>(fst: &fst::Map<&[u8]>, aut: A) -> Vec<(String, u64)> {
120 use fst::IntoStreamer;
121 use fst::Streamer;
122
123 let mut stream = fst.search(aut).into_stream();
124 let mut results = Vec::new();
125 while let Some((key, value)) = stream.next() {
126 let term = std::str::from_utf8(key).expect("invalid UTF-8 in term dictionary");
127 results.push((term.to_string(), value));
128 }
129 results
130}
131
132#[cfg(test)]
133mod tests {
134 use super::*;
135 use fst::Automaton;
136
137 #[test]
138 fn empty_dict() {
139 let builder = TermDictBuilder::new();
140 let data = builder.finish();
141 let dict = TermDict::open(&data);
142 assert_eq!(dict.len(), 0);
143 assert!(dict.is_empty());
144 assert_eq!(dict.get("anything"), None);
145 }
146
147 #[test]
148 fn single_term() {
149 let mut builder = TermDictBuilder::new();
150 builder.add("hello", 42);
151 let data = builder.finish();
152
153 let dict = TermDict::open(&data);
154 assert_eq!(dict.len(), 1);
155 assert_eq!(dict.get("hello"), Some(42));
156 assert_eq!(dict.get("world"), None);
157 }
158
159 #[test]
160 fn multiple_terms() {
161 let mut builder = TermDictBuilder::new();
162 builder.add("apple", 100);
163 builder.add("banana", 200);
164 builder.add("cherry", 300);
165 let data = builder.finish();
166
167 let dict = TermDict::open(&data);
168 assert_eq!(dict.len(), 3);
169 assert_eq!(dict.get("apple"), Some(100));
170 assert_eq!(dict.get("banana"), Some(200));
171 assert_eq!(dict.get("cherry"), Some(300));
172 }
173
174 #[test]
175 fn lookup_miss() {
176 let mut builder = TermDictBuilder::new();
177 builder.add("apple", 100);
178 builder.add("cherry", 300);
179 let data = builder.finish();
180
181 let dict = TermDict::open(&data);
182 assert_eq!(dict.get("banana"), None);
183 assert_eq!(dict.get("aaa"), None);
184 assert_eq!(dict.get("zzz"), None);
185 }
186
187 #[test]
188 fn binary_search_first_term() {
189 let mut builder = TermDictBuilder::new();
190 builder.add("aaa", 1);
191 builder.add("bbb", 2);
192 builder.add("ccc", 3);
193 builder.add("ddd", 4);
194 builder.add("eee", 5);
195 let data = builder.finish();
196
197 let dict = TermDict::open(&data);
198 assert_eq!(dict.get("aaa"), Some(1));
199 }
200
201 #[test]
202 fn binary_search_last_term() {
203 let mut builder = TermDictBuilder::new();
204 builder.add("aaa", 1);
205 builder.add("bbb", 2);
206 builder.add("ccc", 3);
207 builder.add("ddd", 4);
208 builder.add("eee", 5);
209 let data = builder.finish();
210
211 let dict = TermDict::open(&data);
212 assert_eq!(dict.get("eee"), Some(5));
213 }
214
215 #[test]
216 fn binary_search_middle_term() {
217 let mut builder = TermDictBuilder::new();
218 builder.add("aaa", 1);
219 builder.add("bbb", 2);
220 builder.add("ccc", 3);
221 builder.add("ddd", 4);
222 builder.add("eee", 5);
223 let data = builder.finish();
224
225 let dict = TermDict::open(&data);
226 assert_eq!(dict.get("ccc"), Some(3));
227 }
228
229 #[test]
230 fn unicode_terms() {
231 let mut builder = TermDictBuilder::new();
232 builder.add("cafe", 1);
233 builder.add("caf\u{00e9}", 2);
234 builder.add("na\u{00ef}ve", 3);
235 builder.add("\u{00fc}ber", 4);
236 let data = builder.finish();
237
238 let dict = TermDict::open(&data);
239 assert_eq!(dict.get("cafe"), Some(1));
240 assert_eq!(dict.get("caf\u{00e9}"), Some(2));
241 assert_eq!(dict.get("na\u{00ef}ve"), Some(3));
242 assert_eq!(dict.get("\u{00fc}ber"), Some(4));
243 }
244
245 #[test]
246 fn large_dict() {
247 let count = 10_000;
248 let mut builder = TermDictBuilder::new();
249 let mut terms: Vec<String> = (0..count).map(|i| format!("term_{i:06}")).collect();
250 terms.sort();
251 for (i, term) in terms.iter().enumerate() {
252 builder.add(term, i as u64);
253 }
254 let data = builder.finish();
255
256 let dict = TermDict::open(&data);
257 assert_eq!(dict.len(), count as u32);
258
259 for (i, term) in terms.iter().enumerate() {
260 assert_eq!(dict.get(term), Some(i as u64));
261 }
262 }
263
264 #[test]
265 fn large_offsets() {
266 let mut builder = TermDictBuilder::new();
267 builder.add("a", u64::MAX - 1);
268 builder.add("b", u64::MAX);
269 let data = builder.finish();
270
271 let dict = TermDict::open(&data);
272 assert_eq!(dict.get("a"), Some(u64::MAX - 1));
273 assert_eq!(dict.get("b"), Some(u64::MAX));
274 }
275
276 #[test]
277 fn empty_string_term() {
278 let mut builder = TermDictBuilder::new();
279 builder.add("", 0);
280 builder.add("a", 1);
281 let data = builder.finish();
282
283 let dict = TermDict::open(&data);
284 assert_eq!(dict.get(""), Some(0));
285 assert_eq!(dict.get("a"), Some(1));
286 }
287
288 #[test]
291 fn prefix_multiple_matches() {
292 let mut builder = TermDictBuilder::new();
293 builder.add("apple", 1);
294 builder.add("application", 2);
295 builder.add("apply", 3);
296 builder.add("banana", 4);
297 builder.add("cherry", 5);
298 let data = builder.finish();
299
300 let dict = TermDict::open(&data);
301 let results: Vec<_> = dict.prefix_iter("app");
302 assert_eq!(results.len(), 3);
303 assert_eq!(results[0], ("apple".to_string(), 1));
304 assert_eq!(results[1], ("application".to_string(), 2));
305 assert_eq!(results[2], ("apply".to_string(), 3));
306 }
307
308 #[test]
309 fn prefix_no_matches() {
310 let mut builder = TermDictBuilder::new();
311 builder.add("apple", 1);
312 builder.add("banana", 2);
313 let data = builder.finish();
314
315 let dict = TermDict::open(&data);
316 let results: Vec<_> = dict.prefix_iter("xyz");
317 assert!(results.is_empty());
318 }
319
320 #[test]
321 fn prefix_empty_yields_all() {
322 let mut builder = TermDictBuilder::new();
323 builder.add("a", 1);
324 builder.add("b", 2);
325 builder.add("c", 3);
326 let data = builder.finish();
327
328 let dict = TermDict::open(&data);
329 let results: Vec<_> = dict.prefix_iter("");
330 assert_eq!(results.len(), 3);
331 }
332
333 #[test]
334 fn prefix_single_char() {
335 let mut builder = TermDictBuilder::new();
336 builder.add("ax", 1);
337 builder.add("ay", 2);
338 builder.add("bx", 3);
339 let data = builder.finish();
340
341 let dict = TermDict::open(&data);
342 let results: Vec<_> = dict.prefix_iter("a");
343 assert_eq!(results.len(), 2);
344 assert_eq!(results[0].0, "ax");
345 assert_eq!(results[1].0, "ay");
346 }
347
348 #[test]
349 fn prefix_exact_term_match() {
350 let mut builder = TermDictBuilder::new();
351 builder.add("hello", 1);
352 builder.add("helloworld", 2);
353 builder.add("help", 3);
354 let data = builder.finish();
355
356 let dict = TermDict::open(&data);
357 let results: Vec<_> = dict.prefix_iter("hello");
358 assert_eq!(results.len(), 2);
359 assert_eq!(results[0].0, "hello");
360 assert_eq!(results[1].0, "helloworld");
361 }
362
363 #[test]
364 fn prefix_unicode() {
365 let mut builder = TermDictBuilder::new();
366 builder.add("cafe", 1);
367 builder.add("caf\u{00e9}", 2);
368 builder.add("car", 3);
369 let data = builder.finish();
370
371 let dict = TermDict::open(&data);
372 let results: Vec<_> = dict.prefix_iter("caf");
373 assert_eq!(results.len(), 2);
374 }
375
376 #[test]
377 fn prefix_on_empty_dict() {
378 let builder = TermDictBuilder::new();
379 let data = builder.finish();
380 let dict = TermDict::open(&data);
381 let results: Vec<_> = dict.prefix_iter("any");
382 assert!(results.is_empty());
383 }
384
385 #[test]
386 fn prefix_at_end_of_dict() {
387 let mut builder = TermDictBuilder::new();
388 builder.add("aaa", 1);
389 builder.add("zzz", 2);
390 let data = builder.finish();
391
392 let dict = TermDict::open(&data);
393 let results: Vec<_> = dict.prefix_iter("zz");
394 assert_eq!(results.len(), 1);
395 assert_eq!(results[0].0, "zzz");
396 }
397
398 #[test]
401 fn automaton_search_with_prefix() {
402 let mut builder = TermDictBuilder::new();
403 builder.add("apple", 1);
404 builder.add("application", 2);
405 builder.add("apply", 3);
406 builder.add("banana", 4);
407 builder.add("cherry", 5);
408 let data = builder.finish();
409
410 let dict = TermDict::open(&data);
411 let aut = fst::automaton::Str::new("app").starts_with();
413 let results = dict.automaton_search(aut);
414 assert_eq!(results.len(), 3);
415 assert_eq!(results[0].0, "apple");
416 assert_eq!(results[1].0, "application");
417 assert_eq!(results[2].0, "apply");
418 }
419
420 #[test]
421 fn automaton_search_exact_match() {
422 let mut builder = TermDictBuilder::new();
423 builder.add("apple", 1);
424 builder.add("banana", 2);
425 builder.add("cherry", 3);
426 let data = builder.finish();
427
428 let dict = TermDict::open(&data);
429 let aut = fst::automaton::Str::new("banana");
430 let results = dict.automaton_search(aut);
431 assert_eq!(results.len(), 1);
432 assert_eq!(results[0].0, "banana");
433 }
434}