pub struct TermDictBuilder {
fst_builder: fst::MapBuilder<Vec<u8>>,
}
impl TermDictBuilder {
pub fn new() -> Self {
Self {
fst_builder: fst::MapBuilder::memory(),
}
}
pub fn add(&mut self, term: &str, offset: u64) {
self.fst_builder
.insert(term.as_bytes(), offset)
.expect("terms must be added in sorted order");
}
pub fn finish(self) -> Vec<u8> {
self.fst_builder
.into_inner()
.expect("FST builder finish failed")
}
}
impl Default for TermDictBuilder {
fn default() -> Self {
Self::new()
}
}
pub struct TermDict<'a> {
fst: fst::Map<&'a [u8]>,
}
impl<'a> TermDict<'a> {
pub fn open(data: &'a [u8]) -> Self {
let fst = if data.is_empty() {
let builder = fst::MapBuilder::memory();
let bytes = builder.into_inner().unwrap();
let leaked: &'a [u8] = Box::leak(bytes.into_boxed_slice());
fst::Map::new(leaked).unwrap()
} else {
fst::Map::new(data).expect("invalid FST data in term dictionary")
};
Self { fst }
}
pub fn len(&self) -> u32 {
self.fst.len() as u32
}
pub fn is_empty(&self) -> bool {
self.fst.is_empty()
}
pub fn get(&self, term: &str) -> Option<u64> {
self.fst.get(term.as_bytes())
}
pub fn prefix_iter(&self, prefix: &str) -> Vec<(String, u64)> {
collect_prefix(&self.fst, prefix)
}
pub fn automaton_search<A: fst::Automaton>(&self, aut: A) -> Vec<(String, u64)> {
collect_automaton(&self.fst, aut)
}
}
fn collect_prefix(fst: &fst::Map<&[u8]>, prefix: &str) -> Vec<(String, u64)> {
use fst::IntoStreamer;
use fst::Streamer;
let mut stream = fst.range().ge(prefix.as_bytes()).into_stream();
let mut results = Vec::new();
while let Some((key, value)) = stream.next() {
let term = std::str::from_utf8(key).expect("invalid UTF-8 in term dictionary");
if !term.starts_with(prefix) {
break;
}
results.push((term.to_string(), value));
}
results
}
fn collect_automaton<A: fst::Automaton>(fst: &fst::Map<&[u8]>, aut: A) -> Vec<(String, u64)> {
use fst::IntoStreamer;
use fst::Streamer;
let mut stream = fst.search(aut).into_stream();
let mut results = Vec::new();
while let Some((key, value)) = stream.next() {
let term = std::str::from_utf8(key).expect("invalid UTF-8 in term dictionary");
results.push((term.to_string(), value));
}
results
}
#[cfg(test)]
mod tests {
use super::*;
use fst::Automaton;
#[test]
fn empty_dict() {
let builder = TermDictBuilder::new();
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.len(), 0);
assert!(dict.is_empty());
assert_eq!(dict.get("anything"), None);
}
#[test]
fn single_term() {
let mut builder = TermDictBuilder::new();
builder.add("hello", 42);
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.len(), 1);
assert_eq!(dict.get("hello"), Some(42));
assert_eq!(dict.get("world"), None);
}
#[test]
fn multiple_terms() {
let mut builder = TermDictBuilder::new();
builder.add("apple", 100);
builder.add("banana", 200);
builder.add("cherry", 300);
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.len(), 3);
assert_eq!(dict.get("apple"), Some(100));
assert_eq!(dict.get("banana"), Some(200));
assert_eq!(dict.get("cherry"), Some(300));
}
#[test]
fn lookup_miss() {
let mut builder = TermDictBuilder::new();
builder.add("apple", 100);
builder.add("cherry", 300);
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.get("banana"), None);
assert_eq!(dict.get("aaa"), None);
assert_eq!(dict.get("zzz"), None);
}
#[test]
fn binary_search_first_term() {
let mut builder = TermDictBuilder::new();
builder.add("aaa", 1);
builder.add("bbb", 2);
builder.add("ccc", 3);
builder.add("ddd", 4);
builder.add("eee", 5);
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.get("aaa"), Some(1));
}
#[test]
fn binary_search_last_term() {
let mut builder = TermDictBuilder::new();
builder.add("aaa", 1);
builder.add("bbb", 2);
builder.add("ccc", 3);
builder.add("ddd", 4);
builder.add("eee", 5);
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.get("eee"), Some(5));
}
#[test]
fn binary_search_middle_term() {
let mut builder = TermDictBuilder::new();
builder.add("aaa", 1);
builder.add("bbb", 2);
builder.add("ccc", 3);
builder.add("ddd", 4);
builder.add("eee", 5);
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.get("ccc"), Some(3));
}
#[test]
fn unicode_terms() {
let mut builder = TermDictBuilder::new();
builder.add("cafe", 1);
builder.add("caf\u{00e9}", 2);
builder.add("na\u{00ef}ve", 3);
builder.add("\u{00fc}ber", 4);
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.get("cafe"), Some(1));
assert_eq!(dict.get("caf\u{00e9}"), Some(2));
assert_eq!(dict.get("na\u{00ef}ve"), Some(3));
assert_eq!(dict.get("\u{00fc}ber"), Some(4));
}
#[test]
fn large_dict() {
let count = 10_000;
let mut builder = TermDictBuilder::new();
let mut terms: Vec<String> = (0..count).map(|i| format!("term_{i:06}")).collect();
terms.sort();
for (i, term) in terms.iter().enumerate() {
builder.add(term, i as u64);
}
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.len(), count as u32);
for (i, term) in terms.iter().enumerate() {
assert_eq!(dict.get(term), Some(i as u64));
}
}
#[test]
fn large_offsets() {
let mut builder = TermDictBuilder::new();
builder.add("a", u64::MAX - 1);
builder.add("b", u64::MAX);
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.get("a"), Some(u64::MAX - 1));
assert_eq!(dict.get("b"), Some(u64::MAX));
}
#[test]
fn empty_string_term() {
let mut builder = TermDictBuilder::new();
builder.add("", 0);
builder.add("a", 1);
let data = builder.finish();
let dict = TermDict::open(&data);
assert_eq!(dict.get(""), Some(0));
assert_eq!(dict.get("a"), Some(1));
}
#[test]
fn prefix_multiple_matches() {
let mut builder = TermDictBuilder::new();
builder.add("apple", 1);
builder.add("application", 2);
builder.add("apply", 3);
builder.add("banana", 4);
builder.add("cherry", 5);
let data = builder.finish();
let dict = TermDict::open(&data);
let results: Vec<_> = dict.prefix_iter("app");
assert_eq!(results.len(), 3);
assert_eq!(results[0], ("apple".to_string(), 1));
assert_eq!(results[1], ("application".to_string(), 2));
assert_eq!(results[2], ("apply".to_string(), 3));
}
#[test]
fn prefix_no_matches() {
let mut builder = TermDictBuilder::new();
builder.add("apple", 1);
builder.add("banana", 2);
let data = builder.finish();
let dict = TermDict::open(&data);
let results: Vec<_> = dict.prefix_iter("xyz");
assert!(results.is_empty());
}
#[test]
fn prefix_empty_yields_all() {
let mut builder = TermDictBuilder::new();
builder.add("a", 1);
builder.add("b", 2);
builder.add("c", 3);
let data = builder.finish();
let dict = TermDict::open(&data);
let results: Vec<_> = dict.prefix_iter("");
assert_eq!(results.len(), 3);
}
#[test]
fn prefix_single_char() {
let mut builder = TermDictBuilder::new();
builder.add("ax", 1);
builder.add("ay", 2);
builder.add("bx", 3);
let data = builder.finish();
let dict = TermDict::open(&data);
let results: Vec<_> = dict.prefix_iter("a");
assert_eq!(results.len(), 2);
assert_eq!(results[0].0, "ax");
assert_eq!(results[1].0, "ay");
}
#[test]
fn prefix_exact_term_match() {
let mut builder = TermDictBuilder::new();
builder.add("hello", 1);
builder.add("helloworld", 2);
builder.add("help", 3);
let data = builder.finish();
let dict = TermDict::open(&data);
let results: Vec<_> = dict.prefix_iter("hello");
assert_eq!(results.len(), 2);
assert_eq!(results[0].0, "hello");
assert_eq!(results[1].0, "helloworld");
}
#[test]
fn prefix_unicode() {
let mut builder = TermDictBuilder::new();
builder.add("cafe", 1);
builder.add("caf\u{00e9}", 2);
builder.add("car", 3);
let data = builder.finish();
let dict = TermDict::open(&data);
let results: Vec<_> = dict.prefix_iter("caf");
assert_eq!(results.len(), 2);
}
#[test]
fn prefix_on_empty_dict() {
let builder = TermDictBuilder::new();
let data = builder.finish();
let dict = TermDict::open(&data);
let results: Vec<_> = dict.prefix_iter("any");
assert!(results.is_empty());
}
#[test]
fn prefix_at_end_of_dict() {
let mut builder = TermDictBuilder::new();
builder.add("aaa", 1);
builder.add("zzz", 2);
let data = builder.finish();
let dict = TermDict::open(&data);
let results: Vec<_> = dict.prefix_iter("zz");
assert_eq!(results.len(), 1);
assert_eq!(results[0].0, "zzz");
}
#[test]
fn automaton_search_with_prefix() {
let mut builder = TermDictBuilder::new();
builder.add("apple", 1);
builder.add("application", 2);
builder.add("apply", 3);
builder.add("banana", 4);
builder.add("cherry", 5);
let data = builder.finish();
let dict = TermDict::open(&data);
let aut = fst::automaton::Str::new("app").starts_with();
let results = dict.automaton_search(aut);
assert_eq!(results.len(), 3);
assert_eq!(results[0].0, "apple");
assert_eq!(results[1].0, "application");
assert_eq!(results[2].0, "apply");
}
#[test]
fn automaton_search_exact_match() {
let mut builder = TermDictBuilder::new();
builder.add("apple", 1);
builder.add("banana", 2);
builder.add("cherry", 3);
let data = builder.finish();
let dict = TermDict::open(&data);
let aut = fst::automaton::Str::new("banana");
let results = dict.automaton_search(aut);
assert_eq!(results.len(), 1);
assert_eq!(results[0].0, "banana");
}
}