use std::{collections::BTreeMap, io::Write};
use fst::{IntoStreamer, Map, MapBuilder, Streamer};
use crate::superfile::format::FST_SEPARATOR;
pub fn make_key(column_name: &str, term: &str) -> Vec<u8> {
let mut k = Vec::with_capacity(column_name.len() + 1 + term.len());
k.extend_from_slice(column_name.as_bytes());
k.push(FST_SEPARATOR);
k.extend_from_slice(term.as_bytes());
k
}
#[inline]
pub fn validate_column_name(column_name: &str) -> bool {
!column_name.as_bytes().contains(&FST_SEPARATOR)
}
#[derive(Debug, Default)]
pub struct DictBuilder {
sorted_buffer: BTreeMap<Vec<u8>, u64>,
}
impl DictBuilder {
pub fn new() -> Self {
Self::default()
}
pub fn insert(&mut self, key: &[u8], value: u64) {
self.sorted_buffer.insert(key.to_vec(), value);
}
pub fn len(&self) -> usize {
self.sorted_buffer.len()
}
pub fn is_empty(&self) -> bool {
self.sorted_buffer.is_empty()
}
pub fn finish(self) -> Vec<u8> {
let mut builder = MapBuilder::memory();
for (k, v) in self.sorted_buffer {
builder
.insert(&k, v)
.expect("BTreeMap guarantees sorted, unique keys");
}
builder
.into_inner()
.expect("in-memory FST writer cannot fail at finalize")
}
}
pub struct StreamingDictBuilder<W: Write> {
inner: MapBuilder<W>,
n_keys: u64,
}
impl<W: Write> StreamingDictBuilder<W> {
pub fn new(w: W) -> Result<Self, fst::Error> {
Ok(Self {
inner: MapBuilder::new(w)?,
n_keys: 0,
})
}
pub fn insert_sorted(&mut self, key: &[u8], value: u64) -> Result<(), fst::Error> {
self.inner.insert(key, value)?;
self.n_keys += 1;
Ok(())
}
#[inline]
pub fn n_keys(&self) -> u64 {
self.n_keys
}
pub fn finish(self) -> Result<W, fst::Error> {
self.inner.into_inner()
}
}
pub struct DictReader<'a> {
fst: Map<&'a [u8]>,
}
impl<'a> DictReader<'a> {
pub fn open(bytes: &'a [u8]) -> Result<Self, fst::Error> {
Ok(Self {
fst: Map::new(bytes)?,
})
}
#[inline]
pub fn lookup(&self, key: &[u8]) -> Option<u64> {
self.fst.get(key)
}
#[inline]
pub fn len(&self) -> usize {
self.fst.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.fst.is_empty()
}
pub fn iter_prefix(&self, prefix: &[u8]) -> Vec<(Vec<u8>, u64)> {
let mut out = Vec::new();
let mut stream = self.fst.range().ge(prefix).into_stream();
while let Some((key, value)) = stream.next() {
if !key.starts_with(prefix) {
break;
}
out.push((key.to_vec(), value));
}
out
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn make_key_encodes_with_unit_separator() {
assert_eq!(make_key("title", "rust"), b"title\x1Frust");
}
#[test]
fn make_key_handles_empty_term() {
assert_eq!(make_key("title", ""), b"title\x1F");
}
#[test]
fn make_key_handles_empty_column() {
assert_eq!(make_key("", "rust"), b"\x1Frust");
}
#[test]
fn make_key_handles_both_empty() {
assert_eq!(make_key("", ""), b"\x1F");
}
#[test]
fn make_key_preserves_term_bytes() {
let key = make_key("body", "café");
assert_eq!(&key[0..4], b"body");
assert_eq!(key[4], FST_SEPARATOR);
assert_eq!(&key[5..], "café".as_bytes());
}
#[test]
fn make_key_capacity_avoids_realloc() {
let key = make_key("title", "rust");
assert_eq!(key.len(), key.capacity());
}
#[test]
fn validate_column_name_rejects_separator_byte() {
let bad = "ti\x1Ftle";
assert!(!validate_column_name(bad));
}
#[test]
fn validate_column_name_accepts_normal_names() {
for name in [
"title",
"body",
"headline",
"field_1",
"field-2",
"MyField",
"snake_case",
"camelCase",
"PascalCase",
"with spaces",
"with.dots",
] {
assert!(validate_column_name(name), "expected {name:?} to be valid");
}
}
#[test]
fn validate_column_name_accepts_empty_string() {
assert!(validate_column_name(""));
}
#[test]
fn validate_column_name_accepts_high_unicode() {
assert!(validate_column_name("café"));
assert!(validate_column_name("日本語"));
}
#[test]
fn lookup_roundtrip_basic() {
let mut b = DictBuilder::new();
b.insert(&make_key("title", "rust"), 100);
b.insert(&make_key("title", "async"), 200);
b.insert(&make_key("body", "tokio"), 300);
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
assert_eq!(r.lookup(&make_key("title", "rust")), Some(100));
assert_eq!(r.lookup(&make_key("title", "async")), Some(200));
assert_eq!(r.lookup(&make_key("body", "tokio")), Some(300));
assert_eq!(r.len(), 3);
}
#[test]
fn lookup_missing_returns_none() {
let mut b = DictBuilder::new();
b.insert(&make_key("title", "rust"), 1);
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
assert_eq!(r.lookup(&make_key("title", "java")), None);
assert_eq!(r.lookup(&make_key("body", "rust")), None);
assert_eq!(r.lookup(b""), None);
}
#[test]
fn out_of_order_inserts_work() {
let mut b = DictBuilder::new();
let keys = ["z", "a", "m", "b", "y", "c"];
for (i, k) in keys.iter().enumerate() {
b.insert(&make_key("col", k), i as u64);
}
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
for (i, k) in keys.iter().enumerate() {
assert_eq!(r.lookup(&make_key("col", k)), Some(i as u64));
}
}
#[test]
fn duplicate_inserts_keep_last_value() {
let mut b = DictBuilder::new();
b.insert(&make_key("col", "key"), 1);
b.insert(&make_key("col", "key"), 2);
b.insert(&make_key("col", "key"), 999);
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
assert_eq!(r.lookup(&make_key("col", "key")), Some(999));
assert_eq!(r.len(), 1);
}
#[test]
fn iter_prefix_returns_matching_keys_only() {
let mut b = DictBuilder::new();
b.insert(&make_key("title", "alpha"), 1);
b.insert(&make_key("title", "beta"), 2);
b.insert(&make_key("title", "gamma"), 3);
b.insert(&make_key("body", "alpha"), 10);
b.insert(&make_key("body", "delta"), 20);
b.insert(&make_key("tag", "epsilon"), 100);
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
let title_terms = r.iter_prefix(b"title\x1F");
let title_only: Vec<&[u8]> = title_terms.iter().map(|(k, _)| k.as_slice()).collect();
assert_eq!(
title_only,
vec![
b"title\x1Falpha".as_slice(),
b"title\x1Fbeta".as_slice(),
b"title\x1Fgamma".as_slice(),
],
"iter_prefix on title\\x1F returns only title-prefixed keys"
);
assert_eq!(
title_terms.iter().map(|(_, v)| *v).collect::<Vec<_>>(),
vec![1, 2, 3]
);
let body_terms = r.iter_prefix(b"body\x1F");
assert_eq!(body_terms.len(), 2);
let tag_terms = r.iter_prefix(b"tag\x1F");
assert_eq!(tag_terms.len(), 1);
}
#[test]
fn iter_prefix_lexicographic_order() {
let mut b = DictBuilder::new();
let terms = ["zebra", "ant", "monkey", "apple", "banana", "aardvark"];
for (i, t) in terms.iter().enumerate() {
b.insert(&make_key("col", t), i as u64);
}
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
let listed: Vec<Vec<u8>> = r
.iter_prefix(b"col\x1F")
.into_iter()
.map(|(k, _)| k[4..].to_vec()) .collect();
let mut expected: Vec<&str> = terms.to_vec();
expected.sort();
let expected: Vec<Vec<u8>> = expected.iter().map(|s| s.as_bytes().to_vec()).collect();
assert_eq!(listed, expected);
}
#[test]
fn iter_prefix_no_match_returns_empty() {
let mut b = DictBuilder::new();
b.insert(&make_key("title", "rust"), 1);
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
assert_eq!(r.iter_prefix(b"missing\x1F"), Vec::new());
assert_eq!(r.iter_prefix(b"").len(), 1);
}
#[test]
fn iter_prefix_stops_at_first_non_match() {
let mut b = DictBuilder::new();
for i in 0..1000 {
b.insert(&make_key("body", &format!("term{i:04}")), i as u64);
}
b.insert(&make_key("title", "rust"), 9999);
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
let title = r.iter_prefix(b"title\x1F");
assert_eq!(title.len(), 1);
assert_eq!(title[0].1, 9999);
}
#[test]
fn empty_builder_produces_valid_empty_fst() {
let bytes = DictBuilder::new().finish();
let r = DictReader::open(&bytes).expect("open DictReader");
assert!(r.is_empty());
assert_eq!(r.len(), 0);
assert_eq!(r.lookup(b"anything"), None);
assert_eq!(r.iter_prefix(b""), Vec::new());
assert_eq!(r.iter_prefix(b"prefix"), Vec::new());
}
#[test]
fn handles_thousand_keys_roundtrip() {
let mut b = DictBuilder::new();
for i in 0..1000 {
b.insert(&make_key("body", &format!("term{i:04}")), i as u64);
}
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
assert_eq!(r.len(), 1000);
for i in 0..1000 {
let k = make_key("body", &format!("term{i:04}"));
assert_eq!(r.lookup(&k), Some(i as u64));
}
let listed = r.iter_prefix(b"body\x1F");
assert_eq!(listed.len(), 1000);
for w in listed.windows(2) {
assert!(w[0].0 < w[1].0, "prefix iter not in lex order");
}
}
#[test]
fn serialization_is_deterministic() {
let mut b1 = DictBuilder::new();
b1.insert(&make_key("a", "x"), 1);
b1.insert(&make_key("b", "y"), 2);
b1.insert(&make_key("c", "z"), 3);
let bytes1 = b1.finish();
let mut b2 = DictBuilder::new();
b2.insert(&make_key("c", "z"), 3);
b2.insert(&make_key("a", "x"), 1);
b2.insert(&make_key("b", "y"), 2);
let bytes2 = b2.finish();
assert_eq!(bytes1, bytes2);
}
#[test]
fn keys_with_same_prefix_share_state() {
let mut b = DictBuilder::new();
for i in 0..100 {
b.insert(
&make_key("very_long_column_name", &format!("term{i}")),
i as u64,
);
}
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
assert_eq!(r.len(), 100);
for i in 0..100 {
let k = make_key("very_long_column_name", &format!("term{i}"));
assert_eq!(r.lookup(&k), Some(i as u64));
}
}
#[test]
fn lookup_distinguishes_columns_with_same_term() {
let mut b = DictBuilder::new();
b.insert(&make_key("title", "rust"), 1);
b.insert(&make_key("body", "rust"), 2);
b.insert(&make_key("tag", "rust"), 3);
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open DictReader");
assert_eq!(r.lookup(&make_key("title", "rust")), Some(1));
assert_eq!(r.lookup(&make_key("body", "rust")), Some(2));
assert_eq!(r.lookup(&make_key("tag", "rust")), Some(3));
}
#[test]
fn builder_and_reader_track_key_counts() {
let mut b = DictBuilder::new();
assert!(b.is_empty(), "fresh builder is empty");
assert_eq!(b.len(), 0);
b.insert(b"alpha", 1);
b.insert(b"beta", 2);
assert!(!b.is_empty());
assert_eq!(b.len(), 2);
let bytes = b.finish();
let r = DictReader::open(&bytes).expect("open dict");
assert_eq!(r.len(), 2);
assert!(!r.is_empty());
let mut sb = StreamingDictBuilder::new(Vec::new()).expect("streaming builder");
assert_eq!(sb.n_keys(), 0);
sb.insert_sorted(b"a", 1).expect("sorted insert");
sb.insert_sorted(b"b", 2).expect("sorted insert");
assert_eq!(sb.n_keys(), 2);
let _ = sb.finish().expect("finish streaming builder");
}
}