Skip to main content

inputx_fsa/
lib.rs

1//! `inputx-fsa` — a compact, zero-dependency ordered map from byte keys to
2//! `u64`, backed by a **minimal acyclic finite-state automaton** (MA-FSA /
3//! DAWG) plus a parallel value table addressed by lexicographic rank.
4//!
5//! This is a clean-room implementation written for the Inputx IME to replace
6//! the `fst` crate on the shipped read path — same role (compressed,
7//! mmap-friendly dictionary index with `get` + prefix/range scan), no
8//! external dependencies. The algorithms (Revuz minimization, MA-FSA as a
9//! minimal perfect hash via per-transition right-language counts) are
10//! textbook; the code is original.
11//!
12//! # Model
13//!
14//! The automaton recognizes the *set* of keys (it carries no values inside
15//! it — that keeps it minimal regardless of the values). Each accepted key
16//! has a stable **ordinal** = its rank in sorted order, computed during the
17//! walk by summing, at every step, the number of keys that sort before the
18//! branch we take. Values live in a separate fixed-width array indexed by
19//! that ordinal. This is the "numbered MA-FSA / minimal perfect hash"
20//! construction.
21//!
22//! # Format (v3, little-endian)
23//!
24//! ```text
25//! magic "IXFA" (4) · version u8(=3) · value_width u8 (1|2|4|8)
26//! value_count u32 · root_off u32 · state_count u32        (18-byte header)
27//! states:  byte-offset addressed (no offset table), post-order so every
28//!          target precedes its source. Per state, a flags byte:
29//!            bit0 = final, bit1 = single-transition.
30//!          · single  → [flags, label u8, delta uvarint]   (count omitted)
31//!          · multi   → [flags, n uvarint, (label u8, delta uvarint,
32//!                       count uvarint) × n]
33//!          `delta` is a back-distance to the target state; `count` is the
34//!          target's right-language size (for the ordinal/rank walk).
35//! values:  [value_width bytes; value_count]   (tail of the buffer)
36//! ```
37//!
38//! # Example
39//!
40//! ```
41//! use inputx_fsa::{Builder, Fsa};
42//!
43//! let mut b = Builder::new();
44//! b.insert(b"apple", 1);
45//! b.insert(b"apply", 2);
46//! b.insert(b"banana", 3);
47//! let fsa = Fsa::new(b.finish()).unwrap();
48//!
49//! assert_eq!(fsa.get(b"apple"), Some(1));
50//! assert_eq!(fsa.get(b"grape"), None);
51//!
52//! // lazy, composable prefix scan
53//! let app: Vec<_> = fsa.range(b"app").collect();
54//! assert_eq!(app, vec![(b"apple".to_vec(), 1), (b"apply".to_vec(), 2)]);
55//! ```
56
57#![forbid(unsafe_code)]
58#![cfg_attr(not(feature = "std"), no_std)]
59
60extern crate alloc;
61
62mod builder;
63mod dict;
64mod reader;
65
66pub use builder::Builder;
67pub use dict::{Dict, DictBuilder};
68pub use reader::{Fsa, FsaError, FsaIter};
69
70#[cfg(test)]
71mod tests {
72    use super::*;
73    use std::collections::BTreeMap;
74
75    /// Build an Fsa from pairs and return the serialized bytes.
76    fn build(pairs: &[(&[u8], u64)]) -> Vec<u8> {
77        let mut b = Builder::new();
78        for &(k, v) in pairs {
79            b.insert(k, v);
80        }
81        b.finish()
82    }
83
84    #[test]
85    fn empty() {
86        let bytes = build(&[]);
87        let fsa = Fsa::new(bytes).unwrap();
88        assert_eq!(fsa.get(b"anything"), None);
89        assert_eq!(fsa.len(), 0);
90    }
91
92    #[test]
93    fn basic_get() {
94        let bytes = build(&[(b"a", 1), (b"ab", 2), (b"ac", 3), (b"b", 4)]);
95        let fsa = Fsa::new(bytes).unwrap();
96        assert_eq!(fsa.get(b"a"), Some(1));
97        assert_eq!(fsa.get(b"ab"), Some(2));
98        assert_eq!(fsa.get(b"ac"), Some(3));
99        assert_eq!(fsa.get(b"b"), Some(4));
100        assert_eq!(fsa.get(b"c"), None);
101        assert_eq!(fsa.get(b"abc"), None);
102        assert_eq!(fsa.get(b""), None);
103        assert_eq!(fsa.len(), 4);
104    }
105
106    #[test]
107    fn empty_key_member() {
108        let bytes = build(&[(b"", 7), (b"x", 9)]);
109        let fsa = Fsa::new(bytes).unwrap();
110        assert_eq!(fsa.get(b""), Some(7));
111        assert_eq!(fsa.get(b"x"), Some(9));
112    }
113
114    #[test]
115    fn shared_suffix_minimization() {
116        // "ation" suffix shared → DAWG should merge those states. We can't
117        // easily assert state count here, but correctness must hold.
118        let bytes = build(&[
119            (b"nation", 1),
120            (b"ration", 2),
121            (b"station", 3),
122        ]);
123        let fsa = Fsa::new(bytes).unwrap();
124        assert_eq!(fsa.get(b"nation"), Some(1));
125        assert_eq!(fsa.get(b"ration"), Some(2));
126        assert_eq!(fsa.get(b"station"), Some(3));
127        assert_eq!(fsa.get(b"ation"), None);
128    }
129
130    #[test]
131    fn lazy_iter() {
132        let bytes = build(&[(b"a", 1), (b"ab", 2), (b"ac", 3), (b"b", 4)]);
133        let fsa = Fsa::new(bytes).unwrap();
134        // lazy iter == eager prefix(b"")
135        assert_eq!(fsa.iter().collect::<Vec<_>>(), fsa.prefix(b""));
136        // range == prefix; early termination via take
137        assert_eq!(fsa.range(b"a").collect::<Vec<_>>(), fsa.prefix(b"a"));
138        assert_eq!(
139            fsa.range(b"a").take(2).collect::<Vec<_>>(),
140            vec![(b"a".to_vec(), 1), (b"ab".to_vec(), 2)]
141        );
142        // composes + stops early without walking the rest
143        assert_eq!(fsa.iter().find(|(k, _)| k == b"ac"), Some((b"ac".to_vec(), 3)));
144        assert_eq!(fsa.range(b"z").next(), None);
145    }
146
147    #[test]
148    fn prefix_scan() {
149        let bytes = build(&[(b"a", 1), (b"ab", 2), (b"ac", 3), (b"b", 4)]);
150        let fsa = Fsa::new(bytes).unwrap();
151        let got: Vec<_> = fsa.prefix(b"a");
152        assert_eq!(
153            got,
154            vec![
155                (b"a".to_vec(), 1),
156                (b"ab".to_vec(), 2),
157                (b"ac".to_vec(), 3)
158            ]
159        );
160        assert_eq!(fsa.prefix(b"b"), vec![(b"b".to_vec(), 4)]);
161        assert_eq!(fsa.prefix(b"z"), Vec::<(Vec<u8>, u64)>::new());
162        assert_eq!(fsa.prefix(b"").len(), 4); // empty prefix → all
163    }
164
165    #[test]
166    fn value_widths() {
167        // Force each width tier and verify round-trip.
168        for &maxv in &[0xFFu64, 0xFFFF, 0xFFFF_FFFF, 0xFFFF_FFFF_FF] {
169            let bytes = build(&[(b"lo", 0), (b"hi", maxv)]);
170            let fsa = Fsa::new(bytes).unwrap();
171            assert_eq!(fsa.get(b"hi"), Some(maxv));
172            assert_eq!(fsa.get(b"lo"), Some(0));
173        }
174    }
175
176    #[test]
177    fn duplicate_key_last_wins() {
178        let bytes = build(&[(b"k", 1), (b"k", 2), (b"k", 3)]);
179        let fsa = Fsa::new(bytes).unwrap();
180        assert_eq!(fsa.get(b"k"), Some(3));
181        assert_eq!(fsa.len(), 1);
182    }
183
184    // ─── Differential testing against a BTreeMap oracle (zero-dep) ────────
185    //
186    // The oracle is trivially correct; the FSA must match it for `get` and
187    // `prefix` across every key + many random probes. This is what makes the
188    // clean-room build *provably* correct without leaning on `fst`.
189
190
191    /// Perf regression gate (run in release): `get` and a prefix scan must
192    /// stay well under budget. Generous thresholds (~4x measured) so it
193    /// flags real regressions, not scheduling jitter. Ignored by default;
194    /// run: `cargo test -p inputx-fsa --release perfgate -- --ignored --nocapture`
195    #[test]
196    #[ignore]
197    fn perfgate_get_prefix_under_budget() {
198        let mut b = DictBuilder::new();
199        for i in 0..20_000u32 {
200            let code = format!("code{i:05}");
201            for j in 0..5u32 {
202                b.insert(
203                    code.as_bytes(),
204                    format!("word{i}_{j}").as_bytes(),
205                    u64::from(i * 10 + j),
206                );
207            }
208        }
209        let dict = Dict::new(b.finish()).unwrap();
210        let codes: Vec<String> = (0..1000).map(|i| format!("code{:05}", (i * 19) % 20_000)).collect();
211
212        let iters = 50;
213        let t = std::time::Instant::now();
214        for _ in 0..iters {
215            for c in &codes {
216                std::hint::black_box(dict.get(c.as_bytes()));
217            }
218        }
219        let per_get = t.elapsed().as_nanos() as f64 / (iters * codes.len()) as f64;
220        eprintln!("[perfgate] get = {per_get:.0} ns/op");
221        assert!(per_get < 20_000.0, "get regressed: {per_get:.0} ns/op (budget 20µs)");
222
223        let t = std::time::Instant::now();
224        for _ in 0..iters {
225            let mut n = 0u64;
226            dict.prefix_for_each(b"code0", |_, _, v| n = n.wrapping_add(v));
227            std::hint::black_box(n);
228        }
229        let per_pref = t.elapsed().as_nanos() as f64 / iters as f64;
230        eprintln!("[perfgate] prefix(code0* = 5000 items) = {per_pref:.0} ns/op");
231        assert!(per_pref < 5_000_000.0, "prefix regressed: {per_pref:.0} ns (budget 5ms)");
232    }
233
234    use proptest::prelude::*;
235
236    fn key_strategy() -> impl Strategy<Value = Vec<u8>> {
237        proptest::collection::vec(b'a'..=b'e', 0..6)
238    }
239
240    proptest! {
241        #![proptest_config(ProptestConfig { cases: 200, ..ProptestConfig::default() })]
242
243        #[test]
244        fn diff_get_and_prefix(
245            entries in proptest::collection::vec((key_strategy(), any::<u64>()), 0..64),
246            probes in proptest::collection::vec(key_strategy(), 0..32),
247        ) {
248            // Oracle (last-write-wins on dup, matching Builder).
249            let mut oracle: BTreeMap<Vec<u8>, u64> = BTreeMap::new();
250            for (k, v) in &entries {
251                oracle.insert(k.clone(), *v);
252            }
253
254            let mut b = Builder::new();
255            for (k, v) in &entries {
256                b.insert(k, *v);
257            }
258            let fsa = Fsa::new(b.finish()).unwrap();
259
260            prop_assert_eq!(fsa.len(), oracle.len() as u64);
261
262            // lazy iter yields exactly the eager prefix(b"") sequence.
263            prop_assert_eq!(fsa.iter().collect::<Vec<_>>(), fsa.prefix(b""));
264
265            // get matches on every oracle key + random probes.
266            for (k, v) in &oracle {
267                prop_assert_eq!(fsa.get(k), Some(*v), "get({:?})", k);
268            }
269            for p in &probes {
270                prop_assert_eq!(fsa.get(p), oracle.get(p).copied(), "get probe {:?}", p);
271            }
272
273            // prefix matches the oracle's sorted range for every probe prefix.
274            for p in &probes {
275                let want: Vec<(Vec<u8>, u64)> = oracle
276                    .iter()
277                    .filter(|(k, _)| k.starts_with(p))
278                    .map(|(k, v)| (k.clone(), *v))
279                    .collect();
280                prop_assert_eq!(fsa.prefix(p), want, "prefix {:?}", p);
281            }
282        }
283    }
284
285    // ─── Robustness + wide-byte coverage ─────────────────────────────────
286
287    #[test]
288    fn rejects_bad_buffers() {
289        assert!(matches!(Fsa::new(&b""[..]), Err(FsaError::Truncated)));
290        assert!(matches!(
291            Fsa::new(&b"XXXX..............."[..]),
292            Err(FsaError::BadMagic)
293        ));
294        let mut bad = b"IXFA".to_vec();
295        bad.push(99); // version
296        bad.extend(std::iter::repeat_n(0u8, 20));
297        assert!(matches!(
298            Fsa::new(bad.as_slice()),
299            Err(FsaError::BadVersion(99))
300        ));
301    }
302
303    #[test]
304    fn keys_with_zero_and_high_bytes() {
305        // Keys are arbitrary bytes — 0x00 / 0xFF carry no special meaning.
306        let bytes = build(&[
307            (b"\x00", 1),
308            (b"\x00\xff", 2),
309            (b"\xff", 3),
310            (b"a\x00b", 4),
311        ]);
312        let fsa = Fsa::new(bytes).unwrap();
313        assert_eq!(fsa.get(b"\x00"), Some(1));
314        assert_eq!(fsa.get(b"\x00\xff"), Some(2));
315        assert_eq!(fsa.get(b"\xff"), Some(3));
316        assert_eq!(fsa.get(b"a\x00b"), Some(4));
317        assert_eq!(fsa.get(b"\x00\x00"), None);
318        assert_eq!(fsa.prefix(b"\x00").len(), 2);
319    }
320
321    #[test]
322    fn wide_alphabet_single_state() {
323        // A root with all 256 labels exercises the u16 n_trans path.
324        let pairs: Vec<(Vec<u8>, u64)> = (0u16..256).map(|b| (vec![b as u8], u64::from(b))).collect();
325        let mut bld = Builder::new();
326        for (k, v) in &pairs {
327            bld.insert(k, *v);
328        }
329        let fsa = Fsa::new(bld.finish()).unwrap();
330        assert_eq!(fsa.len(), 256);
331        for (k, v) in &pairs {
332            assert_eq!(fsa.get(k), Some(*v));
333        }
334    }
335
336    fn wide_key() -> impl Strategy<Value = Vec<u8>> {
337        proptest::collection::vec(any::<u8>(), 0..5)
338    }
339
340    proptest! {
341        #![proptest_config(ProptestConfig { cases: 200, ..ProptestConfig::default() })]
342
343        /// Full-byte-range keys (0x00 / 0xFF included) — get + prefix match oracle.
344        #[test]
345        fn diff_wide_bytes(
346            entries in proptest::collection::vec((wide_key(), any::<u64>()), 0..48),
347            probes in proptest::collection::vec(wide_key(), 0..24),
348        ) {
349            let mut oracle: BTreeMap<Vec<u8>, u64> = BTreeMap::new();
350            for (k, v) in &entries { oracle.insert(k.clone(), *v); }
351            let mut b = Builder::new();
352            for (k, v) in &entries { b.insert(k, *v); }
353            let fsa = Fsa::new(b.finish()).unwrap();
354            for (k, v) in &oracle { prop_assert_eq!(fsa.get(k), Some(*v)); }
355            for p in &probes {
356                let want: Vec<(Vec<u8>, u64)> = oracle.iter()
357                    .filter(|(k, _)| k.starts_with(p))
358                    .map(|(k, v)| (k.clone(), *v)).collect();
359                prop_assert_eq!(fsa.prefix(p), want);
360            }
361        }
362
363        /// `Fsa::new` never panics on arbitrary input — Ok or Err, never crash.
364        #[test]
365        fn new_no_panic(bytes in proptest::collection::vec(any::<u8>(), 0..128)) {
366            let _ = Fsa::new(bytes.as_slice());
367        }
368
369        /// Corrupt-buffer robustness: take a VALID Fsa/Dict, flip arbitrary
370        /// bytes (header or body), then new + get + prefix must never panic —
371        /// at worst Err on new, or graceful empty/None results. This is the
372        /// untrusted-input contract for the published crate.
373        #[test]
374        fn fuzz_mutated_buffer_no_panic(
375            muts in proptest::collection::vec((any::<usize>(), any::<u8>()), 0..40),
376            probe in proptest::collection::vec(any::<u8>(), 0..6),
377        ) {
378            // ── Fsa ──
379            let mut fb = Builder::new();
380            for i in 0..60u32 {
381                fb.insert(format!("key{i:03}").as_bytes(), u64::from(i) * 7);
382            }
383            let mut bytes = fb.finish();
384            for (idx, val) in &muts {
385                if !bytes.is_empty() {
386                    let i = idx % bytes.len();
387                    bytes[i] = *val;
388                }
389            }
390            if let Ok(fsa) = Fsa::new(bytes.as_slice()) {
391                let _ = fsa.get(&probe);
392                let _ = fsa.contains_prefix(&probe);
393                let _ = fsa.prefix(&probe); // walk + visit_subtree
394            }
395
396            // ── Dict ──
397            let mut db = DictBuilder::new();
398            for i in 0..60u32 {
399                db.insert(format!("c{}", i % 12).as_bytes(), format!("item{i}").as_bytes(), u64::from(i));
400            }
401            let mut dbytes = db.finish();
402            for (idx, val) in &muts {
403                if !dbytes.is_empty() {
404                    let i = idx % dbytes.len();
405                    dbytes[i] = *val;
406                }
407            }
408            if let Ok(dict) = Dict::new(dbytes.as_slice()) {
409                let _ = dict.get(&probe);
410                let _ = dict.prefix(&probe);
411                dict.get_for_each(&probe, |_, _| {});
412            }
413        }
414    }
415}