Skip to main content

inputx_fsa/
builder.rs

1//! Build side: collect (key, value) pairs → minimal acyclic FSA + value
2//! table → serialized bytes. Build-time only (not perf-critical), so it
3//! favors a simple, verifiable two-phase construction:
4//!
5//!   1. Insert all keys into a plain trie.
6//!   2. Minimize bottom-up by hash-consing equivalent sub-automata
7//!      (Revuz's algorithm) — two states are equal iff same finality and
8//!      same (label → canonical-target) transition set.
9//!
10//! Values are kept out of the automaton (it stays a pure key recognizer)
11//! and emitted as a fixed-width array indexed by each key's sorted rank.
12
13use alloc::vec;
14use alloc::vec::Vec;
15
16// Hash-cons register for state minimization. HashMap (O(1)) when `std` is
17// on; a BTreeMap keeps the builder available in no_std builds.
18#[cfg(feature = "std")]
19use std::collections::HashMap as Register;
20#[cfg(not(feature = "std"))]
21use alloc::collections::BTreeMap as Register;
22
23/// Accumulates (key, value) pairs and serializes a minimal FSA.
24#[derive(Default)]
25pub struct Builder {
26    pairs: Vec<(Vec<u8>, u64)>,
27}
28
29impl Builder {
30    /// Construct an empty builder. Call [`insert`](Self::insert) to add
31    /// entries, then [`finish`](Self::finish) to serialize.
32    pub fn new() -> Self {
33        Self { pairs: Vec::new() }
34    }
35
36    /// Add a key→value entry. Duplicate keys: last insert wins. Keys may be
37    /// inserted in any order (the builder sorts).
38    ///
39    /// ```
40    /// use inputx_fsa::{Builder, Fsa};
41    /// let mut b = Builder::new();
42    /// b.insert(b"banana", 4);
43    /// b.insert(b"apple", 1);   // arbitrary order is fine
44    /// b.insert(b"apple", 99);  // last-write-wins on duplicates
45    /// let fsa = Fsa::new(b.finish()).unwrap();
46    /// assert_eq!(fsa.get(b"apple"), Some(99));
47    /// ```
48    pub fn insert(&mut self, key: &[u8], value: u64) {
49        self.pairs.push((key.to_vec(), value));
50    }
51
52    /// Consume the builder and return the serialized FSA bytes.
53    ///
54    /// The output is a self-contained byte buffer — embed via
55    /// `include_bytes!`, `mmap`, or load from disk; either way pass it
56    /// directly to [`Fsa::new`](crate::Fsa::new).
57    ///
58    /// ```
59    /// use inputx_fsa::{Builder, Fsa};
60    /// let mut b = Builder::new();
61    /// for (k, v) in [(&b"a"[..], 1u64), (b"b", 2), (b"c", 3)] {
62    ///     b.insert(k, v);
63    /// }
64    /// let bytes: Vec<u8> = b.finish();
65    /// let fsa = Fsa::new(&bytes[..]).unwrap();
66    /// assert_eq!(fsa.len(), 3);
67    /// ```
68    pub fn finish(mut self) -> Vec<u8> {
69        // Sort by key; on duplicates keep the LAST inserted value.
70        self.pairs.sort_by(|a, b| a.0.cmp(&b.0));
71        self.pairs.dedup_by(|a, b| {
72            if a.0 == b.0 {
73                b.1 = a.1; // `a` is the later element; carry its value into the kept `b`
74                true
75            } else {
76                false
77            }
78        });
79
80        let values: Vec<u64> = self.pairs.iter().map(|(_, v)| *v).collect();
81
82        // ── Phase 1: trie ──────────────────────────────────────────────
83        let mut trie: Vec<TrieNode> = vec![TrieNode::default()]; // root = 0
84        for (key, _) in &self.pairs {
85            let mut cur = 0u32;
86            for &b in key {
87                cur = match trie[cur as usize].children.get(b) {
88                    Some(n) => n,
89                    None => {
90                        let n = trie.len() as u32;
91                        trie.push(TrieNode::default());
92                        trie[cur as usize].children.insert(b, n);
93                        n
94                    }
95                };
96            }
97            trie[cur as usize].final_ = true;
98        }
99
100        // ── Phase 2: minimize (hash-cons, post-order) ──────────────────
101        let mut register: Register<StateKey, u32> = Register::new();
102        let mut canon: Vec<CanonState> = Vec::new();
103        let root = minimize(0, &trie, &mut register, &mut canon);
104
105        // Right-language sizes (number of accepted strings from each state).
106        let mut num = vec![None; canon.len()];
107        for i in 0..canon.len() {
108            compute_num(i as u32, &canon, &mut num);
109        }
110        let num: Vec<u64> = num.into_iter().map(|n| n.unwrap_or(0)).collect();
111
112        serialize(&canon, &num, root, &values)
113    }
114}
115
116#[derive(Default)]
117struct TrieNode {
118    children: Children,
119    final_: bool,
120}
121
122/// Compact trie children. Most trie nodes (deep suffix chains over unique
123/// words) have 0 or 1 child — `None`/`One` keep those heap-allocation-free,
124/// which is the bulk of the build-time memory win over a per-node B-tree map.
125enum Children {
126    None,
127    One(u8, u32),
128    Many(Vec<(u8, u32)>), // sorted by label
129}
130
131impl Default for Children {
132    fn default() -> Self {
133        Children::None
134    }
135}
136
137impl Children {
138    fn get(&self, b: u8) -> Option<u32> {
139        match self {
140            Children::None => None,
141            Children::One(k, v) => (*k == b).then_some(*v),
142            Children::Many(m) => m.binary_search_by_key(&b, |&(k, _)| k).ok().map(|i| m[i].1),
143        }
144    }
145
146    fn insert(&mut self, b: u8, child: u32) {
147        match self {
148            Children::None => *self = Children::One(b, child),
149            Children::One(k, v) => {
150                if *k == b {
151                    *v = child;
152                } else {
153                    let pair = (*k, *v);
154                    let m = if *k < b {
155                        vec![pair, (b, child)]
156                    } else {
157                        vec![(b, child), pair]
158                    };
159                    *self = Children::Many(m);
160                }
161            }
162            Children::Many(m) => match m.binary_search_by_key(&b, |&(k, _)| k) {
163                Ok(i) => m[i].1 = child,
164                Err(i) => m.insert(i, (b, child)),
165            },
166        }
167    }
168
169    /// Sorted (label, child) pairs. Allocates — used once per node in the
170    /// minimize pass, not during the memory-heavy trie build.
171    fn collect_sorted(&self) -> Vec<(u8, u32)> {
172        match self {
173            Children::None => Vec::new(),
174            Children::One(k, v) => vec![(*k, *v)],
175            Children::Many(m) => m.clone(),
176        }
177    }
178}
179
180struct CanonState {
181    final_: bool,
182    trans: Vec<(u8, u32)>, // (label, canonical target id), sorted by label
183}
184
185type StateKey = (bool, Vec<(u8, u32)>);
186
187/// Post-order hash-cons: returns the canonical id for `node`. Because the
188/// input is a trie (a tree), each node is visited exactly once; the register
189/// collapses structurally-identical sub-automata into shared states.
190fn minimize(
191    node: u32,
192    trie: &[TrieNode],
193    register: &mut Register<StateKey, u32>,
194    canon: &mut Vec<CanonState>,
195) -> u32 {
196    let kids = trie[node as usize].children.collect_sorted();
197    let mut trans = Vec::with_capacity(kids.len());
198    for (label, child) in kids {
199        let cid = minimize(child, trie, register, canon);
200        trans.push((label, cid));
201    }
202    let final_ = trie[node as usize].final_;
203    let key: StateKey = (final_, trans.clone());
204    if let Some(&id) = register.get(&key) {
205        return id;
206    }
207    let id = canon.len() as u32;
208    canon.push(CanonState { final_, trans });
209    register.insert(key, id);
210    id
211}
212
213fn compute_num(id: u32, canon: &[CanonState], memo: &mut [Option<u64>]) -> u64 {
214    if let Some(n) = memo[id as usize] {
215        return n;
216    }
217    let st = &canon[id as usize];
218    let mut n = u64::from(st.final_);
219    for &(_, child) in &st.trans {
220        n += compute_num(child, canon, memo);
221    }
222    memo[id as usize] = Some(n);
223    n
224}
225
226fn value_width(values: &[u64]) -> u8 {
227    let maxv = values.iter().copied().max().unwrap_or(0);
228    if maxv <= 0xFF {
229        1
230    } else if maxv <= 0xFFFF {
231        2
232    } else if maxv <= 0xFFFF_FFFF {
233        4
234    } else {
235        8
236    }
237}
238
239/// Unsigned LEB128.
240pub(crate) fn write_uvarint(out: &mut Vec<u8>, mut v: u64) {
241    loop {
242        let mut byte = (v & 0x7f) as u8;
243        v >>= 7;
244        if v != 0 {
245            byte |= 0x80;
246        }
247        out.push(byte);
248        if v == 0 {
249            break;
250        }
251    }
252}
253
254/// Post-order over the DAG from `root`: every state appears after all its
255/// targets, so a state's transition target offsets are already known when
256/// it is written. Recursion depth is bounded by the longest key.
257fn post_order(root: u32, canon: &[CanonState]) -> Vec<u32> {
258    fn dfs(s: u32, canon: &[CanonState], visited: &mut [bool], order: &mut Vec<u32>) {
259        if visited[s as usize] {
260            return;
261        }
262        visited[s as usize] = true;
263        for &(_, c) in &canon[s as usize].trans {
264            dfs(c, canon, visited, order);
265        }
266        order.push(s);
267    }
268    let mut visited = vec![false; canon.len()];
269    let mut order = Vec::with_capacity(canon.len());
270    dfs(root, canon, &mut visited, &mut order);
271    order
272}
273
274/// Format v3 — compact, byte-offset addressed (no offset table). Per state:
275/// a flags byte (bit0 = final, bit1 = single-transition), then transitions.
276///
277/// - **Single-transition** node (bit1 set) → `[flags, label, delta]`. The
278///   target's right-language count is *omitted*: with one outgoing edge the
279///   ordinal walk never skips it, so its count is never read. These nodes are
280///   the vast majority (unique-word suffix chains), so dropping `ntrans` + the
281///   count here is the bulk of the size win.
282/// - **Otherwise** (0 or ≥2 transitions) → `[flags, ntrans, (label, delta,
283///   count)×ntrans]` (all LEB128).
284///
285/// Header: magic4 · ver1 · width1 · value_count u32 · root_off u32 ·
286/// state_count u32 (= 18 bytes). States blob follows; values tail.
287fn serialize(canon: &[CanonState], num: &[u64], root: u32, values: &[u64]) -> Vec<u8> {
288    let width = value_width(values);
289    let order = post_order(root, canon);
290
291    let mut state_off = vec![u32::MAX; canon.len()];
292    let mut blob: Vec<u8> = Vec::new();
293    for &s in &order {
294        let off = blob.len() as u32;
295        state_off[s as usize] = off;
296        let st = &canon[s as usize];
297        if st.trans.len() == 1 {
298            // single-transition fast form (no ntrans, no count)
299            let (label, target) = st.trans[0];
300            blob.push(u8::from(st.final_) | 0b10); // bit1 = single
301            blob.push(label);
302            write_uvarint(&mut blob, u64::from(off - state_off[target as usize]));
303        } else {
304            blob.push(u8::from(st.final_)); // bit1 = 0 → multi/leaf form
305            write_uvarint(&mut blob, st.trans.len() as u64);
306            for &(label, target) in &st.trans {
307                // target written earlier (post-order) → offset known, < off.
308                blob.push(label);
309                write_uvarint(&mut blob, u64::from(off - state_off[target as usize]));
310                write_uvarint(&mut blob, num[target as usize]);
311            }
312        }
313    }
314    let root_off = state_off[root as usize];
315
316    let mut out: Vec<u8> = Vec::with_capacity(18 + blob.len() + values.len() * width as usize);
317    out.extend_from_slice(b"IXFA");
318    out.push(3); // version
319    out.push(width);
320    out.extend_from_slice(&(values.len() as u32).to_le_bytes());
321    out.extend_from_slice(&root_off.to_le_bytes());
322    out.extend_from_slice(&(canon.len() as u32).to_le_bytes());
323    out.extend_from_slice(&blob);
324    for &v in values {
325        out.extend_from_slice(&v.to_le_bytes()[..width as usize]);
326    }
327    out
328}