Skip to main content

inputx_fsa/
builder.rs

1//! Build side: collect (key, value) pairs → minimal acyclic FSA + value
2//! table → serialized bytes. Build-time only (not perf-critical), so it
3//! favors a simple, verifiable two-phase construction:
4//!
5//!   1. Insert all keys into a plain trie.
6//!   2. Minimize bottom-up by hash-consing equivalent sub-automata
7//!      (Revuz's algorithm) — two states are equal iff same finality and
8//!      same (label → canonical-target) transition set.
9//!
10//! Values are kept out of the automaton (it stays a pure key recognizer)
11//! and emitted as a fixed-width array indexed by each key's sorted rank.
12
13use alloc::vec;
14use alloc::vec::Vec;
15
16// Hash-cons register for state minimization. HashMap (O(1)) when `std` is
17// on; a BTreeMap keeps the builder available in no_std builds.
18#[cfg(feature = "std")]
19use std::collections::HashMap as Register;
20#[cfg(not(feature = "std"))]
21use alloc::collections::BTreeMap as Register;
22
23/// Accumulates (key, value) pairs and serializes a minimal FSA.
24#[derive(Default)]
25pub struct Builder {
26    pairs: Vec<(Vec<u8>, u64)>,
27}
28
29impl Builder {
30    pub fn new() -> Self {
31        Self { pairs: Vec::new() }
32    }
33
34    /// Add a key→value entry. Duplicate keys: last insert wins. Keys may be
35    /// inserted in any order (the builder sorts).
36    pub fn insert(&mut self, key: &[u8], value: u64) {
37        self.pairs.push((key.to_vec(), value));
38    }
39
40    /// Consume the builder and return the serialized FSA bytes.
41    pub fn finish(mut self) -> Vec<u8> {
42        // Sort by key; on duplicates keep the LAST inserted value.
43        self.pairs.sort_by(|a, b| a.0.cmp(&b.0));
44        self.pairs.dedup_by(|a, b| {
45            if a.0 == b.0 {
46                b.1 = a.1; // `a` is the later element; carry its value into the kept `b`
47                true
48            } else {
49                false
50            }
51        });
52
53        let values: Vec<u64> = self.pairs.iter().map(|(_, v)| *v).collect();
54
55        // ── Phase 1: trie ──────────────────────────────────────────────
56        let mut trie: Vec<TrieNode> = vec![TrieNode::default()]; // root = 0
57        for (key, _) in &self.pairs {
58            let mut cur = 0u32;
59            for &b in key {
60                cur = match trie[cur as usize].children.get(b) {
61                    Some(n) => n,
62                    None => {
63                        let n = trie.len() as u32;
64                        trie.push(TrieNode::default());
65                        trie[cur as usize].children.insert(b, n);
66                        n
67                    }
68                };
69            }
70            trie[cur as usize].final_ = true;
71        }
72
73        // ── Phase 2: minimize (hash-cons, post-order) ──────────────────
74        let mut register: Register<StateKey, u32> = Register::new();
75        let mut canon: Vec<CanonState> = Vec::new();
76        let root = minimize(0, &trie, &mut register, &mut canon);
77
78        // Right-language sizes (number of accepted strings from each state).
79        let mut num = vec![None; canon.len()];
80        for i in 0..canon.len() {
81            compute_num(i as u32, &canon, &mut num);
82        }
83        let num: Vec<u64> = num.into_iter().map(|n| n.unwrap_or(0)).collect();
84
85        serialize(&canon, &num, root, &values)
86    }
87}
88
89#[derive(Default)]
90struct TrieNode {
91    children: Children,
92    final_: bool,
93}
94
95/// Compact trie children. Most trie nodes (deep suffix chains over unique
96/// words) have 0 or 1 child — `None`/`One` keep those heap-allocation-free,
97/// which is the bulk of the build-time memory win over a per-node B-tree map.
98enum Children {
99    None,
100    One(u8, u32),
101    Many(Vec<(u8, u32)>), // sorted by label
102}
103
104impl Default for Children {
105    fn default() -> Self {
106        Children::None
107    }
108}
109
110impl Children {
111    fn get(&self, b: u8) -> Option<u32> {
112        match self {
113            Children::None => None,
114            Children::One(k, v) => (*k == b).then_some(*v),
115            Children::Many(m) => m.binary_search_by_key(&b, |&(k, _)| k).ok().map(|i| m[i].1),
116        }
117    }
118
119    fn insert(&mut self, b: u8, child: u32) {
120        match self {
121            Children::None => *self = Children::One(b, child),
122            Children::One(k, v) => {
123                if *k == b {
124                    *v = child;
125                } else {
126                    let pair = (*k, *v);
127                    let m = if *k < b {
128                        vec![pair, (b, child)]
129                    } else {
130                        vec![(b, child), pair]
131                    };
132                    *self = Children::Many(m);
133                }
134            }
135            Children::Many(m) => match m.binary_search_by_key(&b, |&(k, _)| k) {
136                Ok(i) => m[i].1 = child,
137                Err(i) => m.insert(i, (b, child)),
138            },
139        }
140    }
141
142    /// Sorted (label, child) pairs. Allocates — used once per node in the
143    /// minimize pass, not during the memory-heavy trie build.
144    fn collect_sorted(&self) -> Vec<(u8, u32)> {
145        match self {
146            Children::None => Vec::new(),
147            Children::One(k, v) => vec![(*k, *v)],
148            Children::Many(m) => m.clone(),
149        }
150    }
151}
152
153struct CanonState {
154    final_: bool,
155    trans: Vec<(u8, u32)>, // (label, canonical target id), sorted by label
156}
157
158type StateKey = (bool, Vec<(u8, u32)>);
159
160/// Post-order hash-cons: returns the canonical id for `node`. Because the
161/// input is a trie (a tree), each node is visited exactly once; the register
162/// collapses structurally-identical sub-automata into shared states.
163fn minimize(
164    node: u32,
165    trie: &[TrieNode],
166    register: &mut Register<StateKey, u32>,
167    canon: &mut Vec<CanonState>,
168) -> u32 {
169    let kids = trie[node as usize].children.collect_sorted();
170    let mut trans = Vec::with_capacity(kids.len());
171    for (label, child) in kids {
172        let cid = minimize(child, trie, register, canon);
173        trans.push((label, cid));
174    }
175    let final_ = trie[node as usize].final_;
176    let key: StateKey = (final_, trans.clone());
177    if let Some(&id) = register.get(&key) {
178        return id;
179    }
180    let id = canon.len() as u32;
181    canon.push(CanonState { final_, trans });
182    register.insert(key, id);
183    id
184}
185
186fn compute_num(id: u32, canon: &[CanonState], memo: &mut [Option<u64>]) -> u64 {
187    if let Some(n) = memo[id as usize] {
188        return n;
189    }
190    let st = &canon[id as usize];
191    let mut n = u64::from(st.final_);
192    for &(_, child) in &st.trans {
193        n += compute_num(child, canon, memo);
194    }
195    memo[id as usize] = Some(n);
196    n
197}
198
199fn value_width(values: &[u64]) -> u8 {
200    let maxv = values.iter().copied().max().unwrap_or(0);
201    if maxv <= 0xFF {
202        1
203    } else if maxv <= 0xFFFF {
204        2
205    } else if maxv <= 0xFFFF_FFFF {
206        4
207    } else {
208        8
209    }
210}
211
212/// Unsigned LEB128.
213pub(crate) fn write_uvarint(out: &mut Vec<u8>, mut v: u64) {
214    loop {
215        let mut byte = (v & 0x7f) as u8;
216        v >>= 7;
217        if v != 0 {
218            byte |= 0x80;
219        }
220        out.push(byte);
221        if v == 0 {
222            break;
223        }
224    }
225}
226
227/// Post-order over the DAG from `root`: every state appears after all its
228/// targets, so a state's transition target offsets are already known when
229/// it is written. Recursion depth is bounded by the longest key.
230fn post_order(root: u32, canon: &[CanonState]) -> Vec<u32> {
231    fn dfs(s: u32, canon: &[CanonState], visited: &mut [bool], order: &mut Vec<u32>) {
232        if visited[s as usize] {
233            return;
234        }
235        visited[s as usize] = true;
236        for &(_, c) in &canon[s as usize].trans {
237            dfs(c, canon, visited, order);
238        }
239        order.push(s);
240    }
241    let mut visited = vec![false; canon.len()];
242    let mut order = Vec::with_capacity(canon.len());
243    dfs(root, canon, &mut visited, &mut order);
244    order
245}
246
247/// Format v3 — compact, byte-offset addressed (no offset table). Per state:
248/// a flags byte (bit0 = final, bit1 = single-transition), then transitions.
249///
250/// - **Single-transition** node (bit1 set) → `[flags, label, delta]`. The
251///   target's right-language count is *omitted*: with one outgoing edge the
252///   ordinal walk never skips it, so its count is never read. These nodes are
253///   the vast majority (unique-word suffix chains), so dropping `ntrans` + the
254///   count here is the bulk of the size win.
255/// - **Otherwise** (0 or ≥2 transitions) → `[flags, ntrans, (label, delta,
256///   count)×ntrans]` (all LEB128).
257///
258/// Header: magic4 · ver1 · width1 · value_count u32 · root_off u32 ·
259/// state_count u32 (= 18 bytes). States blob follows; values tail.
260fn serialize(canon: &[CanonState], num: &[u64], root: u32, values: &[u64]) -> Vec<u8> {
261    let width = value_width(values);
262    let order = post_order(root, canon);
263
264    let mut state_off = vec![u32::MAX; canon.len()];
265    let mut blob: Vec<u8> = Vec::new();
266    for &s in &order {
267        let off = blob.len() as u32;
268        state_off[s as usize] = off;
269        let st = &canon[s as usize];
270        if st.trans.len() == 1 {
271            // single-transition fast form (no ntrans, no count)
272            let (label, target) = st.trans[0];
273            blob.push(u8::from(st.final_) | 0b10); // bit1 = single
274            blob.push(label);
275            write_uvarint(&mut blob, u64::from(off - state_off[target as usize]));
276        } else {
277            blob.push(u8::from(st.final_)); // bit1 = 0 → multi/leaf form
278            write_uvarint(&mut blob, st.trans.len() as u64);
279            for &(label, target) in &st.trans {
280                // target written earlier (post-order) → offset known, < off.
281                blob.push(label);
282                write_uvarint(&mut blob, u64::from(off - state_off[target as usize]));
283                write_uvarint(&mut blob, num[target as usize]);
284            }
285        }
286    }
287    let root_off = state_off[root as usize];
288
289    let mut out: Vec<u8> = Vec::with_capacity(18 + blob.len() + values.len() * width as usize);
290    out.extend_from_slice(b"IXFA");
291    out.push(3); // version
292    out.push(width);
293    out.extend_from_slice(&(values.len() as u32).to_le_bytes());
294    out.extend_from_slice(&root_off.to_le_bytes());
295    out.extend_from_slice(&(canon.len() as u32).to_le_bytes());
296    out.extend_from_slice(&blob);
297    for &v in values {
298        out.extend_from_slice(&v.to_le_bytes()[..width as usize]);
299    }
300    out
301}