Skip to main content

inputx_fsa/
reader.rs

1//! Read side: parse the serialized buffer and answer `get` / `prefix` /
2//! `iter` over the embedded bytes with no decompression step. Generic over
3//! the byte container `D: AsRef<[u8]>` so the same reader works on a
4//! `Vec<u8>`, a `&[u8]`, or a future memory-mapped file.
5//!
6//! Format v3 (see `builder::serialize`): states are byte-offset addressed
7//! (no offset table). A flags byte (bit0=final, bit1=single-transition)
8//! precedes the transitions; single-transition nodes drop the arity + the
9//! count. A transition stores `label`, a LEB128 back-delta to the target,
10//! and (multi-node only) the target's LEB128 right-language count.
11
12use alloc::vec::Vec;
13
14const HEADER_LEN: usize = 18; // magic4 + ver1 + width1 + value_count4 + root_off4 + state_count4
15
16/// Error parsing an FSA buffer.
17#[derive(Debug, PartialEq, Eq)]
18pub enum FsaError {
19    BadMagic,
20    BadVersion(u8),
21    Truncated,
22}
23
24/// A read-only minimal-FSA map over a byte container.
25pub struct Fsa<D> {
26    data: D,
27    value_width: usize,
28    value_count: u32,
29    state_count: u32,
30    root_rel: u32,
31    blob_start: usize,
32    values_start: usize,
33}
34
35#[inline]
36pub(crate) fn rd_u32(b: &[u8], at: usize) -> u32 {
37    u32::from_le_bytes([b[at], b[at + 1], b[at + 2], b[at + 3]])
38}
39
40/// Read an unsigned LEB128 starting at `*p`, advancing `*p` past it.
41/// Bounds-safe: returns `None` if the varint runs off the end of `b` or
42/// is malformed (>10 bytes), so a corrupt buffer degrades instead of
43/// panicking.
44#[inline]
45pub(crate) fn rd_uvarint(b: &[u8], p: &mut usize) -> Option<u64> {
46    let mut v = 0u64;
47    let mut shift = 0u32;
48    loop {
49        let byte = *b.get(*p)?;
50        *p += 1;
51        v |= u64::from(byte & 0x7f) << shift;
52        if byte & 0x80 == 0 {
53            return Some(v);
54        }
55        shift += 7;
56        if shift >= 64 {
57            return None; // malformed: varint too long
58        }
59    }
60}
61
62/// Max key length walked during a prefix DFS. Valid IME keys are tiny; this
63/// only bounds recursion depth so a corrupt/cyclic buffer can't overflow the
64/// stack (defense for untrusted input).
65const MAX_WALK_DEPTH: usize = 4096;
66
67/// Resolve a transition's back-delta to a target rel-offset: must be `> 0`
68/// (strictly earlier — no self-loop) and not underflow. `None` on corrupt.
69#[inline]
70fn decode_target(rel: u32, delta: u64) -> Option<u32> {
71    u32::try_from(delta)
72        .ok()
73        .filter(|&d| d > 0)
74        .and_then(|d| rel.checked_sub(d))
75}
76
77impl<D: AsRef<[u8]>> Fsa<D> {
78    /// Load an FSA from a serialized byte buffer. Accepts anything that
79    /// implements `AsRef<[u8]>` (`&[u8]`, `Vec<u8>`, `mmap::Mmap`, etc.).
80    /// The buffer is parsed header-only — no allocation per entry.
81    ///
82    /// Returns [`FsaError::BadMagic`] / [`FsaError::BadVersion`] /
83    /// [`FsaError::Truncated`] on invalid input; never panics.
84    ///
85    /// ```
86    /// use inputx_fsa::{Builder, Fsa, FsaError};
87    /// let mut b = Builder::new();
88    /// b.insert(b"hello", 42);
89    /// let bytes = b.finish();
90    ///
91    /// let fsa = Fsa::new(&bytes[..]).unwrap();
92    /// assert_eq!(fsa.len(), 1);
93    ///
94    /// // Corrupt magic → graceful error, no panic.
95    /// let mut bad = bytes.clone();
96    /// bad[0] = 0;
97    /// assert!(matches!(Fsa::new(&bad[..]), Err(FsaError::BadMagic)));
98    /// ```
99    pub fn new(data: D) -> Result<Self, FsaError> {
100        let b = data.as_ref();
101        if b.len() < HEADER_LEN {
102            return Err(FsaError::Truncated);
103        }
104        if &b[0..4] != b"IXFA" {
105            return Err(FsaError::BadMagic);
106        }
107        if b[4] != 3 {
108            return Err(FsaError::BadVersion(b[4]));
109        }
110        let value_width = b[5] as usize;
111        let value_count = rd_u32(b, 6);
112        let root_rel = rd_u32(b, 10);
113        let state_count = rd_u32(b, 14);
114        let blob_start = HEADER_LEN;
115        let values_len = value_count as usize * value_width;
116        if b.len() < blob_start + values_len {
117            return Err(FsaError::Truncated);
118        }
119        let values_start = b.len() - values_len;
120        Ok(Self {
121            data,
122            value_width,
123            value_count,
124            state_count,
125            root_rel,
126            blob_start,
127            values_start,
128        })
129    }
130
131    /// Number of keys stored.
132    pub fn len(&self) -> u64 {
133        u64::from(self.value_count)
134    }
135
136    pub fn is_empty(&self) -> bool {
137        self.value_count == 0
138    }
139
140    /// Total number of states in the automaton (diagnostics).
141    pub fn state_count(&self) -> u32 {
142        self.state_count
143    }
144
145    #[inline]
146    fn read_value(&self, ord: u64) -> u64 {
147        let b = self.data.as_ref();
148        let at = self.values_start + ord as usize * self.value_width;
149        let mut v = 0u64;
150        for i in 0..self.value_width {
151            v |= u64::from(b.get(at + i).copied().unwrap_or(0)) << (8 * i);
152        }
153        v
154    }
155
156    #[inline]
157    fn is_final(&self, rel: u32) -> bool {
158        self.data
159            .as_ref()
160            .get(self.blob_start + rel as usize)
161            .is_some_and(|x| x & 1 != 0)
162    }
163
164    /// Walk one state: from state at `rel`, take `byte`. Returns the target
165    /// state's relative offset, adding to `ord` the rank contribution of
166    /// everything that sorts before that branch. `None` if no such transition
167    /// (or on any out-of-bounds read — a corrupt buffer degrades to "no
168    /// match" rather than panicking).
169    #[inline]
170    fn step(&self, rel: u32, byte: u8, ord: &mut u64) -> Option<u32> {
171        let b = self.data.as_ref();
172        let mut p = self.blob_start + rel as usize;
173        let flags = *b.get(p)?;
174        p += 1;
175        if flags & 1 != 0 {
176            *ord += 1; // final: the (shorter) word ending here sorts first
177        }
178        if flags & 0b10 != 0 {
179            // single-transition form: [flags, label, delta], no count.
180            let label = *b.get(p)?;
181            p += 1;
182            let delta = rd_uvarint(b, &mut p)?;
183            // Only one edge: take it iff it matches; its count is never needed.
184            if label == byte {
185                return rel.checked_sub(u32::try_from(delta).ok()?);
186            }
187            return None;
188        }
189        let ntrans = rd_uvarint(b, &mut p)?;
190        for _ in 0..ntrans {
191            let label = *b.get(p)?;
192            p += 1;
193            let delta = rd_uvarint(b, &mut p)?;
194            let numt = rd_uvarint(b, &mut p)?;
195            if label < byte {
196                *ord += numt;
197            } else if label == byte {
198                return rel.checked_sub(u32::try_from(delta).ok()?);
199            } else {
200                break; // label-sorted
201            }
202        }
203        None
204    }
205
206    /// Look up `key` — `Some(value)` for an exact match, `None` otherwise.
207    /// Constant cost per byte; the buffer is never copied.
208    ///
209    /// ```
210    /// use inputx_fsa::{Builder, Fsa};
211    /// let mut b = Builder::new();
212    /// b.insert(b"ni", 1);
213    /// b.insert(b"nihao", 100);
214    /// let fsa = Fsa::new(b.finish()).unwrap();
215    /// assert_eq!(fsa.get(b"ni"), Some(1));
216    /// assert_eq!(fsa.get(b"nihao"), Some(100));
217    /// assert_eq!(fsa.get(b"niha"), None);  // prefix-only, not a key
218    /// assert_eq!(fsa.get(b"absent"), None);
219    /// ```
220    pub fn get(&self, key: &[u8]) -> Option<u64> {
221        if self.value_count == 0 {
222            return None;
223        }
224        let mut rel = self.root_rel;
225        let mut ord = 0u64;
226        for &byte in key {
227            rel = self.step(rel, byte, &mut ord)?;
228        }
229        if self.is_final(rel) {
230            Some(self.read_value(ord))
231        } else {
232            None
233        }
234    }
235
236    /// `true` if any stored key starts with `prefix`.
237    pub fn contains_prefix(&self, prefix: &[u8]) -> bool {
238        self.walk_to(prefix).is_some()
239    }
240
241    /// All (key, value) pairs whose key starts with `prefix`, sorted.
242    pub fn prefix(&self, prefix: &[u8]) -> Vec<(Vec<u8>, u64)> {
243        let mut out = Vec::new();
244        self.prefix_for_each(prefix, |k, v| out.push((k.to_vec(), v)));
245        out
246    }
247
248    /// Streaming variant: invoke `visit(key, value)` for every (key, value)
249    /// whose key starts with `prefix`, in sorted order, without allocating a
250    /// result vector. The `key` slice is valid only for the call. This is the
251    /// hot-path entry — a bare-letter prefix can match tens of thousands of
252    /// keys, and materializing them all would dominate cost.
253    ///
254    /// ```
255    /// use inputx_fsa::{Builder, Fsa};
256    /// let mut b = Builder::new();
257    /// for (k, v) in [(&b"apple"[..], 1u64), (b"apply", 2), (b"banana", 3)] {
258    ///     b.insert(k, v);
259    /// }
260    /// let fsa = Fsa::new(b.finish()).unwrap();
261    ///
262    /// let mut count = 0;
263    /// let mut last_value = 0;
264    /// fsa.prefix_for_each(b"app", |_key, value| {
265    ///     count += 1;
266    ///     last_value = value;
267    /// });
268    /// assert_eq!(count, 2);          // apple + apply
269    /// assert_eq!(last_value, 2);     // "apply" sorts last
270    /// ```
271    pub fn prefix_for_each<F: FnMut(&[u8], u64)>(&self, prefix: &[u8], mut visit: F) {
272        if let Some((rel, ord)) = self.walk_to(prefix) {
273            let mut cur = prefix.to_vec();
274            let mut ord = ord;
275            self.visit_subtree(rel, &mut cur, &mut ord, &mut visit);
276        }
277    }
278
279    /// Lazy iterator over all (key, value) pairs in sorted order.
280    pub fn iter(&self) -> FsaIter<'_, D> {
281        self.range(b"")
282    }
283
284    /// Lazy iterator over (key, value) pairs whose key starts with `prefix`,
285    /// in sorted order. Unlike [`prefix`](Self::prefix) it allocates no result
286    /// vector and supports early termination (`.take`, `.find`, `break`).
287    pub fn range(&self, prefix: &[u8]) -> FsaIter<'_, D> {
288        let (to_enter, ord) = match self.walk_to(prefix) {
289            Some((rel, ord)) => (Some(rel), ord),
290            None => (None, 0),
291        };
292        FsaIter {
293            fsa: self,
294            stack: Vec::new(),
295            cur: prefix.to_vec(),
296            ord,
297            to_enter,
298        }
299    }
300
301    fn walk_to(&self, prefix: &[u8]) -> Option<(u32, u64)> {
302        if self.value_count == 0 {
303            return None;
304        }
305        let mut rel = self.root_rel;
306        let mut ord = 0u64;
307        for &byte in prefix {
308            rel = self.step(rel, byte, &mut ord)?;
309        }
310        Some((rel, ord))
311    }
312
313    /// DFS in label-sorted order from `rel`, invoking `visit(key, value)` for
314    /// each accepted key and advancing `ord`. Recursion depth ≤ longest key.
315    fn visit_subtree<F: FnMut(&[u8], u64)>(
316        &self,
317        rel: u32,
318        cur: &mut Vec<u8>,
319        ord: &mut u64,
320        visit: &mut F,
321    ) {
322        // Depth guard: bounds recursion on a corrupt/cyclic buffer.
323        if cur.len() > MAX_WALK_DEPTH {
324            return;
325        }
326        let b = self.data.as_ref();
327        let mut p = self.blob_start + rel as usize;
328        let Some(&flags) = b.get(p) else { return };
329        p += 1;
330        if flags & 1 != 0 {
331            visit(cur, self.read_value(*ord));
332            *ord += 1;
333        }
334        if flags & 0b10 != 0 {
335            // single-transition form: [flags, label, delta], no count.
336            let Some(&label) = b.get(p) else { return };
337            p += 1;
338            let Some(delta) = rd_uvarint(b, &mut p) else { return };
339            if let Some(target) = decode_target(rel, delta) {
340                cur.push(label);
341                self.visit_subtree(target, cur, ord, visit);
342                cur.pop();
343            }
344            return;
345        }
346        let Some(ntrans) = rd_uvarint(b, &mut p) else { return };
347        for _ in 0..ntrans {
348            let Some(&label) = b.get(p) else { return };
349            p += 1;
350            let Some(delta) = rd_uvarint(b, &mut p) else { return };
351            let Some(_num) = rd_uvarint(b, &mut p) else { return };
352            if let Some(target) = decode_target(rel, delta) {
353                cur.push(label);
354                self.visit_subtree(target, cur, ord, visit);
355                cur.pop();
356            }
357        }
358    }
359}
360
361/// One state being expanded on the iterator's explicit DFS stack.
362struct IterFrame {
363    rel: u32,
364    p: usize,        // byte cursor: next transition record to read
365    remaining: u64,  // transitions left in this state
366    base_len: usize, // `cur.len()` at this state (truncate target between children)
367    single: bool,    // single-transition form (no per-edge count)
368}
369
370/// Lazy, allocation-light iterator over an [`Fsa`]'s (key, value) pairs in
371/// sorted order — see [`Fsa::iter`] / [`Fsa::range`]. Pre-order DFS via an
372/// explicit stack (depth = key length), so it composes with the `Iterator`
373/// adapters and stops early without walking the whole automaton. On a corrupt
374/// buffer it simply ends (consistent with the bounds-safe reader).
375pub struct FsaIter<'a, D> {
376    fsa: &'a Fsa<D>,
377    stack: Vec<IterFrame>,
378    cur: Vec<u8>,
379    ord: u64,
380    to_enter: Option<u32>,
381}
382
383impl<D: AsRef<[u8]>> Iterator for FsaIter<'_, D> {
384    type Item = (Vec<u8>, u64);
385
386    fn next(&mut self) -> Option<Self::Item> {
387        let b = self.fsa.data.as_ref();
388        loop {
389            if let Some(rel) = self.to_enter.take() {
390                // Enter a state: push its frame; if final, yield its key now.
391                let mut p = self.fsa.blob_start + rel as usize;
392                let flags = *b.get(p)?;
393                p += 1;
394                let single = flags & 0b10 != 0;
395                let final_ = flags & 1 != 0;
396                let remaining = if single { 1 } else { rd_uvarint(b, &mut p)? };
397                self.stack.push(IterFrame {
398                    rel,
399                    p,
400                    remaining,
401                    base_len: self.cur.len(),
402                    single,
403                });
404                if final_ {
405                    let v = self.fsa.read_value(self.ord);
406                    self.ord += 1;
407                    return Some((self.cur.clone(), v));
408                }
409                continue;
410            }
411            let frame = self.stack.last_mut()?;
412            // Drop the previous child's label before taking the next edge.
413            self.cur.truncate(frame.base_len);
414            if frame.remaining == 0 {
415                self.stack.pop();
416                continue;
417            }
418            let mut p = frame.p;
419            let label = *b.get(p)?;
420            p += 1;
421            let delta = rd_uvarint(b, &mut p)?;
422            if !frame.single {
423                rd_uvarint(b, &mut p)?; // skip the count
424            }
425            frame.p = p;
426            frame.remaining -= 1;
427            let target = decode_target(frame.rel, delta)?;
428            self.cur.push(label);
429            self.to_enter = Some(target);
430        }
431    }
432}