Skip to main content

inputx_fsa/
reader.rs

1//! Read side: parse the serialized buffer and answer `get` / `prefix` /
2//! `iter` over the embedded bytes with no decompression step. Generic over
3//! the byte container `D: AsRef<[u8]>` so the same reader works on a
4//! `Vec<u8>`, a `&[u8]`, or a future memory-mapped file.
5//!
6//! Format v3 (see `builder::serialize`): states are byte-offset addressed
7//! (no offset table). A flags byte (bit0=final, bit1=single-transition)
8//! precedes the transitions; single-transition nodes drop the arity + the
9//! count. A transition stores `label`, a LEB128 back-delta to the target,
10//! and (multi-node only) the target's LEB128 right-language count.
11
12use alloc::vec::Vec;
13
14const HEADER_LEN: usize = 18; // magic4 + ver1 + width1 + value_count4 + root_off4 + state_count4
15
16/// Error parsing an FSA buffer.
17#[derive(Debug, PartialEq, Eq)]
18pub enum FsaError {
19    BadMagic,
20    BadVersion(u8),
21    Truncated,
22}
23
24/// A read-only minimal-FSA map over a byte container.
25pub struct Fsa<D> {
26    data: D,
27    value_width: usize,
28    value_count: u32,
29    state_count: u32,
30    root_rel: u32,
31    blob_start: usize,
32    values_start: usize,
33}
34
35#[inline]
36pub(crate) fn rd_u32(b: &[u8], at: usize) -> u32 {
37    u32::from_le_bytes([b[at], b[at + 1], b[at + 2], b[at + 3]])
38}
39
40/// Read an unsigned LEB128 starting at `*p`, advancing `*p` past it.
41/// Bounds-safe: returns `None` if the varint runs off the end of `b` or
42/// is malformed (>10 bytes), so a corrupt buffer degrades instead of
43/// panicking.
44#[inline]
45pub(crate) fn rd_uvarint(b: &[u8], p: &mut usize) -> Option<u64> {
46    let mut v = 0u64;
47    let mut shift = 0u32;
48    loop {
49        let byte = *b.get(*p)?;
50        *p += 1;
51        v |= u64::from(byte & 0x7f) << shift;
52        if byte & 0x80 == 0 {
53            return Some(v);
54        }
55        shift += 7;
56        if shift >= 64 {
57            return None; // malformed: varint too long
58        }
59    }
60}
61
62/// Max key length walked during a prefix DFS. Valid IME keys are tiny; this
63/// only bounds recursion depth so a corrupt/cyclic buffer can't overflow the
64/// stack (defense for untrusted input).
65const MAX_WALK_DEPTH: usize = 4096;
66
67/// Resolve a transition's back-delta to a target rel-offset: must be `> 0`
68/// (strictly earlier — no self-loop) and not underflow. `None` on corrupt.
69#[inline]
70fn decode_target(rel: u32, delta: u64) -> Option<u32> {
71    u32::try_from(delta)
72        .ok()
73        .filter(|&d| d > 0)
74        .and_then(|d| rel.checked_sub(d))
75}
76
77impl<D: AsRef<[u8]>> Fsa<D> {
78    pub fn new(data: D) -> Result<Self, FsaError> {
79        let b = data.as_ref();
80        if b.len() < HEADER_LEN {
81            return Err(FsaError::Truncated);
82        }
83        if &b[0..4] != b"IXFA" {
84            return Err(FsaError::BadMagic);
85        }
86        if b[4] != 3 {
87            return Err(FsaError::BadVersion(b[4]));
88        }
89        let value_width = b[5] as usize;
90        let value_count = rd_u32(b, 6);
91        let root_rel = rd_u32(b, 10);
92        let state_count = rd_u32(b, 14);
93        let blob_start = HEADER_LEN;
94        let values_len = value_count as usize * value_width;
95        if b.len() < blob_start + values_len {
96            return Err(FsaError::Truncated);
97        }
98        let values_start = b.len() - values_len;
99        Ok(Self {
100            data,
101            value_width,
102            value_count,
103            state_count,
104            root_rel,
105            blob_start,
106            values_start,
107        })
108    }
109
110    /// Number of keys stored.
111    pub fn len(&self) -> u64 {
112        u64::from(self.value_count)
113    }
114
115    pub fn is_empty(&self) -> bool {
116        self.value_count == 0
117    }
118
119    /// Total number of states in the automaton (diagnostics).
120    pub fn state_count(&self) -> u32 {
121        self.state_count
122    }
123
124    #[inline]
125    fn read_value(&self, ord: u64) -> u64 {
126        let b = self.data.as_ref();
127        let at = self.values_start + ord as usize * self.value_width;
128        let mut v = 0u64;
129        for i in 0..self.value_width {
130            v |= u64::from(b.get(at + i).copied().unwrap_or(0)) << (8 * i);
131        }
132        v
133    }
134
135    #[inline]
136    fn is_final(&self, rel: u32) -> bool {
137        self.data
138            .as_ref()
139            .get(self.blob_start + rel as usize)
140            .is_some_and(|x| x & 1 != 0)
141    }
142
143    /// Walk one state: from state at `rel`, take `byte`. Returns the target
144    /// state's relative offset, adding to `ord` the rank contribution of
145    /// everything that sorts before that branch. `None` if no such transition
146    /// (or on any out-of-bounds read — a corrupt buffer degrades to "no
147    /// match" rather than panicking).
148    #[inline]
149    fn step(&self, rel: u32, byte: u8, ord: &mut u64) -> Option<u32> {
150        let b = self.data.as_ref();
151        let mut p = self.blob_start + rel as usize;
152        let flags = *b.get(p)?;
153        p += 1;
154        if flags & 1 != 0 {
155            *ord += 1; // final: the (shorter) word ending here sorts first
156        }
157        if flags & 0b10 != 0 {
158            // single-transition form: [flags, label, delta], no count.
159            let label = *b.get(p)?;
160            p += 1;
161            let delta = rd_uvarint(b, &mut p)?;
162            // Only one edge: take it iff it matches; its count is never needed.
163            if label == byte {
164                return rel.checked_sub(u32::try_from(delta).ok()?);
165            }
166            return None;
167        }
168        let ntrans = rd_uvarint(b, &mut p)?;
169        for _ in 0..ntrans {
170            let label = *b.get(p)?;
171            p += 1;
172            let delta = rd_uvarint(b, &mut p)?;
173            let numt = rd_uvarint(b, &mut p)?;
174            if label < byte {
175                *ord += numt;
176            } else if label == byte {
177                return rel.checked_sub(u32::try_from(delta).ok()?);
178            } else {
179                break; // label-sorted
180            }
181        }
182        None
183    }
184
185    /// Look up `key`.
186    pub fn get(&self, key: &[u8]) -> Option<u64> {
187        if self.value_count == 0 {
188            return None;
189        }
190        let mut rel = self.root_rel;
191        let mut ord = 0u64;
192        for &byte in key {
193            rel = self.step(rel, byte, &mut ord)?;
194        }
195        if self.is_final(rel) {
196            Some(self.read_value(ord))
197        } else {
198            None
199        }
200    }
201
202    /// `true` if any stored key starts with `prefix`.
203    pub fn contains_prefix(&self, prefix: &[u8]) -> bool {
204        self.walk_to(prefix).is_some()
205    }
206
207    /// All (key, value) pairs whose key starts with `prefix`, sorted.
208    pub fn prefix(&self, prefix: &[u8]) -> Vec<(Vec<u8>, u64)> {
209        let mut out = Vec::new();
210        self.prefix_for_each(prefix, |k, v| out.push((k.to_vec(), v)));
211        out
212    }
213
214    /// Streaming variant: invoke `visit(key, value)` for every (key, value)
215    /// whose key starts with `prefix`, in sorted order, without allocating a
216    /// result vector. The `key` slice is valid only for the call. This is the
217    /// hot-path entry — a bare-letter prefix can match tens of thousands of
218    /// keys, and materializing them all would dominate cost.
219    pub fn prefix_for_each<F: FnMut(&[u8], u64)>(&self, prefix: &[u8], mut visit: F) {
220        if let Some((rel, ord)) = self.walk_to(prefix) {
221            let mut cur = prefix.to_vec();
222            let mut ord = ord;
223            self.visit_subtree(rel, &mut cur, &mut ord, &mut visit);
224        }
225    }
226
227    /// Lazy iterator over all (key, value) pairs in sorted order.
228    pub fn iter(&self) -> FsaIter<'_, D> {
229        self.range(b"")
230    }
231
232    /// Lazy iterator over (key, value) pairs whose key starts with `prefix`,
233    /// in sorted order. Unlike [`prefix`](Self::prefix) it allocates no result
234    /// vector and supports early termination (`.take`, `.find`, `break`).
235    pub fn range(&self, prefix: &[u8]) -> FsaIter<'_, D> {
236        let (to_enter, ord) = match self.walk_to(prefix) {
237            Some((rel, ord)) => (Some(rel), ord),
238            None => (None, 0),
239        };
240        FsaIter {
241            fsa: self,
242            stack: Vec::new(),
243            cur: prefix.to_vec(),
244            ord,
245            to_enter,
246        }
247    }
248
249    fn walk_to(&self, prefix: &[u8]) -> Option<(u32, u64)> {
250        if self.value_count == 0 {
251            return None;
252        }
253        let mut rel = self.root_rel;
254        let mut ord = 0u64;
255        for &byte in prefix {
256            rel = self.step(rel, byte, &mut ord)?;
257        }
258        Some((rel, ord))
259    }
260
261    /// DFS in label-sorted order from `rel`, invoking `visit(key, value)` for
262    /// each accepted key and advancing `ord`. Recursion depth ≤ longest key.
263    fn visit_subtree<F: FnMut(&[u8], u64)>(
264        &self,
265        rel: u32,
266        cur: &mut Vec<u8>,
267        ord: &mut u64,
268        visit: &mut F,
269    ) {
270        // Depth guard: bounds recursion on a corrupt/cyclic buffer.
271        if cur.len() > MAX_WALK_DEPTH {
272            return;
273        }
274        let b = self.data.as_ref();
275        let mut p = self.blob_start + rel as usize;
276        let Some(&flags) = b.get(p) else { return };
277        p += 1;
278        if flags & 1 != 0 {
279            visit(cur, self.read_value(*ord));
280            *ord += 1;
281        }
282        if flags & 0b10 != 0 {
283            // single-transition form: [flags, label, delta], no count.
284            let Some(&label) = b.get(p) else { return };
285            p += 1;
286            let Some(delta) = rd_uvarint(b, &mut p) else { return };
287            if let Some(target) = decode_target(rel, delta) {
288                cur.push(label);
289                self.visit_subtree(target, cur, ord, visit);
290                cur.pop();
291            }
292            return;
293        }
294        let Some(ntrans) = rd_uvarint(b, &mut p) else { return };
295        for _ in 0..ntrans {
296            let Some(&label) = b.get(p) else { return };
297            p += 1;
298            let Some(delta) = rd_uvarint(b, &mut p) else { return };
299            let Some(_num) = rd_uvarint(b, &mut p) else { return };
300            if let Some(target) = decode_target(rel, delta) {
301                cur.push(label);
302                self.visit_subtree(target, cur, ord, visit);
303                cur.pop();
304            }
305        }
306    }
307}
308
309/// One state being expanded on the iterator's explicit DFS stack.
310struct IterFrame {
311    rel: u32,
312    p: usize,        // byte cursor: next transition record to read
313    remaining: u64,  // transitions left in this state
314    base_len: usize, // `cur.len()` at this state (truncate target between children)
315    single: bool,    // single-transition form (no per-edge count)
316}
317
318/// Lazy, allocation-light iterator over an [`Fsa`]'s (key, value) pairs in
319/// sorted order — see [`Fsa::iter`] / [`Fsa::range`]. Pre-order DFS via an
320/// explicit stack (depth = key length), so it composes with the `Iterator`
321/// adapters and stops early without walking the whole automaton. On a corrupt
322/// buffer it simply ends (consistent with the bounds-safe reader).
323pub struct FsaIter<'a, D> {
324    fsa: &'a Fsa<D>,
325    stack: Vec<IterFrame>,
326    cur: Vec<u8>,
327    ord: u64,
328    to_enter: Option<u32>,
329}
330
331impl<D: AsRef<[u8]>> Iterator for FsaIter<'_, D> {
332    type Item = (Vec<u8>, u64);
333
334    fn next(&mut self) -> Option<Self::Item> {
335        let b = self.fsa.data.as_ref();
336        loop {
337            if let Some(rel) = self.to_enter.take() {
338                // Enter a state: push its frame; if final, yield its key now.
339                let mut p = self.fsa.blob_start + rel as usize;
340                let flags = *b.get(p)?;
341                p += 1;
342                let single = flags & 0b10 != 0;
343                let final_ = flags & 1 != 0;
344                let remaining = if single { 1 } else { rd_uvarint(b, &mut p)? };
345                self.stack.push(IterFrame {
346                    rel,
347                    p,
348                    remaining,
349                    base_len: self.cur.len(),
350                    single,
351                });
352                if final_ {
353                    let v = self.fsa.read_value(self.ord);
354                    self.ord += 1;
355                    return Some((self.cur.clone(), v));
356                }
357                continue;
358            }
359            let frame = self.stack.last_mut()?;
360            // Drop the previous child's label before taking the next edge.
361            self.cur.truncate(frame.base_len);
362            if frame.remaining == 0 {
363                self.stack.pop();
364                continue;
365            }
366            let mut p = frame.p;
367            let label = *b.get(p)?;
368            p += 1;
369            let delta = rd_uvarint(b, &mut p)?;
370            if !frame.single {
371                rd_uvarint(b, &mut p)?; // skip the count
372            }
373            frame.p = p;
374            frame.remaining -= 1;
375            let target = decode_target(frame.rel, delta)?;
376            self.cur.push(label);
377            self.to_enter = Some(target);
378        }
379    }
380}