inputx_fsa/reader.rs
1//! Read side: parse the serialized buffer and answer `get` / `prefix` /
2//! `iter` over the embedded bytes with no decompression step. Generic over
3//! the byte container `D: AsRef<[u8]>` so the same reader works on a
4//! `Vec<u8>`, a `&[u8]`, or a future memory-mapped file.
5//!
6//! Format v3 (see `builder::serialize`): states are byte-offset addressed
7//! (no offset table). A flags byte (bit0=final, bit1=single-transition)
8//! precedes the transitions; single-transition nodes drop the arity + the
9//! count. A transition stores `label`, a LEB128 back-delta to the target,
10//! and (multi-node only) the target's LEB128 right-language count.
11
12use alloc::vec::Vec;
13
14const HEADER_LEN: usize = 18; // magic4 + ver1 + width1 + value_count4 + root_off4 + state_count4
15
16/// Error parsing an FSA buffer.
17#[derive(Debug, PartialEq, Eq)]
18pub enum FsaError {
19 BadMagic,
20 BadVersion(u8),
21 Truncated,
22}
23
24/// A read-only minimal-FSA map over a byte container.
25pub struct Fsa<D> {
26 data: D,
27 value_width: usize,
28 value_count: u32,
29 state_count: u32,
30 root_rel: u32,
31 blob_start: usize,
32 values_start: usize,
33}
34
35#[inline]
36pub(crate) fn rd_u32(b: &[u8], at: usize) -> u32 {
37 u32::from_le_bytes([b[at], b[at + 1], b[at + 2], b[at + 3]])
38}
39
40/// Read an unsigned LEB128 starting at `*p`, advancing `*p` past it.
41/// Bounds-safe: returns `None` if the varint runs off the end of `b` or
42/// is malformed (>10 bytes), so a corrupt buffer degrades instead of
43/// panicking.
44#[inline]
45pub(crate) fn rd_uvarint(b: &[u8], p: &mut usize) -> Option<u64> {
46 let mut v = 0u64;
47 let mut shift = 0u32;
48 loop {
49 let byte = *b.get(*p)?;
50 *p += 1;
51 v |= u64::from(byte & 0x7f) << shift;
52 if byte & 0x80 == 0 {
53 return Some(v);
54 }
55 shift += 7;
56 if shift >= 64 {
57 return None; // malformed: varint too long
58 }
59 }
60}
61
62/// Max key length walked during a prefix DFS. Valid IME keys are tiny; this
63/// only bounds recursion depth so a corrupt/cyclic buffer can't overflow the
64/// stack (defense for untrusted input).
65const MAX_WALK_DEPTH: usize = 4096;
66
67/// Resolve a transition's back-delta to a target rel-offset: must be `> 0`
68/// (strictly earlier — no self-loop) and not underflow. `None` on corrupt.
69#[inline]
70fn decode_target(rel: u32, delta: u64) -> Option<u32> {
71 u32::try_from(delta)
72 .ok()
73 .filter(|&d| d > 0)
74 .and_then(|d| rel.checked_sub(d))
75}
76
77impl<D: AsRef<[u8]>> Fsa<D> {
78 /// Load an FSA from a serialized byte buffer. Accepts anything that
79 /// implements `AsRef<[u8]>` (`&[u8]`, `Vec<u8>`, `mmap::Mmap`, etc.).
80 /// The buffer is parsed header-only — no allocation per entry.
81 ///
82 /// Returns [`FsaError::BadMagic`] / [`FsaError::BadVersion`] /
83 /// [`FsaError::Truncated`] on invalid input; never panics.
84 ///
85 /// ```
86 /// use inputx_fsa::{Builder, Fsa, FsaError};
87 /// let mut b = Builder::new();
88 /// b.insert(b"hello", 42);
89 /// let bytes = b.finish();
90 ///
91 /// let fsa = Fsa::new(&bytes[..]).unwrap();
92 /// assert_eq!(fsa.len(), 1);
93 ///
94 /// // Corrupt magic → graceful error, no panic.
95 /// let mut bad = bytes.clone();
96 /// bad[0] = 0;
97 /// assert!(matches!(Fsa::new(&bad[..]), Err(FsaError::BadMagic)));
98 /// ```
99 pub fn new(data: D) -> Result<Self, FsaError> {
100 let b = data.as_ref();
101 if b.len() < HEADER_LEN {
102 return Err(FsaError::Truncated);
103 }
104 if &b[0..4] != b"IXFA" {
105 return Err(FsaError::BadMagic);
106 }
107 if b[4] != 3 {
108 return Err(FsaError::BadVersion(b[4]));
109 }
110 let value_width = b[5] as usize;
111 let value_count = rd_u32(b, 6);
112 let root_rel = rd_u32(b, 10);
113 let state_count = rd_u32(b, 14);
114 let blob_start = HEADER_LEN;
115 let values_len = value_count as usize * value_width;
116 if b.len() < blob_start + values_len {
117 return Err(FsaError::Truncated);
118 }
119 let values_start = b.len() - values_len;
120 Ok(Self {
121 data,
122 value_width,
123 value_count,
124 state_count,
125 root_rel,
126 blob_start,
127 values_start,
128 })
129 }
130
131 /// Number of keys stored.
132 pub fn len(&self) -> u64 {
133 u64::from(self.value_count)
134 }
135
136 pub fn is_empty(&self) -> bool {
137 self.value_count == 0
138 }
139
140 /// Total number of states in the automaton (diagnostics).
141 pub fn state_count(&self) -> u32 {
142 self.state_count
143 }
144
145 #[inline]
146 fn read_value(&self, ord: u64) -> u64 {
147 let b = self.data.as_ref();
148 let at = self.values_start + ord as usize * self.value_width;
149 let mut v = 0u64;
150 for i in 0..self.value_width {
151 v |= u64::from(b.get(at + i).copied().unwrap_or(0)) << (8 * i);
152 }
153 v
154 }
155
156 #[inline]
157 fn is_final(&self, rel: u32) -> bool {
158 self.data
159 .as_ref()
160 .get(self.blob_start + rel as usize)
161 .is_some_and(|x| x & 1 != 0)
162 }
163
164 /// Walk one state: from state at `rel`, take `byte`. Returns the target
165 /// state's relative offset, adding to `ord` the rank contribution of
166 /// everything that sorts before that branch. `None` if no such transition
167 /// (or on any out-of-bounds read — a corrupt buffer degrades to "no
168 /// match" rather than panicking).
169 #[inline]
170 fn step(&self, rel: u32, byte: u8, ord: &mut u64) -> Option<u32> {
171 let b = self.data.as_ref();
172 let mut p = self.blob_start + rel as usize;
173 let flags = *b.get(p)?;
174 p += 1;
175 if flags & 1 != 0 {
176 *ord += 1; // final: the (shorter) word ending here sorts first
177 }
178 if flags & 0b10 != 0 {
179 // single-transition form: [flags, label, delta], no count.
180 let label = *b.get(p)?;
181 p += 1;
182 let delta = rd_uvarint(b, &mut p)?;
183 // Only one edge: take it iff it matches; its count is never needed.
184 if label == byte {
185 return rel.checked_sub(u32::try_from(delta).ok()?);
186 }
187 return None;
188 }
189 let ntrans = rd_uvarint(b, &mut p)?;
190 for _ in 0..ntrans {
191 let label = *b.get(p)?;
192 p += 1;
193 let delta = rd_uvarint(b, &mut p)?;
194 let numt = rd_uvarint(b, &mut p)?;
195 if label < byte {
196 *ord += numt;
197 } else if label == byte {
198 return rel.checked_sub(u32::try_from(delta).ok()?);
199 } else {
200 break; // label-sorted
201 }
202 }
203 None
204 }
205
206 /// Look up `key` — `Some(value)` for an exact match, `None` otherwise.
207 /// Constant cost per byte; the buffer is never copied.
208 ///
209 /// ```
210 /// use inputx_fsa::{Builder, Fsa};
211 /// let mut b = Builder::new();
212 /// b.insert(b"ni", 1);
213 /// b.insert(b"nihao", 100);
214 /// let fsa = Fsa::new(b.finish()).unwrap();
215 /// assert_eq!(fsa.get(b"ni"), Some(1));
216 /// assert_eq!(fsa.get(b"nihao"), Some(100));
217 /// assert_eq!(fsa.get(b"niha"), None); // prefix-only, not a key
218 /// assert_eq!(fsa.get(b"absent"), None);
219 /// ```
220 pub fn get(&self, key: &[u8]) -> Option<u64> {
221 if self.value_count == 0 {
222 return None;
223 }
224 let mut rel = self.root_rel;
225 let mut ord = 0u64;
226 for &byte in key {
227 rel = self.step(rel, byte, &mut ord)?;
228 }
229 if self.is_final(rel) {
230 Some(self.read_value(ord))
231 } else {
232 None
233 }
234 }
235
236 /// `true` if any stored key starts with `prefix`.
237 pub fn contains_prefix(&self, prefix: &[u8]) -> bool {
238 self.walk_to(prefix).is_some()
239 }
240
241 /// All (key, value) pairs whose key starts with `prefix`, sorted.
242 pub fn prefix(&self, prefix: &[u8]) -> Vec<(Vec<u8>, u64)> {
243 let mut out = Vec::new();
244 self.prefix_for_each(prefix, |k, v| out.push((k.to_vec(), v)));
245 out
246 }
247
248 /// Streaming variant: invoke `visit(key, value)` for every (key, value)
249 /// whose key starts with `prefix`, in sorted order, without allocating a
250 /// result vector. The `key` slice is valid only for the call. This is the
251 /// hot-path entry — a bare-letter prefix can match tens of thousands of
252 /// keys, and materializing them all would dominate cost.
253 ///
254 /// ```
255 /// use inputx_fsa::{Builder, Fsa};
256 /// let mut b = Builder::new();
257 /// for (k, v) in [(&b"apple"[..], 1u64), (b"apply", 2), (b"banana", 3)] {
258 /// b.insert(k, v);
259 /// }
260 /// let fsa = Fsa::new(b.finish()).unwrap();
261 ///
262 /// let mut count = 0;
263 /// let mut last_value = 0;
264 /// fsa.prefix_for_each(b"app", |_key, value| {
265 /// count += 1;
266 /// last_value = value;
267 /// });
268 /// assert_eq!(count, 2); // apple + apply
269 /// assert_eq!(last_value, 2); // "apply" sorts last
270 /// ```
271 pub fn prefix_for_each<F: FnMut(&[u8], u64)>(&self, prefix: &[u8], mut visit: F) {
272 if let Some((rel, ord)) = self.walk_to(prefix) {
273 let mut cur = prefix.to_vec();
274 let mut ord = ord;
275 self.visit_subtree(rel, &mut cur, &mut ord, &mut visit);
276 }
277 }
278
279 /// Lazy iterator over all (key, value) pairs in sorted order.
280 pub fn iter(&self) -> FsaIter<'_, D> {
281 self.range(b"")
282 }
283
284 /// Lazy iterator over (key, value) pairs whose key starts with `prefix`,
285 /// in sorted order. Unlike [`prefix`](Self::prefix) it allocates no result
286 /// vector and supports early termination (`.take`, `.find`, `break`).
287 pub fn range(&self, prefix: &[u8]) -> FsaIter<'_, D> {
288 let (to_enter, ord) = match self.walk_to(prefix) {
289 Some((rel, ord)) => (Some(rel), ord),
290 None => (None, 0),
291 };
292 FsaIter {
293 fsa: self,
294 stack: Vec::new(),
295 cur: prefix.to_vec(),
296 ord,
297 to_enter,
298 }
299 }
300
301 fn walk_to(&self, prefix: &[u8]) -> Option<(u32, u64)> {
302 if self.value_count == 0 {
303 return None;
304 }
305 let mut rel = self.root_rel;
306 let mut ord = 0u64;
307 for &byte in prefix {
308 rel = self.step(rel, byte, &mut ord)?;
309 }
310 Some((rel, ord))
311 }
312
313 /// DFS in label-sorted order from `rel`, invoking `visit(key, value)` for
314 /// each accepted key and advancing `ord`. Recursion depth ≤ longest key.
315 fn visit_subtree<F: FnMut(&[u8], u64)>(
316 &self,
317 rel: u32,
318 cur: &mut Vec<u8>,
319 ord: &mut u64,
320 visit: &mut F,
321 ) {
322 // Depth guard: bounds recursion on a corrupt/cyclic buffer.
323 if cur.len() > MAX_WALK_DEPTH {
324 return;
325 }
326 let b = self.data.as_ref();
327 let mut p = self.blob_start + rel as usize;
328 let Some(&flags) = b.get(p) else { return };
329 p += 1;
330 if flags & 1 != 0 {
331 visit(cur, self.read_value(*ord));
332 *ord += 1;
333 }
334 if flags & 0b10 != 0 {
335 // single-transition form: [flags, label, delta], no count.
336 let Some(&label) = b.get(p) else { return };
337 p += 1;
338 let Some(delta) = rd_uvarint(b, &mut p) else { return };
339 if let Some(target) = decode_target(rel, delta) {
340 cur.push(label);
341 self.visit_subtree(target, cur, ord, visit);
342 cur.pop();
343 }
344 return;
345 }
346 let Some(ntrans) = rd_uvarint(b, &mut p) else { return };
347 for _ in 0..ntrans {
348 let Some(&label) = b.get(p) else { return };
349 p += 1;
350 let Some(delta) = rd_uvarint(b, &mut p) else { return };
351 let Some(_num) = rd_uvarint(b, &mut p) else { return };
352 if let Some(target) = decode_target(rel, delta) {
353 cur.push(label);
354 self.visit_subtree(target, cur, ord, visit);
355 cur.pop();
356 }
357 }
358 }
359}
360
361/// One state being expanded on the iterator's explicit DFS stack.
362struct IterFrame {
363 rel: u32,
364 p: usize, // byte cursor: next transition record to read
365 remaining: u64, // transitions left in this state
366 base_len: usize, // `cur.len()` at this state (truncate target between children)
367 single: bool, // single-transition form (no per-edge count)
368}
369
370/// Lazy, allocation-light iterator over an [`Fsa`]'s (key, value) pairs in
371/// sorted order — see [`Fsa::iter`] / [`Fsa::range`]. Pre-order DFS via an
372/// explicit stack (depth = key length), so it composes with the `Iterator`
373/// adapters and stops early without walking the whole automaton. On a corrupt
374/// buffer it simply ends (consistent with the bounds-safe reader).
375pub struct FsaIter<'a, D> {
376 fsa: &'a Fsa<D>,
377 stack: Vec<IterFrame>,
378 cur: Vec<u8>,
379 ord: u64,
380 to_enter: Option<u32>,
381}
382
383impl<D: AsRef<[u8]>> Iterator for FsaIter<'_, D> {
384 type Item = (Vec<u8>, u64);
385
386 fn next(&mut self) -> Option<Self::Item> {
387 let b = self.fsa.data.as_ref();
388 loop {
389 if let Some(rel) = self.to_enter.take() {
390 // Enter a state: push its frame; if final, yield its key now.
391 let mut p = self.fsa.blob_start + rel as usize;
392 let flags = *b.get(p)?;
393 p += 1;
394 let single = flags & 0b10 != 0;
395 let final_ = flags & 1 != 0;
396 let remaining = if single { 1 } else { rd_uvarint(b, &mut p)? };
397 self.stack.push(IterFrame {
398 rel,
399 p,
400 remaining,
401 base_len: self.cur.len(),
402 single,
403 });
404 if final_ {
405 let v = self.fsa.read_value(self.ord);
406 self.ord += 1;
407 return Some((self.cur.clone(), v));
408 }
409 continue;
410 }
411 let frame = self.stack.last_mut()?;
412 // Drop the previous child's label before taking the next edge.
413 self.cur.truncate(frame.base_len);
414 if frame.remaining == 0 {
415 self.stack.pop();
416 continue;
417 }
418 let mut p = frame.p;
419 let label = *b.get(p)?;
420 p += 1;
421 let delta = rd_uvarint(b, &mut p)?;
422 if !frame.single {
423 rd_uvarint(b, &mut p)?; // skip the count
424 }
425 frame.p = p;
426 frame.remaining -= 1;
427 let target = decode_target(frame.rel, delta)?;
428 self.cur.push(label);
429 self.to_enter = Some(target);
430 }
431 }
432}