fst_no_std/automaton/
mod.rs

1#[cfg(feature = "levenshtein")]
2pub use self::levenshtein::{Levenshtein, LevenshteinError};
3
4#[cfg(feature = "levenshtein")]
5mod levenshtein;
6
7/// Automaton describes types that behave as a finite automaton.
8///
9/// All implementors of this trait are represented by *byte based* automata.
10/// Stated differently, all transitions in the automata correspond to a single
11/// byte in the input.
12///
13/// This implementation choice is important for a couple reasons:
14///
15/// 1. The set of possible transitions in each node is small, which may make
16///    efficient memory usage easier.
17/// 2. The finite state transducers in this crate are all byte based, so any
18///    automata used on them must also be byte based.
19///
20/// In practice, this does present somewhat of a problem, for example, if
21/// you're storing UTF-8 encoded strings in a finite state transducer. Consider
22/// using a `Levenshtein` automaton, which accepts a query string and an edit
23/// distance. The edit distance should apply to some notion of *character*,
24/// which could be represented by at least 1-4 bytes in a UTF-8 encoding (for
25/// some definition of "character"). Therefore, the automaton must have UTF-8
26/// decoding built into it. This can be tricky to implement, so you may find
27/// the [`utf8-ranges`](https://crates.io/crates/utf8-ranges) crate useful.
28pub trait Automaton {
29    /// The type of the state used in the automaton.
30    type State;
31
32    /// Returns a single start state for this automaton.
33    ///
34    /// This method should always return the same value for each
35    /// implementation.
36    fn start(&self) -> Self::State;
37
38    /// Returns true if and only if `state` is a match state.
39    fn is_match(&self, state: &Self::State) -> bool;
40
41    /// Returns true if and only if `state` can lead to a match in zero or more
42    /// steps.
43    ///
44    /// If this returns `false`, then no sequence of inputs from this state
45    /// should ever produce a match. If this does not follow, then those match
46    /// states may never be reached. In other words, behavior may be incorrect.
47    ///
48    /// If this returns `true` even when no match is possible, then behavior
49    /// will be correct, but callers may be forced to do additional work.
50    fn can_match(&self, _state: &Self::State) -> bool {
51        true
52    }
53
54    /// Returns true if and only if `state` matches and must match no matter
55    /// what steps are taken.
56    ///
57    /// If this returns `true`, then every sequence of inputs from this state
58    /// produces a match. If this does not follow, then those match states may
59    /// never be reached. In other words, behavior may be incorrect.
60    ///
61    /// If this returns `false` even when every sequence of inputs will lead to
62    /// a match, then behavior will be correct, but callers may be forced to do
63    /// additional work.
64    fn will_always_match(&self, _state: &Self::State) -> bool {
65        false
66    }
67
68    /// Return the next state given `state` and an input.
69    fn accept(&self, state: &Self::State, byte: u8) -> Self::State;
70
71    /// If applicable, return the next state when the end of a key is seen.
72    fn accept_eof(&self, _: &Self::State) -> Option<Self::State> {
73        None
74    }
75
76    /// Returns an automaton that matches the strings that start with something
77    /// this automaton matches.
78    fn starts_with(self) -> StartsWith<Self>
79    where
80        Self: Sized,
81    {
82        StartsWith(self)
83    }
84
85    /// Returns an automaton that matches the strings matched by either this or
86    /// the other automaton.
87    fn union<Rhs: Automaton>(self, rhs: Rhs) -> Union<Self, Rhs>
88    where
89        Self: Sized,
90    {
91        Union(self, rhs)
92    }
93
94    /// Returns an automaton that matches the strings matched by both this and
95    /// the other automaton.
96    fn intersection<Rhs: Automaton>(self, rhs: Rhs) -> Intersection<Self, Rhs>
97    where
98        Self: Sized,
99    {
100        Intersection(self, rhs)
101    }
102
103    /// Returns an automaton that matches the strings not matched by this
104    /// automaton.
105    fn complement(self) -> Complement<Self>
106    where
107        Self: Sized,
108    {
109        Complement(self)
110    }
111}
112
113impl<'a, T: Automaton> Automaton for &'a T {
114    type State = T::State;
115
116    fn start(&self) -> T::State {
117        (*self).start()
118    }
119
120    fn is_match(&self, state: &T::State) -> bool {
121        (*self).is_match(state)
122    }
123
124    fn can_match(&self, state: &T::State) -> bool {
125        (*self).can_match(state)
126    }
127
128    fn will_always_match(&self, state: &T::State) -> bool {
129        (*self).will_always_match(state)
130    }
131
132    fn accept(&self, state: &T::State, byte: u8) -> T::State {
133        (*self).accept(state, byte)
134    }
135
136    fn accept_eof(&self, state: &Self::State) -> Option<Self::State> {
137        (*self).accept_eof(state)
138    }
139}
140
141/// An automaton that matches if the input equals to a specific string.
142///
143/// It can be used in combination with [`StartsWith`] to search strings
144/// starting with a given prefix.
145///
146/// ```rust
147/// extern crate fst_no_std;
148///
149/// use fst_no_std::{Automaton, IntoStreamer, Streamer, Set};
150/// use fst_no_std::automaton::Str;
151///
152/// # fn main() { example().unwrap(); }
153/// fn example() -> Result<(), Box<dyn std::error::Error>> {
154///     let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"];
155///     let set = Set::from_iter(paths)?;
156///
157///     // Build our prefix query.
158///     let prefix = Str::new("/home").starts_with();
159///
160///     // Apply our query to the set we built.
161///     let mut stream = set.search(prefix).into_stream();
162///
163///     let matches = stream.into_strs()?;
164///     assert_eq!(matches, vec!["/home/projects/bar", "/home/projects/foo"]);
165///     Ok(())
166/// }
167/// ```
168#[derive(Clone, Debug)]
169pub struct Str<'a> {
170    string: &'a [u8],
171}
172
173impl<'a> Str<'a> {
174    /// Constructs automaton that matches an exact string.
175    #[inline]
176    #[must_use]
177    pub fn new(string: &'a str) -> Str<'a> {
178        Str { string: string.as_bytes() }
179    }
180}
181
182impl<'a> Automaton for Str<'a> {
183    type State = Option<usize>;
184
185    #[inline]
186    fn start(&self) -> Option<usize> {
187        Some(0)
188    }
189
190    #[inline]
191    fn is_match(&self, pos: &Option<usize>) -> bool {
192        *pos == Some(self.string.len())
193    }
194
195    #[inline]
196    fn can_match(&self, pos: &Option<usize>) -> bool {
197        pos.is_some()
198    }
199
200    #[inline]
201    fn accept(&self, pos: &Option<usize>, byte: u8) -> Option<usize> {
202        // if we aren't already past the end...
203        if let Some(pos) = *pos {
204            // and there is still a matching byte at the current position...
205            if self.string.get(pos).copied() == Some(byte) {
206                // then move forward
207                return Some(pos + 1);
208            }
209        }
210        // otherwise we're either past the end or didn't match the byte
211        None
212    }
213}
214
215/// An automaton that matches if the input contains a specific subsequence.
216///
217/// It can be used to build a simple fuzzy-finder.
218///
219/// ```rust
220/// extern crate fst_no_std;
221///
222/// use fst_no_std::{IntoStreamer, Streamer, Set};
223/// use fst_no_std::automaton::Subsequence;
224///
225/// # fn main() { example().unwrap(); }
226/// fn example() -> Result<(), Box<dyn std::error::Error>> {
227///     let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"];
228///     let set = Set::from_iter(paths)?;
229///
230///     // Build our fuzzy query.
231///     let subseq = Subsequence::new("hpf");
232///
233///     // Apply our fuzzy query to the set we built.
234///     let mut stream = set.search(subseq).into_stream();
235///
236///     let matches = stream.into_strs()?;
237///     assert_eq!(matches, vec!["/home/projects/foo"]);
238///     Ok(())
239/// }
240/// ```
241#[derive(Clone, Debug)]
242pub struct Subsequence<'a> {
243    subseq: &'a [u8],
244}
245
246impl<'a> Subsequence<'a> {
247    /// Constructs automaton that matches input containing the
248    /// specified subsequence.
249    #[inline]
250    #[must_use]
251    pub fn new(subsequence: &'a str) -> Subsequence<'a> {
252        Subsequence { subseq: subsequence.as_bytes() }
253    }
254}
255
256impl<'a> Automaton for Subsequence<'a> {
257    type State = usize;
258
259    #[inline]
260    fn start(&self) -> usize {
261        0
262    }
263
264    #[inline]
265    fn is_match(&self, &state: &usize) -> bool {
266        state == self.subseq.len()
267    }
268
269    #[inline]
270    fn can_match(&self, _: &usize) -> bool {
271        true
272    }
273
274    #[inline]
275    fn will_always_match(&self, &state: &usize) -> bool {
276        state == self.subseq.len()
277    }
278
279    #[inline]
280    fn accept(&self, &state: &usize, byte: u8) -> usize {
281        if state == self.subseq.len() {
282            return state;
283        }
284        state + usize::from(byte == self.subseq[state])
285    }
286}
287
288/// An automaton that always matches.
289///
290/// This is useful in a generic context as a way to express that no automaton
291/// should be used.
292#[derive(Clone, Debug)]
293pub struct AlwaysMatch;
294
295impl Automaton for AlwaysMatch {
296    type State = ();
297
298    #[inline]
299    fn start(&self) {}
300    #[inline]
301    fn is_match(&self, (): &()) -> bool {
302        true
303    }
304    #[inline]
305    fn can_match(&self, (): &()) -> bool {
306        true
307    }
308    #[inline]
309    fn will_always_match(&self, (): &()) -> bool {
310        true
311    }
312    #[inline]
313    fn accept(&self, (): &(), _: u8) {}
314}
315
316/// An automaton that matches a string that begins with something that the
317/// wrapped automaton matches.
318#[derive(Clone, Debug)]
319pub struct StartsWith<A>(A);
320
321/// The `Automaton` state for `StartsWith<A>`.
322pub struct StartsWithState<A: Automaton>(StartsWithStateKind<A>);
323
324enum StartsWithStateKind<A: Automaton> {
325    Done,
326    Running(A::State),
327}
328
329impl<A: Automaton> Automaton for StartsWith<A> {
330    type State = StartsWithState<A>;
331
332    fn start(&self) -> StartsWithState<A> {
333        StartsWithState({
334            let inner = self.0.start();
335            if self.0.is_match(&inner) {
336                StartsWithStateKind::Done
337            } else {
338                StartsWithStateKind::Running(inner)
339            }
340        })
341    }
342
343    fn is_match(&self, state: &StartsWithState<A>) -> bool {
344        match state.0 {
345            StartsWithStateKind::Done => true,
346            StartsWithStateKind::Running(_) => false,
347        }
348    }
349
350    fn can_match(&self, state: &StartsWithState<A>) -> bool {
351        match state.0 {
352            StartsWithStateKind::Done => true,
353            StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
354        }
355    }
356
357    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
358        match state.0 {
359            StartsWithStateKind::Done => true,
360            StartsWithStateKind::Running(_) => false,
361        }
362    }
363
364    fn accept(
365        &self,
366        state: &StartsWithState<A>,
367        byte: u8,
368    ) -> StartsWithState<A> {
369        StartsWithState(match state.0 {
370            StartsWithStateKind::Done => StartsWithStateKind::Done,
371            StartsWithStateKind::Running(ref inner) => {
372                let next_inner = self.0.accept(inner, byte);
373                if self.0.is_match(&next_inner) {
374                    StartsWithStateKind::Done
375                } else {
376                    StartsWithStateKind::Running(next_inner)
377                }
378            }
379        })
380    }
381}
382
383/// An automaton that matches when one of its component automata match.
384#[derive(Clone, Debug)]
385pub struct Union<A, B>(A, B);
386
387/// The `Automaton` state for `Union<A, B>`.
388pub struct UnionState<A: Automaton, B: Automaton>(A::State, B::State);
389
390impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
391    type State = UnionState<A, B>;
392
393    fn start(&self) -> UnionState<A, B> {
394        UnionState(self.0.start(), self.1.start())
395    }
396
397    fn is_match(&self, state: &UnionState<A, B>) -> bool {
398        self.0.is_match(&state.0) || self.1.is_match(&state.1)
399    }
400
401    fn can_match(&self, state: &UnionState<A, B>) -> bool {
402        self.0.can_match(&state.0) || self.1.can_match(&state.1)
403    }
404
405    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
406        self.0.will_always_match(&state.0)
407            || self.1.will_always_match(&state.1)
408    }
409
410    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
411        UnionState(
412            self.0.accept(&state.0, byte),
413            self.1.accept(&state.1, byte),
414        )
415    }
416}
417
418/// An automaton that matches when both of its component automata match.
419#[derive(Clone, Debug)]
420pub struct Intersection<A, B>(A, B);
421
422/// The `Automaton` state for `Intersection<A, B>`.
423pub struct IntersectionState<A: Automaton, B: Automaton>(A::State, B::State);
424
425impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
426    type State = IntersectionState<A, B>;
427
428    fn start(&self) -> IntersectionState<A, B> {
429        IntersectionState(self.0.start(), self.1.start())
430    }
431
432    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
433        self.0.is_match(&state.0) && self.1.is_match(&state.1)
434    }
435
436    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
437        self.0.can_match(&state.0) && self.1.can_match(&state.1)
438    }
439
440    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
441        self.0.will_always_match(&state.0)
442            && self.1.will_always_match(&state.1)
443    }
444
445    fn accept(
446        &self,
447        state: &IntersectionState<A, B>,
448        byte: u8,
449    ) -> IntersectionState<A, B> {
450        IntersectionState(
451            self.0.accept(&state.0, byte),
452            self.1.accept(&state.1, byte),
453        )
454    }
455}
456
457/// An automaton that matches exactly when the automaton it wraps does not.
458#[derive(Clone, Debug)]
459pub struct Complement<A>(A);
460
461/// The `Automaton` state for `Complement<A>`.
462pub struct ComplementState<A: Automaton>(A::State);
463
464impl<A: Automaton> Automaton for Complement<A> {
465    type State = ComplementState<A>;
466
467    fn start(&self) -> ComplementState<A> {
468        ComplementState(self.0.start())
469    }
470
471    fn is_match(&self, state: &ComplementState<A>) -> bool {
472        !self.0.is_match(&state.0)
473    }
474
475    fn can_match(&self, state: &ComplementState<A>) -> bool {
476        !self.0.will_always_match(&state.0)
477    }
478
479    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
480        !self.0.can_match(&state.0)
481    }
482
483    fn accept(
484        &self,
485        state: &ComplementState<A>,
486        byte: u8,
487    ) -> ComplementState<A> {
488        ComplementState(self.0.accept(&state.0, byte))
489    }
490}