Skip to main content

lindera_fst/automaton/
mod.rs

1extern crate levenshtein_automata;
2
3use self::levenshtein_automata::Distance;
4use self::StartsWithStateInternal::*;
5
6/// Automaton describes types that behave as a finite automaton.
7///
8/// All implementors of this trait are represented by *byte based* automata.
9/// Stated differently, all transitions in the automata correspond to a single
10/// byte in the input.
11///
12/// This implementation choice is important for a couple reasons:
13///
14/// 1. The set of possible transitions in each node is small, which may make
15///    efficient memory usage easier.
16/// 2. The finite state transducers in this crate are all byte based, so any
17///    automata used on them must also be byte based.
18///
19/// In practice, this does present somewhat of a problem, for example, if
20/// you're storing UTF-8 encoded strings in a finite state transducer. Consider
21/// using a `Levenshtein` automaton, which accepts a query string and an edit
22/// distance. The edit distance should apply to some notion of *character*,
23/// which could be represented by at least 1-4 bytes in a UTF-8 encoding (for
24/// some definition of "character"). Therefore, the automaton must have UTF-8
25/// decoding built into it. This can be tricky to implement, so you may find
26/// the [`utf8-ranges`](https://crates.io/crates/utf8-ranges) crate useful.
27pub trait Automaton {
28    /// The type of the state used in the automaton.
29    type State;
30
31    /// Returns a single start state for this automaton.
32    ///
33    /// This method should always return the same value for each
34    /// implementation.
35    fn start(&self) -> Self::State;
36
37    /// Returns true if and only if `state` is a match state.
38    fn is_match(&self, state: &Self::State) -> bool;
39
40    /// Returns true if and only if `state` can lead to a match in zero or more
41    /// steps.
42    ///
43    /// If this returns `false`, then no sequence of inputs from this state
44    /// should ever produce a match. If this does not follow, then those match
45    /// states may never be reached. In other words, behavior may be incorrect.
46    ///
47    /// If this returns `true` even when no match is possible, then behavior
48    /// will be correct, but callers may be forced to do additional work.
49    fn can_match(&self, _state: &Self::State) -> bool {
50        true
51    }
52
53    /// Returns true if and only if `state` matches and must match no matter
54    /// what steps are taken.
55    ///
56    /// If this returns `true`, then every sequence of inputs from this state
57    /// produces a match. If this does not follow, then those match states may
58    /// never be reached. In other words, behavior may be incorrect.
59    ///
60    /// If this returns `false` even when every sequence of inputs will lead to
61    /// a match, then behavior will be correct, but callers may be forced to do
62    /// additional work.
63    fn will_always_match(&self, _state: &Self::State) -> bool {
64        false
65    }
66
67    /// Return the next state given `state` and an input.
68    fn accept(&self, state: &Self::State, byte: u8) -> Self::State;
69
70    /// Returns an automaton that matches the strings that start with something
71    /// this automaton matches.
72    fn starts_with(self) -> StartsWith<Self>
73    where
74        Self: Sized,
75    {
76        StartsWith(self)
77    }
78
79    /// Returns an automaton that matches the strings matched by either this or
80    /// the other automaton.
81    fn union<Rhs: Automaton>(self, rhs: Rhs) -> Union<Self, Rhs>
82    where
83        Self: Sized,
84    {
85        Union(self, rhs)
86    }
87
88    /// Returns an automaton that matches the strings matched by both this and
89    /// the other automaton.
90    fn intersection<Rhs: Automaton>(self, rhs: Rhs) -> Intersection<Self, Rhs>
91    where
92        Self: Sized,
93    {
94        Intersection(self, rhs)
95    }
96
97    /// Returns an automaton that matches the strings not matched by this
98    /// automaton.
99    fn complement(self) -> Complement<Self>
100    where
101        Self: Sized,
102    {
103        Complement(self)
104    }
105}
106
107impl<'a, T: Automaton> Automaton for &'a T {
108    type State = T::State;
109
110    fn start(&self) -> Self::State {
111        (*self).start()
112    }
113
114    fn is_match(&self, state: &Self::State) -> bool {
115        (*self).is_match(state)
116    }
117
118    fn can_match(&self, state: &Self::State) -> bool {
119        (*self).can_match(state)
120    }
121
122    fn will_always_match(&self, state: &Self::State) -> bool {
123        (*self).will_always_match(state)
124    }
125
126    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
127        (*self).accept(state, byte)
128    }
129}
130
131impl Automaton for levenshtein_automata::DFA {
132    type State = u32;
133
134    fn start(&self) -> Self::State {
135        self.initial_state()
136    }
137
138    fn is_match(&self, state: &Self::State) -> bool {
139        match self.distance(*state) {
140            Distance::Exact(_) => true,
141            Distance::AtLeast(_) => false,
142        }
143    }
144
145    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
146        self.transition(*state, byte)
147    }
148}
149
150/// An automaton that matches if the input contains a specific subsequence.
151#[derive(Clone, Debug)]
152pub struct Subsequence<'a> {
153    subseq: &'a [u8],
154}
155
156impl<'a> Subsequence<'a> {
157    /// Constructs automaton that matches input containing the
158    /// specified subsequence.
159    #[inline]
160    pub fn new(subsequence: &'a str) -> Subsequence<'a> {
161        Subsequence {
162            subseq: subsequence.as_bytes(),
163        }
164    }
165}
166
167impl<'a> Automaton for Subsequence<'a> {
168    type State = usize;
169
170    #[inline]
171    fn start(&self) -> usize {
172        0
173    }
174
175    #[inline]
176    fn is_match(&self, &state: &usize) -> bool {
177        state == self.subseq.len()
178    }
179
180    #[inline]
181    fn can_match(&self, _: &usize) -> bool {
182        true
183    }
184
185    #[inline]
186    fn will_always_match(&self, &state: &usize) -> bool {
187        state == self.subseq.len()
188    }
189
190    #[inline]
191    fn accept(&self, &state: &usize, byte: u8) -> usize {
192        if state == self.subseq.len() {
193            return state;
194        }
195        state + (byte == self.subseq[state]) as usize
196    }
197}
198
199/// An automaton that always matches.
200///
201/// This is useful in a generic context as a way to express that no automaton
202/// should be used.
203#[derive(Clone, Debug)]
204pub struct AlwaysMatch;
205
206impl Automaton for AlwaysMatch {
207    type State = ();
208
209    #[inline]
210    fn start(&self) {}
211    #[inline]
212    fn is_match(&self, _: &()) -> bool {
213        true
214    }
215    #[inline]
216    fn can_match(&self, _: &()) -> bool {
217        true
218    }
219    #[inline]
220    fn will_always_match(&self, _: &()) -> bool {
221        true
222    }
223    #[inline]
224    fn accept(&self, _: &(), _: u8) {}
225}
226
227/// An automaton that matches a string that begins with something that the
228/// wrapped automaton matches.
229#[derive(Clone, Debug)]
230pub struct StartsWith<A>(A);
231
232/// The `Automaton` state for `StartsWith<A>`.
233pub struct StartsWithState<A: Automaton>(StartsWithStateInternal<A>);
234
235enum StartsWithStateInternal<A: Automaton> {
236    Done,
237    Running(A::State),
238}
239
240impl<A: Automaton> Automaton for StartsWith<A> {
241    type State = StartsWithState<A>;
242
243    fn start(&self) -> StartsWithState<A> {
244        StartsWithState({
245            let inner = self.0.start();
246            if self.0.is_match(&inner) {
247                Done
248            } else {
249                Running(inner)
250            }
251        })
252    }
253
254    fn is_match(&self, state: &StartsWithState<A>) -> bool {
255        match state.0 {
256            Done => true,
257            Running(_) => false,
258        }
259    }
260
261    fn can_match(&self, state: &StartsWithState<A>) -> bool {
262        match state.0 {
263            Done => true,
264            Running(ref inner) => self.0.can_match(inner),
265        }
266    }
267
268    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
269        match state.0 {
270            Done => true,
271            Running(_) => false,
272        }
273    }
274
275    fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> {
276        StartsWithState(match state.0 {
277            Done => Done,
278            Running(ref inner) => {
279                let next_inner = self.0.accept(inner, byte);
280                if self.0.is_match(&next_inner) {
281                    Done
282                } else {
283                    Running(next_inner)
284                }
285            }
286        })
287    }
288}
289
290/// An automaton that matches when one of its component automata match.
291#[derive(Clone, Debug)]
292pub struct Union<A, B>(A, B);
293
294/// The `Automaton` state for `Union<A, B>`.
295pub struct UnionState<A: Automaton, B: Automaton>(A::State, B::State);
296
297impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
298    type State = UnionState<A, B>;
299
300    fn start(&self) -> UnionState<A, B> {
301        UnionState(self.0.start(), self.1.start())
302    }
303
304    fn is_match(&self, state: &UnionState<A, B>) -> bool {
305        self.0.is_match(&state.0) || self.1.is_match(&state.1)
306    }
307
308    fn can_match(&self, state: &UnionState<A, B>) -> bool {
309        self.0.can_match(&state.0) || self.1.can_match(&state.1)
310    }
311
312    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
313        self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1)
314    }
315
316    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
317        UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
318    }
319}
320
321/// An automaton that matches when both of its component automata match.
322#[derive(Clone, Debug)]
323pub struct Intersection<A, B>(A, B);
324
325/// The `Automaton` state for `Intersection<A, B>`.
326pub struct IntersectionState<A: Automaton, B: Automaton>(A::State, B::State);
327
328impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
329    type State = IntersectionState<A, B>;
330
331    fn start(&self) -> IntersectionState<A, B> {
332        IntersectionState(self.0.start(), self.1.start())
333    }
334
335    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
336        self.0.is_match(&state.0) && self.1.is_match(&state.1)
337    }
338
339    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
340        self.0.can_match(&state.0) && self.1.can_match(&state.1)
341    }
342
343    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
344        self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1)
345    }
346
347    fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> {
348        IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
349    }
350}
351
352/// An automaton that matches exactly when the automaton it wraps does not.
353#[derive(Clone, Debug)]
354pub struct Complement<A>(A);
355
356/// The `Automaton` state for `Complement<A>`.
357pub struct ComplementState<A: Automaton>(A::State);
358
359impl<A: Automaton> Automaton for Complement<A> {
360    type State = ComplementState<A>;
361
362    fn start(&self) -> ComplementState<A> {
363        ComplementState(self.0.start())
364    }
365
366    fn is_match(&self, state: &ComplementState<A>) -> bool {
367        !self.0.is_match(&state.0)
368    }
369
370    fn can_match(&self, state: &ComplementState<A>) -> bool {
371        !self.0.will_always_match(&state.0)
372    }
373
374    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
375        !self.0.can_match(&state.0)
376    }
377
378    fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> {
379        ComplementState(self.0.accept(&state.0, byte))
380    }
381}