fst_regex/lib.rs
1
2use regex_syntax;
3
4
5use std::fmt;
6
7use fst::Automaton;
8
9pub use crate::error::Error;
10
11mod compile;
12mod dfa;
13mod error;
14mod sparse;
15
16/// A regular expression for searching FSTs with Unicode support.
17///
18/// Regular expressions are compiled down to a deterministic finite automaton
19/// that can efficiently search any finite state transducer. Notably, most
20/// regular expressions only need to explore a small portion of a finite state
21/// transducer without loading all of it into memory.
22///
23/// # Syntax
24///
25/// `Regex` supports fully featured regular expressions. Namely, it supports
26/// all of the same constructs as the standard `regex` crate except for the
27/// following things:
28///
29/// 1. Lazy quantifiers, since a regular expression automaton only reports
30/// whether a key matches at all, and not its location. Namely, lazy
31/// quantifiers such as `+?` only modify the location of a match, but never
32/// change a non-match into a match or a match into a non-match.
33/// 2. Word boundaries (i.e., `\b`). Because such things are hard to do in
34/// a deterministic finite automaton, but not impossible. As such, these
35/// may be allowed some day.
36/// 3. Other zero width assertions like `^` and `$`. These are easier to
37/// support than word boundaries, but are still tricky and usually aren't
38/// as useful when searching dictionaries.
39///
40/// Otherwise, the [full syntax of the `regex`
41/// crate](http://doc.rust-lang.org/regex/regex/index.html#syntax)
42/// is supported. This includes all Unicode support and relevant flags.
43/// (The `U` and `m` flags are no-ops because of (1) and (3) above,
44/// respectively.)
45///
46/// # Matching semantics
47///
48/// A regular expression matches a key in a finite state transducer if and only
49/// if it matches from the start of a key all the way to end. Stated
50/// differently, every regular expression `(re)` is matched as if it were
51/// `^(re)$`. This means that if you want to do a substring match, then you
52/// must use `.*substring.*`.
53///
54/// **Caution**: Starting a regular expression with `.*` means that it could
55/// potentially match *any* key in a finite state transducer. This implies that
56/// all keys could be visited, which could be slow. It is possible that this
57/// crate will grow facilities for detecting regular expressions that will
58/// scan a large portion of a transducer and optionally disallow them.
59///
60/// # Example
61///
62/// This example shows how to run a regular expression on a `Set`.
63///
64/// ```rust
65/// extern crate fst;
66/// extern crate fst_regex;
67///
68/// use fst::{IntoStreamer, Streamer, Set};
69/// use fst_regex::Regex;
70///
71/// fn main() {
72/// let set = Set::from_iter(&["foo", "foo1", "foo2", "foo3", "foobar"])
73/// .unwrap();
74///
75/// let re = Regex::new("f[a-z]+3?").unwrap();
76/// let mut stream = set.search(&re).into_stream();
77///
78/// let mut keys = vec![];
79/// while let Some(key) = stream.next() {
80/// keys.push(key.to_vec());
81/// }
82/// assert_eq!(keys, vec![
83/// "foo".as_bytes(), "foo3".as_bytes(), "foobar".as_bytes(),
84/// ]);
85/// }
86/// ```
87///
88/// # Warning: experimental
89///
90/// While executing a regular expression against a finite state transducer will
91/// be very fast, *construction* of a regular expression automaton may not be.
92/// Namely, this implementation is a proof of concept. In particular, one of
93/// its major deficiencies is that it can use enormous amounts of memory.
94/// Note though, that the construction phase will return an error if the
95/// underlying automata grows too big (tens of MB).
96///
97/// This is important functionality, so one should count on this implementation
98/// being vastly improved in the future.
99pub struct Regex {
100 original: String,
101 dfa: dfa::Dfa,
102}
103
104#[derive(Eq, PartialEq)]
105pub enum Inst {
106 Match,
107 Jump(usize),
108 Split(usize, usize),
109 Range(u8, u8),
110}
111
112impl Regex {
113 /// Create a new regular expression query.
114 ///
115 /// The query finds all terms matching the regular expression.
116 ///
117 /// If the regular expression is malformed or if it results in an automaton
118 /// that is too big, then an error is returned.
119 ///
120 /// A `Regex` value satisfies the `Automaton` trait, which means it can be
121 /// used with the `search` method of any finite state transducer.
122 #[inline]
123 pub fn new(re: &str) -> Result<Regex, Error> {
124 Regex::with_size_limit(10 * (1 << 20), re)
125 }
126
127 fn with_size_limit(size: usize, re: &str) -> Result<Regex, Error> {
128 let expr = regex_syntax::Expr::parse(re)?;
129 let insts = compile::Compiler::new(size).compile(&expr)?;
130 let dfa = dfa::DfaBuilder::new(insts).build()?;
131 Ok(Regex { original: re.to_owned(), dfa: dfa })
132 }
133}
134
135impl Automaton for Regex {
136 type State = Option<usize>;
137
138 #[inline]
139 fn start(&self) -> Option<usize> {
140 Some(0)
141 }
142
143 #[inline]
144 fn is_match(&self, state: &Option<usize>) -> bool {
145 state.map(|state| self.dfa.is_match(state)).unwrap_or(false)
146 }
147
148 #[inline]
149 fn can_match(&self, state: &Option<usize>) -> bool {
150 state.is_some()
151 }
152
153 #[inline]
154 fn accept(&self, state: &Option<usize>, byte: u8) -> Option<usize> {
155 state.and_then(|state| self.dfa.accept(state, byte))
156 }
157}
158
159impl fmt::Debug for Regex {
160 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
161 writeln!(f, "Regex({:?})", self.original)?;
162 self.dfa.fmt(f)
163 }
164}
165
166impl fmt::Debug for Inst {
167 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
168 use self::Inst::*;
169 match *self {
170 Match => write!(f, "Match"),
171 Jump(ip) => write!(f, "JUMP {}", ip),
172 Split(ip1, ip2) => write!(f, "SPLIT {}, {}", ip1, ip2),
173 Range(s, e) => write!(f, "RANGE {:X}-{:X}", s, e),
174 }
175 }
176}