regex_filtered/
lib.rs

1#![doc = include_str!("../README.md")]
2#![deny(unsafe_code)]
3#![warn(missing_docs)]
4
5use aho_corasick::AhoCorasick;
6
7mod mapper;
8mod model;
9pub use model::Error as ModelError;
10
11/// Builder for the regexes set
12pub struct Builder {
13    regexes: Vec<regex::Regex>,
14    mapper_builder: mapper::Builder,
15}
16
17/// Parser configuration, can be used to tune the regex parsing when
18/// adding it to the [`Builder`]. Every option defaults to `false`
19/// whether through [`Default`] or [`Options::new`].
20///
21/// The parser can also be configured via standard [`regex`] inline
22/// flags.
23#[derive(Default)]
24pub struct Options {
25    case_insensitive: bool,
26    dot_matches_new_line: bool,
27    ignore_whitespace: bool,
28    multi_line: bool,
29    crlf: bool,
30}
31
32impl Options {
33    /// Create a new options object.
34    pub fn new() -> Self {
35        Self::default()
36    }
37    /// Configures case-insensitive matching for the entire pattern.
38    pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
39        self.case_insensitive = yes;
40        self
41    }
42    /// Configures `.` to match newline characters, by default `.`
43    /// matches everything *except* newline characters.
44    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
45        self.dot_matches_new_line = yes;
46        self
47    }
48    /// Configures ignoring whitespace inside patterns, as well as `#`
49    /// line comments ("verbose" mode).
50    ///
51    /// Verbose mode is useful to break up complex regexes and improve
52    /// their documentation.
53    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
54        self.ignore_whitespace = yes;
55        self
56    }
57    /// Configures multi-line mode. When enabled, `^` matches at every
58    /// start of line and `$` at every end of line, by default they
59    /// match only the start and end of the string respectively.ca
60    pub fn multi_line(&mut self, yes: bool) -> &mut Self {
61        self.multi_line = yes;
62        self
63    }
64    /// Allows `\r` as a line terminator, by default only `\n` is a
65    /// line terminator (relevant for [`Self::ignore_whitespace`] and
66    /// [`Self::multi_line`]).
67    pub fn crlf(&mut self, yes: bool) -> &mut Self {
68        self.crlf = yes;
69        self
70    }
71    fn to_regex(&self, pattern: &str) -> Result<regex::Regex, regex::Error> {
72        regex::RegexBuilder::new(pattern)
73            .case_insensitive(self.case_insensitive)
74            .dot_matches_new_line(self.dot_matches_new_line)
75            .ignore_whitespace(self.ignore_whitespace)
76            .multi_line(self.multi_line)
77            .crlf(self.crlf)
78            .build()
79    }
80}
81impl From<Options> for regex_syntax::Parser {
82    fn from(opt: Options) -> Self {
83        Self::from(&opt)
84    }
85}
86impl From<&Options> for regex_syntax::Parser {
87    fn from(
88        Options {
89            case_insensitive,
90            dot_matches_new_line,
91            ignore_whitespace,
92            multi_line,
93            crlf,
94        }: &Options,
95    ) -> Self {
96        regex_syntax::ParserBuilder::new()
97            .case_insensitive(*case_insensitive)
98            .dot_matches_new_line(*dot_matches_new_line)
99            .ignore_whitespace(*ignore_whitespace)
100            .multi_line(*multi_line)
101            .crlf(*crlf)
102            .build()
103    }
104}
105
106/// Parsing error when adding a new regex to the [`Builder`].
107#[derive(Debug)]
108pub enum ParseError {
109    /// An error occurred while parsing the regex or translating it to
110    /// HIR.
111    SyntaxError(String),
112    /// An error occurred while processing the regex for atom
113    /// extraction.
114    ProcessingError(ModelError),
115    /// The regex was too large to compile to the NFA (within the
116    /// default limits).
117    RegexTooLarge(usize),
118}
119impl std::error::Error for ParseError {
120    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
121        match self {
122            ParseError::ProcessingError(e) => Some(e),
123            ParseError::SyntaxError(_) => None,
124            ParseError::RegexTooLarge(_) => None,
125        }
126    }
127}
128impl std::fmt::Display for ParseError {
129    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
130        write!(f, "{self:?}")
131    }
132}
133impl From<regex_syntax::Error> for ParseError {
134    fn from(value: regex_syntax::Error) -> Self {
135        Self::SyntaxError(value.to_string())
136    }
137}
138impl From<regex::Error> for ParseError {
139    fn from(value: regex::Error) -> Self {
140        match value {
141            regex::Error::CompiledTooBig(v) => Self::RegexTooLarge(v),
142            e => Self::SyntaxError(e.to_string()),
143        }
144    }
145}
146impl From<ModelError> for ParseError {
147    fn from(value: ModelError) -> Self {
148        Self::ProcessingError(value)
149    }
150}
151
152/// Error while compiling the builder to a prefiltered set.
153#[derive(Debug)]
154pub enum BuildError {
155    /// Error while building the prefilter.
156    PrefilterError(aho_corasick::BuildError),
157}
158impl std::error::Error for BuildError {
159    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
160        match self {
161            BuildError::PrefilterError(p) => Some(p),
162        }
163    }
164}
165impl std::fmt::Display for BuildError {
166    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
167        write!(f, "{self:?}")
168    }
169}
170impl From<aho_corasick::BuildError> for BuildError {
171    fn from(value: aho_corasick::BuildError) -> Self {
172        Self::PrefilterError(value)
173    }
174}
175
176impl Builder {
177    /// Instantiate a builder with the default metadata configuration:
178    ///
179    /// - minimum atom length 3
180    #[must_use]
181    pub fn new() -> Self {
182        Self::new_atom_len(3)
183    }
184
185    /// Instantiate a builder with a custom minimum atom length.
186    /// Increasing the atom length decreases the size and cost of the
187    /// prefilter, but may make more regexes impossible to prefilter,
188    /// which can increase matching costs.
189    #[must_use]
190    pub fn new_atom_len(min_atom_len: usize) -> Self {
191        Self {
192            regexes: Vec::new(),
193            mapper_builder: mapper::Builder::new(min_atom_len),
194        }
195    }
196
197    /// Currently loaded regexes.
198    pub fn regexes(&self) -> &[regex::Regex] {
199        &self.regexes
200    }
201
202    /// Push a single regex into the builder, using the default
203    /// parsing options.
204    pub fn push(self, s: &str) -> Result<Self, ParseError> {
205        self.push_opt(s, &Options::new())
206    }
207
208    /// Push a single regex into the builder, using custom parsing
209    /// options.
210    pub fn push_opt(mut self, regex: &str, opts: &Options) -> Result<Self, ParseError> {
211        let hir = regex_syntax::Parser::from(opts).parse(regex)?;
212        let pf = model::Model::new(&hir)?;
213        self.mapper_builder.push(pf);
214        self.regexes.push(opts.to_regex(regex)?);
215        Ok(self)
216    }
217
218    /// Push a batch of regexes into the builder, using the default
219    /// parsing options.
220    pub fn push_all<T, I>(self, i: I) -> Result<Self, ParseError>
221    where
222        T: AsRef<str>,
223        I: IntoIterator<Item = T>,
224    {
225        i.into_iter().try_fold(self, |b, s| b.push(s.as_ref()))
226    }
227
228    /// Build the regexes set from the current builder.
229    ///
230    /// Building a regexes set from no regexes is useless but not an
231    /// error.
232    pub fn build(self) -> Result<Regexes, BuildError> {
233        let Self {
234            regexes,
235            mapper_builder,
236        } = self;
237        let (mapper, atoms) = mapper_builder.build();
238
239        // Instead of returning a bunch of atoms for the user to
240        // manage, since `regex` depends on aho-corasick by default we
241        // can use that directly and not bother the user.
242        let prefilter = AhoCorasick::builder()
243            .ascii_case_insensitive(true)
244            .prefilter(true)
245            .build(atoms)?;
246
247        Ok(Regexes {
248            regexes,
249            mapper,
250            prefilter,
251        })
252    }
253}
254
255impl Default for Builder {
256    fn default() -> Self {
257        Self::new()
258    }
259}
260
261/// Regexes set, allows testing inputs against a *large* number of
262/// *non-trivial* regexes.
263pub struct Regexes {
264    regexes: Vec<regex::Regex>,
265    mapper: mapper::Mapper,
266    prefilter: AhoCorasick,
267}
268
269impl Regexes {
270    // TODO:
271    // - number of tokens (prefilter.patterns_len())
272    // - number of regexes
273    // - number of unfiltered regexes (from mapper)
274    // - ratio of checked regexes to successes (cfg-gated)
275    // - total / prefiltered (- unfiltered?) so atom size can be manipulated
276    #[inline]
277    fn prefilter<'a>(&'a self, haystack: &'a str) -> impl Iterator<Item = usize> + 'a {
278        self.prefilter
279            .find_overlapping_iter(haystack)
280            .map(|m| m.pattern().as_usize())
281    }
282
283    #[inline]
284    fn prefiltered(&self, haystack: &str) -> impl Iterator<Item = usize> {
285        self.mapper.atom_to_re(self.prefilter(haystack)).into_iter()
286    }
287
288    /// Returns *whether* any regex in the set matches the haystack.
289    pub fn is_match(&self, haystack: &str) -> bool {
290        self.prefiltered(haystack)
291            .any(|idx| self.regexes[idx].is_match(haystack))
292    }
293
294    /// Yields the regexes matching the haystack along with their
295    /// index.
296    ///
297    /// The results are guaranteed to be returned in ascending order.
298    pub fn matching<'a>(
299        &'a self,
300        haystack: &'a str,
301    ) -> impl Iterator<Item = (usize, &regex::Regex)> + 'a {
302        self.prefiltered(haystack).filter_map(move |idx| {
303            let r = &self.regexes[idx];
304            r.is_match(haystack).then_some((idx, r))
305        })
306    }
307
308    /// Returns a reference to all the regexes in the set.
309    pub fn regexes(&self) -> &[regex::Regex] {
310        &self.regexes
311    }
312}
313
314#[cfg(test)]
315mod test {
316    use super::*;
317    use itertools::Itertools;
318
319    #[test]
320    fn empty_filter() {
321        let f = Builder::new().build().unwrap();
322        assert_eq!(f.prefilter("0123").collect_vec(), vec![]);
323
324        assert_eq!(f.matching("foo").count(), 0);
325    }
326
327    #[test]
328    fn empty_pattern() {
329        let f = Builder::new().push("").unwrap().build().unwrap();
330
331        assert_eq!(f.prefilter("0123").collect_vec(), vec![]);
332
333        assert_eq!(
334            f.matching("0123").map(|(idx, _)| idx).collect_vec(),
335            vec![0]
336        );
337    }
338
339    #[test]
340    fn small_or_test() {
341        let f = Builder::new_atom_len(4)
342            .push("(foo|bar)")
343            .unwrap()
344            .build()
345            .unwrap();
346
347        assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![]);
348
349        assert_eq!(
350            f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
351            vec![0],
352        );
353
354        let f = Builder::new().push("(foo|bar)").unwrap().build().unwrap();
355
356        assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![1]);
357
358        assert_eq!(
359            f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
360            vec![0],
361        );
362    }
363
364    #[test]
365    fn basic_matches() {
366        let f = Builder::new()
367            .push("(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+")
368            .unwrap()
369            .push("abcd..yyy..yyyzzz")
370            .unwrap()
371            .push("mnmnpp[a-z]+PPP")
372            .unwrap()
373            .build()
374            .unwrap();
375
376        assert_eq!(
377            f.matching("abc121212xyz").map(|(idx, _)| idx).collect_vec(),
378            vec![0],
379        );
380
381        assert_eq!(
382            f.matching("abc12312yyyzzz")
383                .map(|(idx, _)| idx)
384                .collect_vec(),
385            vec![0],
386        );
387
388        assert_eq!(
389            f.matching("abcd12yyy32yyyzzz")
390                .map(|(idx, _)| idx)
391                .collect_vec(),
392            vec![0, 1],
393        );
394    }
395
396    #[test]
397    fn basics() {
398        // In re2 this is the `MoveSemantics` test, which is... so not
399        // necessary for us. But it's a pair of extra regexes we can
400        // test
401
402        let f = Builder::new().push("foo\\d+").unwrap().build().unwrap();
403
404        assert_eq!(
405            f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
406            vec![0],
407        );
408        assert_eq!(
409            f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
410            vec![],
411        );
412
413        let f = Builder::new().push("bar\\d+").unwrap().build().unwrap();
414
415        assert_eq!(
416            f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
417            vec![],
418        );
419        assert_eq!(
420            f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
421            vec![0],
422        );
423    }
424
425    #[test]
426    fn bulk_api() {
427        use std::io::BufRead as _;
428
429        Builder::new().push_all(["a", "b"]).unwrap();
430
431        Builder::new()
432            .push_all(vec!["a".to_string(), "b".to_string()])
433            .unwrap();
434
435        Builder::new().push_all("a\nb\nc\nd\n".lines()).unwrap();
436
437        Builder::new()
438            .push_all(b"a\nb\nc\nd\n".lines().map(|l| l.unwrap()))
439            .unwrap();
440    }
441}