Skip to main content

regex_filtered/
lib.rs

1#![doc = include_str!("../README.md")]
2#![deny(unsafe_code)]
3#![warn(missing_docs)]
4
5use aho_corasick::AhoCorasick;
6
7mod int_set;
8pub mod mapper;
9pub mod model;
10pub use model::Error as ModelError;
11
12/// Builder for the regexes set
13pub struct Builder {
14    regexes: Vec<regex::Regex>,
15    mapper_builder: mapper::Builder,
16}
17
18/// Parser configuration, can be used to tune the regex parsing when
19/// adding it to the [`Builder`]. Every option defaults to `false`
20/// whether through [`Default`] or [`Options::new`].
21///
22/// The parser can also be configured via standard [`regex`] inline
23/// flags.
24#[derive(Default)]
25pub struct Options {
26    case_insensitive: bool,
27    dot_matches_new_line: bool,
28    ignore_whitespace: bool,
29    multi_line: bool,
30    crlf: bool,
31}
32
33impl Options {
34    /// Create a new options object.
35    pub fn new() -> Self {
36        Self::default()
37    }
38    /// Configures case-insensitive matching for the entire pattern.
39    pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
40        self.case_insensitive = yes;
41        self
42    }
43    /// Configures `.` to match newline characters, by default `.`
44    /// matches everything *except* newline characters.
45    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
46        self.dot_matches_new_line = yes;
47        self
48    }
49    /// Configures ignoring whitespace inside patterns, as well as `#`
50    /// line comments ("verbose" mode).
51    ///
52    /// Verbose mode is useful to break up complex regexes and improve
53    /// their documentation.
54    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
55        self.ignore_whitespace = yes;
56        self
57    }
58    /// Configures multi-line mode. When enabled, `^` matches at every
59    /// start of line and `$` at every end of line, by default they
60    /// match only the start and end of the string respectively.ca
61    pub fn multi_line(&mut self, yes: bool) -> &mut Self {
62        self.multi_line = yes;
63        self
64    }
65    /// Allows `\r` as a line terminator, by default only `\n` is a
66    /// line terminator (relevant for [`Self::ignore_whitespace`] and
67    /// [`Self::multi_line`]).
68    pub fn crlf(&mut self, yes: bool) -> &mut Self {
69        self.crlf = yes;
70        self
71    }
72    fn to_regex(&self, pattern: &str) -> Result<regex::Regex, regex::Error> {
73        regex::RegexBuilder::new(pattern)
74            .case_insensitive(self.case_insensitive)
75            .dot_matches_new_line(self.dot_matches_new_line)
76            .ignore_whitespace(self.ignore_whitespace)
77            .multi_line(self.multi_line)
78            .crlf(self.crlf)
79            .build()
80    }
81}
82impl From<Options> for regex_syntax::Parser {
83    fn from(opt: Options) -> Self {
84        Self::from(&opt)
85    }
86}
87impl From<&Options> for regex_syntax::Parser {
88    fn from(
89        Options {
90            case_insensitive,
91            dot_matches_new_line,
92            ignore_whitespace,
93            multi_line,
94            crlf,
95        }: &Options,
96    ) -> Self {
97        regex_syntax::ParserBuilder::new()
98            .case_insensitive(*case_insensitive)
99            .dot_matches_new_line(*dot_matches_new_line)
100            .ignore_whitespace(*ignore_whitespace)
101            .multi_line(*multi_line)
102            .crlf(*crlf)
103            .build()
104    }
105}
106
107/// Parsing error when adding a new regex to the [`Builder`].
108#[derive(Debug)]
109pub enum ParseError {
110    /// An error occurred while parsing the regex or translating it to
111    /// HIR.
112    SyntaxError(String),
113    /// An error occurred while processing the regex for atom
114    /// extraction.
115    ProcessingError(ModelError),
116    /// The regex was too large to compile to the NFA (within the
117    /// default limits).
118    RegexTooLarge(usize),
119}
120impl std::error::Error for ParseError {
121    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
122        match self {
123            ParseError::ProcessingError(e) => Some(e),
124            ParseError::SyntaxError(_) => None,
125            ParseError::RegexTooLarge(_) => None,
126        }
127    }
128}
129impl std::fmt::Display for ParseError {
130    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
131        write!(f, "{self:?}")
132    }
133}
134impl From<regex_syntax::Error> for ParseError {
135    fn from(value: regex_syntax::Error) -> Self {
136        Self::SyntaxError(value.to_string())
137    }
138}
139impl From<regex::Error> for ParseError {
140    fn from(value: regex::Error) -> Self {
141        match value {
142            regex::Error::CompiledTooBig(v) => Self::RegexTooLarge(v),
143            e => Self::SyntaxError(e.to_string()),
144        }
145    }
146}
147impl From<ModelError> for ParseError {
148    fn from(value: ModelError) -> Self {
149        Self::ProcessingError(value)
150    }
151}
152
153/// Error while compiling the builder to a prefiltered set.
154#[derive(Debug)]
155pub enum BuildError {
156    /// Error while building the prefilter.
157    PrefilterError(aho_corasick::BuildError),
158}
159impl std::error::Error for BuildError {
160    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
161        match self {
162            BuildError::PrefilterError(p) => Some(p),
163        }
164    }
165}
166impl std::fmt::Display for BuildError {
167    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
168        write!(f, "{self:?}")
169    }
170}
171impl From<aho_corasick::BuildError> for BuildError {
172    fn from(value: aho_corasick::BuildError) -> Self {
173        Self::PrefilterError(value)
174    }
175}
176
177impl Builder {
178    /// Instantiate a builder with the default metadata configuration:
179    ///
180    /// - minimum atom length 3
181    #[must_use]
182    pub fn new() -> Self {
183        Self::new_atom_len(3)
184    }
185
186    /// Instantiate a builder with a custom minimum atom length.
187    /// Increasing the atom length decreases the size and cost of the
188    /// prefilter, but may make more regexes impossible to prefilter,
189    /// which can increase matching costs.
190    #[must_use]
191    pub fn new_atom_len(min_atom_len: usize) -> Self {
192        Self {
193            regexes: Vec::new(),
194            mapper_builder: mapper::Builder::new(min_atom_len),
195        }
196    }
197
198    /// Currently loaded regexes.
199    pub fn regexes(&self) -> &[regex::Regex] {
200        &self.regexes
201    }
202
203    /// Push a single regex into the builder, using the default
204    /// parsing options.
205    pub fn push(self, s: &str) -> Result<Self, ParseError> {
206        self.push_opt(s, &Options::new())
207    }
208
209    /// Push a single regex into the builder, using custom parsing
210    /// options.
211    pub fn push_opt(mut self, regex: &str, opts: &Options) -> Result<Self, ParseError> {
212        let hir = regex_syntax::Parser::from(opts).parse(regex)?;
213        let pf = model::Model::new(&hir)?;
214        self.mapper_builder.push(pf);
215        self.regexes.push(opts.to_regex(regex)?);
216        Ok(self)
217    }
218
219    /// Push a batch of regexes into the builder, using the default
220    /// parsing options.
221    pub fn push_all<T, I>(self, i: I) -> Result<Self, ParseError>
222    where
223        T: AsRef<str>,
224        I: IntoIterator<Item = T>,
225    {
226        i.into_iter().try_fold(self, |b, s| b.push(s.as_ref()))
227    }
228
229    /// Build the regexes set from the current builder.
230    ///
231    /// Building a regexes set from no regexes is useless but not an
232    /// error.
233    pub fn build(self) -> Result<Regexes, BuildError> {
234        let Self {
235            regexes,
236            mapper_builder,
237        } = self;
238        let (mapper, atoms) = mapper_builder.build();
239
240        // Instead of returning a bunch of atoms for the user to
241        // manage, since `regex` depends on aho-corasick by default we
242        // can use that directly and not bother the user.
243        let prefilter = AhoCorasick::builder()
244            .ascii_case_insensitive(true)
245            .prefilter(true)
246            .build(atoms)?;
247
248        Ok(Regexes {
249            regexes,
250            mapper,
251            prefilter,
252        })
253    }
254}
255
256impl Default for Builder {
257    fn default() -> Self {
258        Self::new()
259    }
260}
261
262/// Regexes set, allows testing inputs against a *large* number of
263/// *non-trivial* regexes.
264pub struct Regexes {
265    regexes: Vec<regex::Regex>,
266    mapper: mapper::Mapper,
267    prefilter: AhoCorasick,
268}
269
270impl Regexes {
271    // TODO:
272    // - number of tokens (prefilter.patterns_len())
273    // - number of regexes
274    // - number of unfiltered regexes (from mapper)
275    // - ratio of checked regexes to successes (cfg-gated)
276    // - total / prefiltered (- unfiltered?) so atom size can be manipulated
277    #[inline]
278    fn prefilter<'a>(&'a self, haystack: &'a str) -> impl Iterator<Item = usize> + 'a {
279        self.prefilter
280            .find_overlapping_iter(haystack)
281            .map(|m| m.pattern().as_usize())
282    }
283
284    #[inline]
285    fn prefiltered(&self, haystack: &str) -> impl Iterator<Item = usize> + use<> {
286        self.mapper.atom_to_re(self.prefilter(haystack)).into_iter()
287    }
288
289    /// Returns *whether* any regex in the set matches the haystack.
290    pub fn is_match(&self, haystack: &str) -> bool {
291        self.prefiltered(haystack)
292            .any(|idx| self.regexes[idx].is_match(haystack))
293    }
294
295    /// Yields the regexes matching the haystack along with their
296    /// index.
297    ///
298    /// The results are guaranteed to be returned in ascending order.
299    pub fn matching<'a>(
300        &'a self,
301        haystack: &'a str,
302    ) -> impl Iterator<Item = (usize, &'a regex::Regex)> + 'a {
303        self.prefiltered(haystack).filter_map(move |idx| {
304            let r = &self.regexes[idx];
305            r.is_match(haystack).then_some((idx, r))
306        })
307    }
308
309    /// Returns a reference to all the regexes in the set.
310    pub fn regexes(&self) -> &[regex::Regex] {
311        &self.regexes
312    }
313}
314
315#[cfg(test)]
316mod test {
317    use super::*;
318    use itertools::Itertools;
319
320    #[test]
321    fn empty_filter() {
322        let f = Builder::new().build().unwrap();
323        assert_eq!(f.prefilter("0123").collect_vec(), vec![]);
324
325        assert_eq!(f.matching("foo").count(), 0);
326    }
327
328    #[test]
329    fn empty_pattern() {
330        let f = Builder::new().push("").unwrap().build().unwrap();
331
332        assert_eq!(f.prefilter("0123").collect_vec(), vec![]);
333
334        assert_eq!(
335            f.matching("0123").map(|(idx, _)| idx).collect_vec(),
336            vec![0]
337        );
338    }
339
340    #[test]
341    fn small_or_test() {
342        let f = Builder::new_atom_len(4)
343            .push("(foo|bar)")
344            .unwrap()
345            .build()
346            .unwrap();
347
348        assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![]);
349
350        assert_eq!(
351            f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
352            vec![0],
353        );
354
355        let f = Builder::new().push("(foo|bar)").unwrap().build().unwrap();
356
357        assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![1]);
358
359        assert_eq!(
360            f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
361            vec![0],
362        );
363    }
364
365    #[test]
366    fn basic_matches() {
367        let f = Builder::new()
368            .push("(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+")
369            .unwrap()
370            .push("abcd..yyy..yyyzzz")
371            .unwrap()
372            .push("mnmnpp[a-z]+PPP")
373            .unwrap()
374            .build()
375            .unwrap();
376
377        assert_eq!(
378            f.matching("abc121212xyz").map(|(idx, _)| idx).collect_vec(),
379            vec![0],
380        );
381
382        assert_eq!(
383            f.matching("abc12312yyyzzz")
384                .map(|(idx, _)| idx)
385                .collect_vec(),
386            vec![0],
387        );
388
389        assert_eq!(
390            f.matching("abcd12yyy32yyyzzz")
391                .map(|(idx, _)| idx)
392                .collect_vec(),
393            vec![0, 1],
394        );
395    }
396
397    #[test]
398    fn basics() {
399        // In re2 this is the `MoveSemantics` test, which is... so not
400        // necessary for us. But it's a pair of extra regexes we can
401        // test
402
403        let f = Builder::new().push("foo\\d+").unwrap().build().unwrap();
404
405        assert_eq!(
406            f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
407            vec![0],
408        );
409        assert_eq!(
410            f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
411            vec![],
412        );
413
414        let f = Builder::new().push("bar\\d+").unwrap().build().unwrap();
415
416        assert_eq!(
417            f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
418            vec![],
419        );
420        assert_eq!(
421            f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
422            vec![0],
423        );
424    }
425
426    #[test]
427    fn bulk_api() {
428        use std::io::BufRead as _;
429
430        Builder::new().push_all(["a", "b"]).unwrap();
431
432        Builder::new()
433            .push_all(vec!["a".to_string(), "b".to_string()])
434            .unwrap();
435
436        Builder::new().push_all("a\nb\nc\nd\n".lines()).unwrap();
437
438        Builder::new()
439            .push_all(b"a\nb\nc\nd\n".lines().map(|l| l.unwrap()))
440            .unwrap();
441    }
442
443    #[test]
444    fn alternate() {
445        let f = Builder::new().push("abc|").unwrap().build().unwrap();
446        assert_eq!(
447            f.matching("abcde").map(|(idx, _)| idx).collect_vec(),
448            vec![0],
449        );
450        assert_eq!(f.matching("xyz").map(|(idx, _)| idx).collect_vec(), vec![0],);
451    }
452
453    #[test]
454    fn non_ascii() {
455        let f = Builder::new().push("ΛΜΝΟΠ").unwrap().build().unwrap();
456        assert_eq!(
457            f.matching("ΛΜΝΟΠ").map(|(idx, _)| idx).collect_vec(),
458            vec![0],
459        );
460        assert_eq!(
461            f.matching("λμνοπ").map(|(idx, _)| idx).collect_vec(),
462            vec![],
463        );
464        assert_eq!(
465            f.matching("Λμνοπ").map(|(idx, _)| idx).collect_vec(),
466            vec![],
467        );
468        let f = Builder::new().push("(?i)ΛΜΝΟΠ").unwrap().build().unwrap();
469        assert_eq!(
470            f.matching("ΛΜΝΟΠ").map(|(idx, _)| idx).collect_vec(),
471            vec![0],
472        );
473        assert_eq!(
474            f.matching("λμνοπ").map(|(idx, _)| idx).collect_vec(),
475            vec![0],
476        );
477        assert_eq!(
478            f.matching("Λμνοπ").map(|(idx, _)| idx).collect_vec(),
479            vec![0],
480        );
481    }
482}