1#![doc = include_str!("../README.md")]
2#![deny(unsafe_code)]
3#![warn(missing_docs)]
4
5use aho_corasick::AhoCorasick;
6
7mod mapper;
8mod model;
9pub use model::Error as ModelError;
10
11pub struct Builder {
13 regexes: Vec<regex::Regex>,
14 mapper_builder: mapper::Builder,
15}
16
17#[derive(Default)]
24pub struct Options {
25 case_insensitive: bool,
26 dot_matches_new_line: bool,
27 ignore_whitespace: bool,
28 multi_line: bool,
29 crlf: bool,
30}
31
32impl Options {
33 pub fn new() -> Self {
35 Self::default()
36 }
37 pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
39 self.case_insensitive = yes;
40 self
41 }
42 pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
45 self.dot_matches_new_line = yes;
46 self
47 }
48 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
54 self.ignore_whitespace = yes;
55 self
56 }
57 pub fn multi_line(&mut self, yes: bool) -> &mut Self {
61 self.multi_line = yes;
62 self
63 }
64 pub fn crlf(&mut self, yes: bool) -> &mut Self {
68 self.crlf = yes;
69 self
70 }
71 fn to_regex(&self, pattern: &str) -> Result<regex::Regex, regex::Error> {
72 regex::RegexBuilder::new(pattern)
73 .case_insensitive(self.case_insensitive)
74 .dot_matches_new_line(self.dot_matches_new_line)
75 .ignore_whitespace(self.ignore_whitespace)
76 .multi_line(self.multi_line)
77 .crlf(self.crlf)
78 .build()
79 }
80}
81impl From<Options> for regex_syntax::Parser {
82 fn from(opt: Options) -> Self {
83 Self::from(&opt)
84 }
85}
86impl From<&Options> for regex_syntax::Parser {
87 fn from(
88 Options {
89 case_insensitive,
90 dot_matches_new_line,
91 ignore_whitespace,
92 multi_line,
93 crlf,
94 }: &Options,
95 ) -> Self {
96 regex_syntax::ParserBuilder::new()
97 .case_insensitive(*case_insensitive)
98 .dot_matches_new_line(*dot_matches_new_line)
99 .ignore_whitespace(*ignore_whitespace)
100 .multi_line(*multi_line)
101 .crlf(*crlf)
102 .build()
103 }
104}
105
106#[derive(Debug)]
108pub enum ParseError {
109 SyntaxError(String),
112 ProcessingError(ModelError),
115 RegexTooLarge(usize),
118}
119impl std::error::Error for ParseError {
120 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
121 match self {
122 ParseError::ProcessingError(e) => Some(e),
123 ParseError::SyntaxError(_) => None,
124 ParseError::RegexTooLarge(_) => None,
125 }
126 }
127}
128impl std::fmt::Display for ParseError {
129 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
130 write!(f, "{self:?}")
131 }
132}
133impl From<regex_syntax::Error> for ParseError {
134 fn from(value: regex_syntax::Error) -> Self {
135 Self::SyntaxError(value.to_string())
136 }
137}
138impl From<regex::Error> for ParseError {
139 fn from(value: regex::Error) -> Self {
140 match value {
141 regex::Error::CompiledTooBig(v) => Self::RegexTooLarge(v),
142 e => Self::SyntaxError(e.to_string()),
143 }
144 }
145}
146impl From<ModelError> for ParseError {
147 fn from(value: ModelError) -> Self {
148 Self::ProcessingError(value)
149 }
150}
151
152#[derive(Debug)]
154pub enum BuildError {
155 PrefilterError(aho_corasick::BuildError),
157}
158impl std::error::Error for BuildError {
159 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
160 match self {
161 BuildError::PrefilterError(p) => Some(p),
162 }
163 }
164}
165impl std::fmt::Display for BuildError {
166 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
167 write!(f, "{self:?}")
168 }
169}
170impl From<aho_corasick::BuildError> for BuildError {
171 fn from(value: aho_corasick::BuildError) -> Self {
172 Self::PrefilterError(value)
173 }
174}
175
176impl Builder {
177 #[must_use]
181 pub fn new() -> Self {
182 Self::new_atom_len(3)
183 }
184
185 #[must_use]
190 pub fn new_atom_len(min_atom_len: usize) -> Self {
191 Self {
192 regexes: Vec::new(),
193 mapper_builder: mapper::Builder::new(min_atom_len),
194 }
195 }
196
197 pub fn regexes(&self) -> &[regex::Regex] {
199 &self.regexes
200 }
201
202 pub fn push(self, s: &str) -> Result<Self, ParseError> {
205 self.push_opt(s, &Options::new())
206 }
207
208 pub fn push_opt(mut self, regex: &str, opts: &Options) -> Result<Self, ParseError> {
211 let hir = regex_syntax::Parser::from(opts).parse(regex)?;
212 let pf = model::Model::new(&hir)?;
213 self.mapper_builder.push(pf);
214 self.regexes.push(opts.to_regex(regex)?);
215 Ok(self)
216 }
217
218 pub fn push_all<T, I>(self, i: I) -> Result<Self, ParseError>
221 where
222 T: AsRef<str>,
223 I: IntoIterator<Item = T>,
224 {
225 i.into_iter().try_fold(self, |b, s| b.push(s.as_ref()))
226 }
227
228 pub fn build(self) -> Result<Regexes, BuildError> {
233 let Self {
234 regexes,
235 mapper_builder,
236 } = self;
237 let (mapper, atoms) = mapper_builder.build();
238
239 let prefilter = AhoCorasick::builder()
243 .ascii_case_insensitive(true)
244 .prefilter(true)
245 .build(atoms)?;
246
247 Ok(Regexes {
248 regexes,
249 mapper,
250 prefilter,
251 })
252 }
253}
254
255impl Default for Builder {
256 fn default() -> Self {
257 Self::new()
258 }
259}
260
261pub struct Regexes {
264 regexes: Vec<regex::Regex>,
265 mapper: mapper::Mapper,
266 prefilter: AhoCorasick,
267}
268
269impl Regexes {
270 #[inline]
277 fn prefilter<'a>(&'a self, haystack: &'a str) -> impl Iterator<Item = usize> + 'a {
278 self.prefilter
279 .find_overlapping_iter(haystack)
280 .map(|m| m.pattern().as_usize())
281 }
282
283 #[inline]
284 fn prefiltered(&self, haystack: &str) -> impl Iterator<Item = usize> {
285 self.mapper.atom_to_re(self.prefilter(haystack)).into_iter()
286 }
287
288 pub fn is_match(&self, haystack: &str) -> bool {
290 self.prefiltered(haystack)
291 .any(|idx| self.regexes[idx].is_match(haystack))
292 }
293
294 pub fn matching<'a>(
299 &'a self,
300 haystack: &'a str,
301 ) -> impl Iterator<Item = (usize, ®ex::Regex)> + 'a {
302 self.prefiltered(haystack).filter_map(move |idx| {
303 let r = &self.regexes[idx];
304 r.is_match(haystack).then_some((idx, r))
305 })
306 }
307
308 pub fn regexes(&self) -> &[regex::Regex] {
310 &self.regexes
311 }
312}
313
314#[cfg(test)]
315mod test {
316 use super::*;
317 use itertools::Itertools;
318
319 #[test]
320 fn empty_filter() {
321 let f = Builder::new().build().unwrap();
322 assert_eq!(f.prefilter("0123").collect_vec(), vec![]);
323
324 assert_eq!(f.matching("foo").count(), 0);
325 }
326
327 #[test]
328 fn empty_pattern() {
329 let f = Builder::new().push("").unwrap().build().unwrap();
330
331 assert_eq!(f.prefilter("0123").collect_vec(), vec![]);
332
333 assert_eq!(
334 f.matching("0123").map(|(idx, _)| idx).collect_vec(),
335 vec![0]
336 );
337 }
338
339 #[test]
340 fn small_or_test() {
341 let f = Builder::new_atom_len(4)
342 .push("(foo|bar)")
343 .unwrap()
344 .build()
345 .unwrap();
346
347 assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![]);
348
349 assert_eq!(
350 f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
351 vec![0],
352 );
353
354 let f = Builder::new().push("(foo|bar)").unwrap().build().unwrap();
355
356 assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![1]);
357
358 assert_eq!(
359 f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
360 vec![0],
361 );
362 }
363
364 #[test]
365 fn basic_matches() {
366 let f = Builder::new()
367 .push("(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+")
368 .unwrap()
369 .push("abcd..yyy..yyyzzz")
370 .unwrap()
371 .push("mnmnpp[a-z]+PPP")
372 .unwrap()
373 .build()
374 .unwrap();
375
376 assert_eq!(
377 f.matching("abc121212xyz").map(|(idx, _)| idx).collect_vec(),
378 vec![0],
379 );
380
381 assert_eq!(
382 f.matching("abc12312yyyzzz")
383 .map(|(idx, _)| idx)
384 .collect_vec(),
385 vec![0],
386 );
387
388 assert_eq!(
389 f.matching("abcd12yyy32yyyzzz")
390 .map(|(idx, _)| idx)
391 .collect_vec(),
392 vec![0, 1],
393 );
394 }
395
396 #[test]
397 fn basics() {
398 let f = Builder::new().push("foo\\d+").unwrap().build().unwrap();
403
404 assert_eq!(
405 f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
406 vec![0],
407 );
408 assert_eq!(
409 f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
410 vec![],
411 );
412
413 let f = Builder::new().push("bar\\d+").unwrap().build().unwrap();
414
415 assert_eq!(
416 f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
417 vec![],
418 );
419 assert_eq!(
420 f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
421 vec![0],
422 );
423 }
424
425 #[test]
426 fn bulk_api() {
427 use std::io::BufRead as _;
428
429 Builder::new().push_all(["a", "b"]).unwrap();
430
431 Builder::new()
432 .push_all(vec!["a".to_string(), "b".to_string()])
433 .unwrap();
434
435 Builder::new().push_all("a\nb\nc\nd\n".lines()).unwrap();
436
437 Builder::new()
438 .push_all(b"a\nb\nc\nd\n".lines().map(|l| l.unwrap()))
439 .unwrap();
440 }
441}