1#![doc = include_str!("../README.md")]
2#![deny(unsafe_code)]
3#![warn(missing_docs)]
4
5use aho_corasick::AhoCorasick;
6
7mod int_set;
8pub mod mapper;
9pub mod model;
10pub use model::Error as ModelError;
11
12pub struct Builder {
14 regexes: Vec<regex::Regex>,
15 mapper_builder: mapper::Builder,
16}
17
18#[derive(Default)]
25pub struct Options {
26 case_insensitive: bool,
27 dot_matches_new_line: bool,
28 ignore_whitespace: bool,
29 multi_line: bool,
30 crlf: bool,
31}
32
33impl Options {
34 pub fn new() -> Self {
36 Self::default()
37 }
38 pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
40 self.case_insensitive = yes;
41 self
42 }
43 pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
46 self.dot_matches_new_line = yes;
47 self
48 }
49 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
55 self.ignore_whitespace = yes;
56 self
57 }
58 pub fn multi_line(&mut self, yes: bool) -> &mut Self {
62 self.multi_line = yes;
63 self
64 }
65 pub fn crlf(&mut self, yes: bool) -> &mut Self {
69 self.crlf = yes;
70 self
71 }
72 fn to_regex(&self, pattern: &str) -> Result<regex::Regex, regex::Error> {
73 regex::RegexBuilder::new(pattern)
74 .case_insensitive(self.case_insensitive)
75 .dot_matches_new_line(self.dot_matches_new_line)
76 .ignore_whitespace(self.ignore_whitespace)
77 .multi_line(self.multi_line)
78 .crlf(self.crlf)
79 .build()
80 }
81}
82impl From<Options> for regex_syntax::Parser {
83 fn from(opt: Options) -> Self {
84 Self::from(&opt)
85 }
86}
87impl From<&Options> for regex_syntax::Parser {
88 fn from(
89 Options {
90 case_insensitive,
91 dot_matches_new_line,
92 ignore_whitespace,
93 multi_line,
94 crlf,
95 }: &Options,
96 ) -> Self {
97 regex_syntax::ParserBuilder::new()
98 .case_insensitive(*case_insensitive)
99 .dot_matches_new_line(*dot_matches_new_line)
100 .ignore_whitespace(*ignore_whitespace)
101 .multi_line(*multi_line)
102 .crlf(*crlf)
103 .build()
104 }
105}
106
107#[derive(Debug)]
109pub enum ParseError {
110 SyntaxError(String),
113 ProcessingError(ModelError),
116 RegexTooLarge(usize),
119}
120impl std::error::Error for ParseError {
121 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
122 match self {
123 ParseError::ProcessingError(e) => Some(e),
124 ParseError::SyntaxError(_) => None,
125 ParseError::RegexTooLarge(_) => None,
126 }
127 }
128}
129impl std::fmt::Display for ParseError {
130 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
131 write!(f, "{self:?}")
132 }
133}
134impl From<regex_syntax::Error> for ParseError {
135 fn from(value: regex_syntax::Error) -> Self {
136 Self::SyntaxError(value.to_string())
137 }
138}
139impl From<regex::Error> for ParseError {
140 fn from(value: regex::Error) -> Self {
141 match value {
142 regex::Error::CompiledTooBig(v) => Self::RegexTooLarge(v),
143 e => Self::SyntaxError(e.to_string()),
144 }
145 }
146}
147impl From<ModelError> for ParseError {
148 fn from(value: ModelError) -> Self {
149 Self::ProcessingError(value)
150 }
151}
152
153#[derive(Debug)]
155pub enum BuildError {
156 PrefilterError(aho_corasick::BuildError),
158}
159impl std::error::Error for BuildError {
160 fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
161 match self {
162 BuildError::PrefilterError(p) => Some(p),
163 }
164 }
165}
166impl std::fmt::Display for BuildError {
167 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
168 write!(f, "{self:?}")
169 }
170}
171impl From<aho_corasick::BuildError> for BuildError {
172 fn from(value: aho_corasick::BuildError) -> Self {
173 Self::PrefilterError(value)
174 }
175}
176
177impl Builder {
178 #[must_use]
182 pub fn new() -> Self {
183 Self::new_atom_len(3)
184 }
185
186 #[must_use]
191 pub fn new_atom_len(min_atom_len: usize) -> Self {
192 Self {
193 regexes: Vec::new(),
194 mapper_builder: mapper::Builder::new(min_atom_len),
195 }
196 }
197
198 pub fn regexes(&self) -> &[regex::Regex] {
200 &self.regexes
201 }
202
203 pub fn push(self, s: &str) -> Result<Self, ParseError> {
206 self.push_opt(s, &Options::new())
207 }
208
209 pub fn push_opt(mut self, regex: &str, opts: &Options) -> Result<Self, ParseError> {
212 let hir = regex_syntax::Parser::from(opts).parse(regex)?;
213 let pf = model::Model::new(&hir)?;
214 self.mapper_builder.push(pf);
215 self.regexes.push(opts.to_regex(regex)?);
216 Ok(self)
217 }
218
219 pub fn push_all<T, I>(self, i: I) -> Result<Self, ParseError>
222 where
223 T: AsRef<str>,
224 I: IntoIterator<Item = T>,
225 {
226 i.into_iter().try_fold(self, |b, s| b.push(s.as_ref()))
227 }
228
229 pub fn build(self) -> Result<Regexes, BuildError> {
234 let Self {
235 regexes,
236 mapper_builder,
237 } = self;
238 let (mapper, atoms) = mapper_builder.build();
239
240 let prefilter = AhoCorasick::builder()
244 .ascii_case_insensitive(true)
245 .prefilter(true)
246 .build(atoms)?;
247
248 Ok(Regexes {
249 regexes,
250 mapper,
251 prefilter,
252 })
253 }
254}
255
256impl Default for Builder {
257 fn default() -> Self {
258 Self::new()
259 }
260}
261
262pub struct Regexes {
265 regexes: Vec<regex::Regex>,
266 mapper: mapper::Mapper,
267 prefilter: AhoCorasick,
268}
269
270impl Regexes {
271 #[inline]
278 fn prefilter<'a>(&'a self, haystack: &'a str) -> impl Iterator<Item = usize> + 'a {
279 self.prefilter
280 .find_overlapping_iter(haystack)
281 .map(|m| m.pattern().as_usize())
282 }
283
284 #[inline]
285 fn prefiltered(&self, haystack: &str) -> impl Iterator<Item = usize> + use<> {
286 self.mapper.atom_to_re(self.prefilter(haystack)).into_iter()
287 }
288
289 pub fn is_match(&self, haystack: &str) -> bool {
291 self.prefiltered(haystack)
292 .any(|idx| self.regexes[idx].is_match(haystack))
293 }
294
295 pub fn matching<'a>(
300 &'a self,
301 haystack: &'a str,
302 ) -> impl Iterator<Item = (usize, &'a regex::Regex)> + 'a {
303 self.prefiltered(haystack).filter_map(move |idx| {
304 let r = &self.regexes[idx];
305 r.is_match(haystack).then_some((idx, r))
306 })
307 }
308
309 pub fn regexes(&self) -> &[regex::Regex] {
311 &self.regexes
312 }
313}
314
315#[cfg(test)]
316mod test {
317 use super::*;
318 use itertools::Itertools;
319
320 #[test]
321 fn empty_filter() {
322 let f = Builder::new().build().unwrap();
323 assert_eq!(f.prefilter("0123").collect_vec(), vec![]);
324
325 assert_eq!(f.matching("foo").count(), 0);
326 }
327
328 #[test]
329 fn empty_pattern() {
330 let f = Builder::new().push("").unwrap().build().unwrap();
331
332 assert_eq!(f.prefilter("0123").collect_vec(), vec![]);
333
334 assert_eq!(
335 f.matching("0123").map(|(idx, _)| idx).collect_vec(),
336 vec![0]
337 );
338 }
339
340 #[test]
341 fn small_or_test() {
342 let f = Builder::new_atom_len(4)
343 .push("(foo|bar)")
344 .unwrap()
345 .build()
346 .unwrap();
347
348 assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![]);
349
350 assert_eq!(
351 f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
352 vec![0],
353 );
354
355 let f = Builder::new().push("(foo|bar)").unwrap().build().unwrap();
356
357 assert_eq!(f.prefilter("lemurs bar").collect_vec(), vec![1]);
358
359 assert_eq!(
360 f.matching("lemurs bar").map(|(idx, _)| idx).collect_vec(),
361 vec![0],
362 );
363 }
364
365 #[test]
366 fn basic_matches() {
367 let f = Builder::new()
368 .push("(abc123|abc|defxyz|ghi789|abc1234|xyz).*[x-z]+")
369 .unwrap()
370 .push("abcd..yyy..yyyzzz")
371 .unwrap()
372 .push("mnmnpp[a-z]+PPP")
373 .unwrap()
374 .build()
375 .unwrap();
376
377 assert_eq!(
378 f.matching("abc121212xyz").map(|(idx, _)| idx).collect_vec(),
379 vec![0],
380 );
381
382 assert_eq!(
383 f.matching("abc12312yyyzzz")
384 .map(|(idx, _)| idx)
385 .collect_vec(),
386 vec![0],
387 );
388
389 assert_eq!(
390 f.matching("abcd12yyy32yyyzzz")
391 .map(|(idx, _)| idx)
392 .collect_vec(),
393 vec![0, 1],
394 );
395 }
396
397 #[test]
398 fn basics() {
399 let f = Builder::new().push("foo\\d+").unwrap().build().unwrap();
404
405 assert_eq!(
406 f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
407 vec![0],
408 );
409 assert_eq!(
410 f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
411 vec![],
412 );
413
414 let f = Builder::new().push("bar\\d+").unwrap().build().unwrap();
415
416 assert_eq!(
417 f.matching("abc foo1 xyz").map(|(idx, _)| idx).collect_vec(),
418 vec![],
419 );
420 assert_eq!(
421 f.matching("abc bar2 xyz").map(|(idx, _)| idx).collect_vec(),
422 vec![0],
423 );
424 }
425
426 #[test]
427 fn bulk_api() {
428 use std::io::BufRead as _;
429
430 Builder::new().push_all(["a", "b"]).unwrap();
431
432 Builder::new()
433 .push_all(vec!["a".to_string(), "b".to_string()])
434 .unwrap();
435
436 Builder::new().push_all("a\nb\nc\nd\n".lines()).unwrap();
437
438 Builder::new()
439 .push_all(b"a\nb\nc\nd\n".lines().map(|l| l.unwrap()))
440 .unwrap();
441 }
442
443 #[test]
444 fn alternate() {
445 let f = Builder::new().push("abc|").unwrap().build().unwrap();
446 assert_eq!(
447 f.matching("abcde").map(|(idx, _)| idx).collect_vec(),
448 vec![0],
449 );
450 assert_eq!(f.matching("xyz").map(|(idx, _)| idx).collect_vec(), vec![0],);
451 }
452
453 #[test]
454 fn non_ascii() {
455 let f = Builder::new().push("ΛΜΝΟΠ").unwrap().build().unwrap();
456 assert_eq!(
457 f.matching("ΛΜΝΟΠ").map(|(idx, _)| idx).collect_vec(),
458 vec![0],
459 );
460 assert_eq!(
461 f.matching("λμνοπ").map(|(idx, _)| idx).collect_vec(),
462 vec![],
463 );
464 assert_eq!(
465 f.matching("Λμνοπ").map(|(idx, _)| idx).collect_vec(),
466 vec![],
467 );
468 let f = Builder::new().push("(?i)ΛΜΝΟΠ").unwrap().build().unwrap();
469 assert_eq!(
470 f.matching("ΛΜΝΟΠ").map(|(idx, _)| idx).collect_vec(),
471 vec![0],
472 );
473 assert_eq!(
474 f.matching("λμνοπ").map(|(idx, _)| idx).collect_vec(),
475 vec![0],
476 );
477 assert_eq!(
478 f.matching("Λμνοπ").map(|(idx, _)| idx).collect_vec(),
479 vec![0],
480 );
481 }
482}