ib_matcher/regex/cp/regex.rs
1use std::{
2 cell::UnsafeCell,
3 marker::PhantomPinned,
4 mem::{transmute, MaybeUninit},
5 ops::Deref,
6 sync::Arc,
7};
8
9use bon::bon;
10use itertools::Itertools;
11use regex_syntax::hir::Hir;
12
13#[cfg(feature = "regex-callback")]
14use crate::regex::nfa::Callback;
15use crate::{
16 matcher::{pattern::Pattern, IbMatcher, MatchConfig},
17 regex::{
18 nfa::{
19 backtrack::{self, BoundedBacktracker},
20 thompson::{self},
21 NFA,
22 },
23 util::{self, captures::Captures, pool::Pool, prefilter::PrefilterIb},
24 Input, Match, MatchError,
25 },
26 syntax::regex::hir,
27};
28
29pub use crate::regex::nfa::{
30 backtrack::{Cache, Config, TryCapturesMatches, TryFindMatches},
31 thompson::BuildError,
32};
33
34/// A compiled regular expression for searching Unicode haystacks.
35///
36/// A `Regex` can be used to search haystacks, split haystacks into substrings
37/// or replace substrings in a haystack with a different substring. All
38/// searching is done with an implicit `(?s:.)*?` at the beginning and end of
/// a pattern. To force an expression to match the whole string (or a prefix
40/// or a suffix), you can use anchored search or an anchor like `^` or `$` (or `\A` and `\z`).
41/**
42# Overview
43
44The most important methods are as follows:
45
46* [`Regex::new`] compiles a regex using the default configuration. A
47[`Builder`] permits setting a non-default configuration. (For example,
48case insensitive matching, verbose mode and others.)
49* [`Regex::is_match`] reports whether a match exists in a particular haystack.
50* [`Regex::find`] reports the byte offsets of a match in a haystack, if one
51exists. [`Regex::find_iter`] returns an iterator over all such matches.
52* [`Regex::captures`] returns a [`Captures`], which reports both the byte
53offsets of a match in a haystack and the byte offsets of each matching capture
54group from the regex in the haystack.
55[`Regex::captures_iter`] returns an iterator over all such matches.
56*/
57/// # Example
58///
59/// ```
60/// use ib_matcher::regex::cp::Regex;
61///
62/// let re = Regex::new(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")?;
63/// assert!(re.is_match("2010-03-14"));
64///
65/// # Ok::<(), Box<dyn std::error::Error>>(())
66/// ```
67/**
68With `IbMatcher`'s Chinese pinyin and Japanese romaji matching:
69```
70// cargo add ib-matcher --features regex,pinyin,romaji
71use ib_matcher::{
72 matcher::{MatchConfig, PinyinMatchConfig, RomajiMatchConfig},
73 regex::{cp::Regex, Match},
74};
75
76let config = MatchConfig::builder()
77 .pinyin(PinyinMatchConfig::default())
78 .romaji(RomajiMatchConfig::default())
79 .build();
80
81let re = Regex::builder()
82 .ib(config.shallow_clone())
83 .build("raki.suta")
84 .unwrap();
85assert_eq!(re.find("「らき☆すた」"), Some(Match::must(0, 3..18)));
86
87let re = Regex::builder()
88 .ib(config.shallow_clone())
89 .build("pysou.*?(any|every)thing")
90 .unwrap();
91assert_eq!(re.find("拼音搜索Everything"), Some(Match::must(0, 0..22)));
92
93let config = MatchConfig::builder()
94 .pinyin(PinyinMatchConfig::default())
95 .romaji(RomajiMatchConfig::default())
96 .mix_lang(true)
97 .build();
98let re = Regex::builder()
99 .ib(config.shallow_clone())
100 .build("(?x)^zangsounofuri-?ren # Mixing pinyin and romaji")
101 .unwrap();
102assert_eq!(re.find("葬送のフリーレン"), Some(Match::must(0, 0..24)));
103```
104*/
105/// For more examples and the syntax, see [`crate::regex`].
106///
107/// # Case insensitivity
108/// To enable case insensitivity:
109/// ```
110/// use ib_matcher::{matcher::{PinyinMatchConfig, PlainMatchConfig, MatchConfig}, regex::cp::Regex};
111///
112/// let re = Regex::builder().ib(MatchConfig::default()).build("foo").unwrap();
113/// assert!(re.is_match("FOO"));
114///
115/// // Alternatively, with `case_insensitive()`:
116/// let re = Regex::builder()
117/// .ib(MatchConfig::builder()
118/// .case_insensitive(true)
119/// .pinyin(PinyinMatchConfig::default())
120/// .build())
121/// .build("pyss")
122/// .unwrap();
123/// assert!(re.is_match("PY搜索"));
124/// ```
/// Note that enabling `syntax.case_insensitive` currently prevents `ib` (i.e. pinyin and romaji matching) from working. For now, only set [`MatchConfigBuilder::case_insensitive`](crate::matcher::MatchConfigBuilder::case_insensitive) ([`PlainMatchConfigBuilder::case_insensitive`](crate::matcher::PlainMatchConfigBuilder::case_insensitive)).
126///
127/// If you need case insensitive character classes, you need to write `(?i:[a-z])` instead at the moment.
128///
129/**
130# Custom matching callbacks
131Custom matching callbacks can be used to implement ad hoc look-around, backreferences, balancing groups/recursion/subroutines, combining domain-specific parsers, etc.
132
133Basic usage:
134```
135// cargo add ib-matcher --features regex,regex-callback
136use ib_matcher::regex::cp::Regex;
137
138let re = Regex::builder()
139 .callback("ascii", |input, at, push| {
140 let haystack = &input.haystack()[at..];
141 if haystack.len() > 0 && haystack[0].is_ascii() {
142 push(1);
143 }
144 })
145 .build(r"(ascii)+\d(ascii)+")
146 .unwrap();
147let hay = "that4U this4me";
148assert_eq!(&hay[re.find(hay).unwrap().span()], " this4me");
149```
150
151## Look-around
152```
153use ib_matcher::regex::cp::Regex;
154
155let re = Regex::builder()
156 .callback("lookahead_is_ascii", |input, at, push| {
157 let haystack = &input.haystack()[at..];
158 if haystack.len() > 0 && haystack[0].is_ascii() {
159 push(0);
160 }
161 })
162 .build(r"[\x00-\x7f]+?\d(lookahead_is_ascii)")
163 .unwrap();
164let hay = "that4U,this4me1plz";
165assert_eq!(
166 re.find_iter(hay).map(|m| &hay[m.span()]).collect::<Vec<_>>(),
167 vec![",this4", "me1"]
168);
169```
170
171## Balancing groups
172```
173use std::{cell::RefCell, rc::Rc};
174use ib_matcher::regex::cp::Regex;
175
176let count = Rc::new(RefCell::new(0));
177let re = Regex::builder()
178 .callback("open_quote", {
179 let count = count.clone();
180 move |input, at, push| {
181 if at < 2 || input.haystack()[at - 2] != b'\\' {
182 let mut count = count.borrow_mut();
183 *count += 1;
184 push(0);
185 }
186 }
187 })
188 .callback("close_quote", move |input, at, push| {
189 if at < 2 || input.haystack()[at - 2] != b'\\' {
190 let mut count = count.borrow_mut();
191 if *count > 0 {
192 push(0);
193 }
194 *count -= 1;
195 }
196 })
197 .build(r"'(open_quote).*?'(close_quote)")
198 .unwrap();
199let hay = r"'one' 'two\'three' 'four'";
200assert_eq!(
201 re.find_iter(hay).map(|m| &hay[m.span()]).collect::<Vec<_>>(),
202 vec!["'one'", r"'two\'three'", "'four'"]
203);
204```
205(In this simple example, just using `'([^'\\]+?|\\')*'` is actually enough, but there are more complex cases where balancing groups (or recursion/subroutines) are necessary.)
206*/
207/// # Synchronization and cloning
208///
209/// In order to make the `Regex` API convenient, most of the routines hide
210/// the fact that a `Cache` is needed at all. To achieve this, a [memory
211/// pool](automata::util::pool::Pool) is used internally to retrieve `Cache`
212/// values in a thread safe way that also permits reuse. This in turn implies
213/// that every such search call requires some form of synchronization. Usually
214/// this synchronization is fast enough to not notice, but in some cases, it
215/// can be a bottleneck. This typically occurs when all of the following are
216/// true:
217///
218/// * The same `Regex` is shared across multiple threads simultaneously,
219/// usually via a [`util::lazy::Lazy`](automata::util::lazy::Lazy) or something
220/// similar from the `once_cell` or `lazy_static` crates.
221/// * The primary unit of work in each thread is a regex search.
222/// * Searches are run on very short haystacks.
223///
224/// This particular case can lead to high contention on the pool used by a
225/// `Regex` internally, which can in turn increase latency to a noticeable
226/// effect. This cost can be mitigated in one of the following ways:
227///
228/// * Use a distinct copy of a `Regex` in each thread, usually by cloning it.
229/// Cloning a `Regex` _does not_ do a deep copy of its read-only component.
230/// But it does lead to each `Regex` having its own memory pool, which in
231/// turn eliminates the problem of contention. In general, this technique should
232/// not result in any additional memory usage when compared to sharing the same
233/// `Regex` across multiple threads simultaneously.
234/// * Use lower level APIs, like [`Regex::try_find`], which permit passing
235/// a `Cache` explicitly. In this case, it is up to you to determine how best
236/// to provide a `Cache`. For example, you might put a `Cache` in thread-local
237/// storage if your use case allows for it.
238///
239/// Overall, this is an issue that happens rarely in practice, but it can
240/// happen.
241///
242/// # Warning: spin-locks may be used in alloc-only mode
243///
244/// When this crate is built without the `std` feature and the high level APIs
245/// on a `Regex` are used, then a spin-lock will be used to synchronize access
246/// to an internal pool of `Cache` values. This may be undesirable because
247/// a spin-lock is [effectively impossible to implement correctly in user
248/// space][spinlocks-are-bad]. That is, more concretely, the spin-lock could
249/// result in a deadlock.
250///
251/// [spinlocks-are-bad]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
252///
253/// If one wants to avoid the use of spin-locks when the `std` feature is
254/// disabled, then you must use APIs that accept a `Cache` value explicitly.
255/// For example, [`Regex::try_find`].
pub struct Regex<'a> {
    /// The actual regex implementation.
    ///
    /// It lives behind an `Arc`, so cloning a `Regex` shares this part
    /// instead of copying it.
    imp: Arc<RegexI<'a>>,
    /// A thread safe pool of caches.
    ///
    /// For the higher level search APIs, a `Cache` is automatically plucked
    /// from this pool before running a search. The lower level `try_*`
    /// methods permit the caller to provide their own cache, thereby
    /// bypassing accesses to this pool.
    ///
    /// Note that we put this outside the `Arc` so that cloning a `Regex`
    /// results in creating a fresh `CachePool`. This in turn permits callers
    /// to clone regexes into separate threads where each such regex gets
    /// the pool's "thread owner" optimization. Otherwise, if one shares the
    /// `Regex` directly, then the pool will go through a slower mutex path for
    /// all threads except for the "owner."
    pool: Pool<Cache>,
}
274
/// The internal implementation of `Regex`, split out so that it can be wrapped
/// in an `Arc`.
///
/// The value is created with `re` still uninitialized; `Regex::builder` writes
/// `re` as its last step, after the NFA has been built against `config`.
struct RegexI<'a> {
    /// The core matching engine.
    ///
    /// Stored as `MaybeUninit` because it can only be built after `config`
    /// has reached its final address. The `Drop` and `Deref` impls treat it
    /// as initialized.
    re: MaybeUninit<BoundedBacktracker>,
    /// [`IbMatcher`]s in [`NFA`] states may have references to this config due to `shallow_clone()`, i.e. self-references.
    /// We must keep it alive and not move it.
    /// That's also the main reason why we wrap it into `Arc` (the core part of `BoundedBacktracker` is already `Arc`ed).
    config: MatchConfig<'a>,
    /// Marks the type as `!Unpin`, documenting that the value must stay at a
    /// fixed address because of the self-references described on `config`.
    _pin: PhantomPinned,
}
286
287/// `Cache::new` doesn't really need `&BoundedBacktracker`, so...
288fn create_cache() -> Cache {
289 Cache::new(unsafe { &*(8 as *const _) })
290}
291
292#[bon]
293impl<'a> Regex<'a> {
294 pub fn new(pattern: &str) -> Result<Self, BuildError> {
295 Self::builder().build(pattern)
296 }
297
298 pub fn config() -> thompson::Config {
299 thompson::Config::new()
300 }
301
302 /// Return a builder for configuring the construction of a `Regex`.
303 ///
304 /// This is a convenience routine to avoid needing to import the
305 /// [`Builder`] type in common cases.
306 ///
307 /// # Example: change the line terminator
308 ///
309 /// This example shows how to enable multi-line mode by default and change
310 /// the line terminator to the NUL byte:
311 ///
312 /// ```
313 /// use ib_matcher::regex::{cp::Regex, util::{syntax, look::LookMatcher}, Match};
314 ///
315 /// let mut lookm = LookMatcher::new();
316 /// lookm.set_line_terminator(b'\x00');
317 /// let re = Regex::builder()
318 /// .syntax(syntax::Config::new().multi_line(true))
319 /// .configure(Regex::config().look_matcher(lookm))
320 /// .build(r"^foo$")?;
321 /// let hay = "\x00foo\x00";
322 /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
323 ///
324 /// # Ok::<(), Box<dyn std::error::Error>>(())
325 /// ```
326 #[builder(builder_type = Builder, finish_fn(name = build_many_from_hir, doc {
327 /// Builds a `Regex` directly from many `Hir` expressions.
328 ///
329 /// This is useful if you needed to parse pattern strings into `Hir`
330 /// expressions for other reasons (such as analysis or transformations).
331 /// This routine permits building a `Regex` directly from the `Hir`
332 /// expressions instead of first converting the `Hir` expressions back to
333 /// pattern strings.
334 ///
335 /// When using this method, any options set via [`Builder::syntax`] are
336 /// ignored. Namely, the syntax options only apply when parsing a pattern
337 /// string, which isn't relevant here.
338 ///
339 /// If there was a problem building the underlying regex matcher for the
340 /// given `Hir` expressions, then an error is returned.
341 ///
342 /// Note that unlike [`Builder::build_many`], this can only fail as a
343 /// result of building the underlying matcher. In that case, there is
344 /// no single `Hir` expression that can be isolated as a reason for the
345 /// failure. So if this routine fails, it's not possible to determine which
346 /// `Hir` expression caused the failure.
347 ///
348 /// # Example
349 ///
350 /// This example shows how one can hand-construct multiple `Hir`
351 /// expressions and build a single regex from them without doing any
352 /// parsing at all.
353 ///
354 /// ```
355 /// use ib_matcher::{
356 /// regex::{cp::Regex, Match},
357 /// syntax::regex::hir::{Hir, Look},
358 /// };
359 ///
360 /// // (?Rm)^foo$
361 /// let hir1 = Hir::concat(vec![
362 /// Hir::look(Look::StartCRLF),
363 /// Hir::literal("foo".as_bytes()),
364 /// Hir::look(Look::EndCRLF),
365 /// ]);
366 /// // (?Rm)^bar$
367 /// let hir2 = Hir::concat(vec![
368 /// Hir::look(Look::StartCRLF),
369 /// Hir::literal("bar".as_bytes()),
370 /// Hir::look(Look::EndCRLF),
371 /// ]);
372 /// let re = Regex::builder()
373 /// .build_many_from_hir(vec![hir1, hir2])?;
374 /// let hay = "\r\nfoo\r\nbar";
375 /// let got: Vec<Match> = re.find_iter(hay).collect();
376 /// let expected = vec![
377 /// Match::must(0, 2..5),
378 /// Match::must(1, 7..10),
379 /// ];
380 /// assert_eq!(expected, got);
381 ///
382 /// Ok::<(), Box<dyn std::error::Error>>(())
383 /// ```
384 }))]
385 pub fn builder(
386 #[builder(field)] syntax: util::syntax::Config,
387 #[cfg(feature = "regex-callback")]
388 #[builder(field)]
389 callbacks: Vec<(String, Callback)>,
390 #[builder(finish_fn)] hirs: Vec<Hir>,
391 /// Thompson NFA config. Named `configure` to be compatible with [`regex_automata::meta::Builder`]. Although some fields are not supported and `utf8_empty` is named as `utf8` instead.
392 #[builder(default)]
393 configure: thompson::Config,
394 /// [`IbMatcher`] config.
395 #[builder(default = MatchConfig::builder().case_insensitive(false).build())]
396 ib: MatchConfig<'a>,
397 /// `IbMatcher` pattern parser.
398 ///
399 /// ### Example
400 /// ```
401 /// use ib_matcher::{regex::cp::Regex, matcher::{MatchConfig, pattern::Pattern}};
402 ///
403 /// let re = Regex::builder()
404 /// .ib(MatchConfig::builder().pinyin(Default::default()).build())
405 /// .ib_parser(&mut |pattern| Pattern::parse_ev(pattern).call())
406 /// .build("pinyin;py")
407 /// .unwrap();
408 /// assert!(re.is_match("拼音搜索"));
409 /// assert!(re.is_match("pinyin") == false);
410 /// ```
411 /// See [`crate::syntax::ev`] for more details.
412 mut ib_parser: Option<&mut dyn FnMut(&str) -> Pattern<str>>,
413 #[builder(default = backtrack::Config::new().visited_capacity(usize::MAX / 8))]
414 mut backtrack: backtrack::Config,
415 ) -> Result<Self, BuildError> {
416 _ = syntax;
417 #[cfg(test)]
418 dbg!(&hirs);
419
420 let mut imp = Arc::new(RegexI {
421 re: MaybeUninit::uninit(),
422 config: {
423 let mut config = ib;
424 config.starts_with = true;
425 config
426 },
427 _pin: PhantomPinned,
428 });
429
430 let case_insensitive =
431 imp.config.plain.as_ref().is_some_and(|p| p.case_insensitive);
432 #[cfg(feature = "perf-literal-substring")]
433 #[allow(unused_mut)]
434 let mut first_byte = hir::literal::extract_first_byte(&hirs);
435
436 // Copy-and-patch NFA
437 let (hirs, literals) = hir::fold::fold_literal_utf8(hirs.into_iter());
438 let mut nfa: NFA = thompson::Compiler::new()
439 .configure(configure)
440 .build_many_from_hir(&hirs)?
441 .into();
442 let count = literals.len();
443 #[cfg(feature = "regex-callback")]
444 let count = {
445 let mut count = count;
446 for (literal, callback) in callbacks {
447 for i in literals.iter().positions(|l| l == &literal) {
448 #[cfg(feature = "perf-literal-substring")]
449 first_byte.take_if(|b| literal.as_bytes()[0] == *b);
450
451 nfa.patch_first_byte(i as u8, |next| {
452 crate::regex::nfa::State::Callback {
453 callback: callback.clone(),
454 next,
455 }
456 });
457 count -= 1;
458 }
459 }
460 count
461 };
462 nfa.patch_bytes_to_matchers(literals.len() as u8, count, |b| {
463 let pattern = literals[b as usize].as_str();
464 let pattern = if let Some(ib_parser) = ib_parser.as_mut() {
465 ib_parser(pattern)
466 } else {
467 pattern.into()
468 };
469
470 // `shallow_clone()` requires `config` cannot be moved
471 let config: MatchConfig<'static> =
472 unsafe { transmute(imp.config.shallow_clone()) };
473 IbMatcher::with_config(pattern, config)
474 });
475 #[cfg(test)]
476 dbg!(&nfa);
477
478 // Engine
479 #[cfg(feature = "perf-literal-substring")]
480 if let Some(b) = first_byte {
481 backtrack.pre_ib =
482 Some(PrefilterIb::byte2_or_non_ascii(b, case_insensitive));
483 }
484 let re = BoundedBacktracker::builder()
485 .configure(backtrack)
486 .build_from_nfa(nfa)?;
487 unsafe { Arc::get_mut(&mut imp).unwrap_unchecked().re.write(re) };
488
489 Ok(Self { imp, pool: Pool::new(create_cache) })
490 }
491}
492
493impl<'a, S: builder::State> Builder<'a, '_, S> {
494 /// Configure the syntax options when parsing a pattern string while
495 /// building a `Regex`.
496 ///
497 /// These options _only_ apply when [`Builder::build`] or [`Builder::build_many`]
498 /// are used. The other build methods accept `Hir` values, which have
499 /// already been parsed.
500 ///
501 /// # Example
502 ///
503 /// This example shows how to enable case insensitive mode.
504 ///
505 /// ```
506 /// use ib_matcher::regex::{cp::Regex, util::syntax, Match};
507 ///
508 /// let re = Regex::builder()
509 /// .syntax(syntax::Config::new().case_insensitive(true))
510 /// .build(r"δ")?;
511 /// assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ"));
512 ///
513 /// Ok::<(), Box<dyn std::error::Error>>(())
514 /// ```
515 pub fn syntax(mut self, syntax: util::syntax::Config) -> Self {
516 self.syntax = syntax;
517 self
518 }
519
520 /// Add a [custom matching callback](Regex#custom-matching-callbacks).
521 #[cfg(feature = "regex-callback")]
522 pub fn callback(
523 mut self,
524 literal: impl Into<String>,
525 callback: impl Fn(&Input, usize, &mut dyn FnMut(usize)) + 'static,
526 ) -> Self {
527 self.callbacks.push((literal.into(), Arc::new(callback)));
528 self
529 }
530
531 /// Builds a `Regex` from a single pattern string.
532 ///
533 /// If there was a problem parsing the pattern or a problem turning it into
534 /// a regex matcher, then an error is returned.
535 ///
536 /// # Example
537 ///
538 /// This example shows how to configure syntax options.
539 ///
540 /// ```
541 /// use ib_matcher::regex::{cp::Regex, util::syntax, Match};
542 ///
543 /// let re = Regex::builder()
544 /// .syntax(syntax::Config::new().crlf(true).multi_line(true))
545 /// .build(r"^foo$")?;
546 /// let hay = "\r\nfoo\r\n";
547 /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
548 ///
549 /// # Ok::<(), Box<dyn std::error::Error>>(())
550 /// ```
551 pub fn build(self, pattern: &str) -> Result<Regex<'a>, BuildError>
552 where
553 S: builder::IsComplete,
554 {
555 self.build_many(&[pattern])
556 }
557
558 /// Builds a `Regex` from many pattern strings.
559 ///
560 /// If there was a problem parsing any of the patterns or a problem turning
561 /// them into a regex matcher, then an error is returned.
562 ///
563 /// # Example: zero patterns is valid
564 ///
565 /// Building a regex with zero patterns results in a regex that never
566 /// matches anything. Because this routine is generic, passing an empty
567 /// slice usually requires a turbo-fish (or something else to help type
568 /// inference).
569 ///
570 /// ```
571 /// use ib_matcher::regex::{cp::Regex, util::syntax, Match};
572 ///
573 /// let re = Regex::builder()
574 /// .build_many::<&str>(&[])?;
575 /// assert_eq!(None, re.find(""));
576 ///
577 /// # Ok::<(), Box<dyn std::error::Error>>(())
578 /// ```
579 pub fn build_many<P: AsRef<str>>(
580 self,
581 patterns: &[P],
582 ) -> Result<Regex<'a>, BuildError>
583 where
584 S: builder::IsComplete,
585 {
586 // Bypass case_fold_char()
587 // case_insensitive class and (?i) will be broken
588 // .case_insensitive(false)
589 let syntax = self.syntax;
590
591 // Parse
592 let hirs = patterns
593 .into_iter()
594 .map(|pattern| {
595 let pattern = pattern.as_ref();
596 regex_automata::util::syntax::parse_with(pattern, &syntax)
597 .map_err(|_| {
598 // Shit
599 thompson::Compiler::new()
600 .syntax(syntax)
601 .build(pattern)
602 .unwrap_err()
603 })
604 })
605 .try_collect()?;
606 self.build_many_from_hir(hirs)
607 }
608
609 /// Builds a `Regex` directly from an `Hir` expression.
610 ///
611 /// This is useful if you needed to parse a pattern string into an `Hir`
612 /// for other reasons (such as analysis or transformations). This routine
613 /// permits building a `Regex` directly from the `Hir` expression instead
614 /// of first converting the `Hir` back to a pattern string.
615 ///
616 /// When using this method, any options set via [`Builder::syntax`] are
617 /// ignored. Namely, the syntax options only apply when parsing a pattern
618 /// string, which isn't relevant here.
619 ///
620 /// If there was a problem building the underlying regex matcher for the
621 /// given `Hir`, then an error is returned.
622 ///
623 /// # Example
624 ///
625 /// This example shows how one can hand-construct an `Hir` expression and
626 /// build a regex from it without doing any parsing at all.
627 ///
628 /// ```
629 /// use ib_matcher::{
630 /// regex::{cp::Regex, Match},
631 /// syntax::regex::hir::{Hir, Look},
632 /// };
633 ///
634 /// // (?Rm)^foo$
635 /// let hir = Hir::concat(vec![
636 /// Hir::look(Look::StartCRLF),
637 /// Hir::literal("foo".as_bytes()),
638 /// Hir::look(Look::EndCRLF),
639 /// ]);
640 /// let re = Regex::builder()
641 /// .build_from_hir(hir)?;
642 /// let hay = "\r\nfoo\r\n";
643 /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
644 ///
645 /// Ok::<(), Box<dyn std::error::Error>>(())
646 /// ```
647 pub fn build_from_hir(self, hir: Hir) -> Result<Regex<'a>, BuildError>
648 where
649 S: builder::IsComplete,
650 {
651 self.build_many_from_hir(vec![hir])
652 }
653}
654
655impl Clone for Regex<'_> {
656 fn clone(&self) -> Self {
657 Regex { imp: self.imp.clone(), pool: Pool::new(create_cache) }
658 }
659}
660
661impl Drop for RegexI<'_> {
662 fn drop(&mut self) {
663 unsafe { self.re.assume_init_drop() };
664 }
665}
666
667/// High level convenience routines for using a regex to search a haystack.
668impl<'a> Regex<'a> {
669 /// Returns true if and only if this regex matches the given haystack.
670 ///
671 /// This routine may short circuit if it knows that scanning future input
672 /// will never lead to a different result. (Consider how this might make
673 /// a difference given the regex `a+` on the haystack `aaaaaaaaaaaaaaa`.
674 /// This routine _may_ stop after it sees the first `a`, but routines like
675 /// `find` need to continue searching because `+` is greedy by default.)
676 ///
677 /// # Example
678 ///
679 /// ```
680 /// use ib_matcher::regex::cp::Regex;
681 ///
682 /// let re = Regex::new("foo[0-9]+bar")?;
683 ///
684 /// assert!(re.is_match("foo12345bar"));
685 /// assert!(!re.is_match("foobar"));
686 ///
687 /// # Ok::<(), Box<dyn std::error::Error>>(())
688 /// ```
689 ///
690 /// # Example: consistency with search APIs
691 ///
692 /// `is_match` is guaranteed to return `true` whenever `find` returns a
693 /// match. This includes searches that are executed entirely within a
694 /// codepoint:
695 ///
696 /// ```
697 /// use ib_matcher::regex::{cp::Regex, Input};
698 ///
699 /// let re = Regex::new("a*")?;
700 ///
701 /// // This doesn't match because the default configuration bans empty
702 /// // matches from splitting a codepoint.
703 /// assert!(!re.is_match(Input::new("☃").span(1..2)));
704 /// assert_eq!(None, re.find(Input::new("☃").span(1..2)));
705 ///
706 /// # Ok::<(), Box<dyn std::error::Error>>(())
707 /// ```
708 ///
709 /// Notice that when UTF-8 mode is disabled, then the above reports a
710 /// match because the restriction against zero-width matches that split a
711 /// codepoint has been lifted:
712 ///
713 /// ```
714 /// use ib_matcher::regex::{cp::Regex, Input, Match};
715 ///
716 /// let re = Regex::builder()
717 /// .configure(Regex::config().utf8(false))
718 /// .build("a*")?;
719 ///
720 /// assert!(re.is_match(Input::new("☃").span(1..2)));
721 /// assert_eq!(
722 /// Some(Match::must(0, 1..1)),
723 /// re.find(Input::new("☃").span(1..2)),
724 /// );
725 ///
726 /// # Ok::<(), Box<dyn std::error::Error>>(())
727 /// ```
728 ///
729 /// A similar idea applies when using line anchors with CRLF mode enabled,
730 /// which prevents them from matching between a `\r` and a `\n`.
731 ///
732 /// ```
733 /// use ib_matcher::regex::{cp::Regex, Input, Match};
734 ///
735 /// let re = Regex::new(r"(?Rm:$)")?;
736 /// assert!(!re.is_match(Input::new("\r\n").span(1..1)));
737 /// // A regular line anchor, which only considers \n as a
738 /// // line terminator, will match.
739 /// let re = Regex::new(r"(?m:$)")?;
740 /// assert!(re.is_match(Input::new("\r\n").span(1..1)));
741 ///
742 /// # Ok::<(), Box<dyn std::error::Error>>(())
743 /// ```
744 #[inline]
745 pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
746 let input = input.into().earliest(true);
747 let mut guard = self.pool.get();
748 self.try_is_match(&mut guard, input).unwrap()
749 }
750
751 /// Executes a leftmost search and returns the first match that is found,
752 /// if one exists.
753 ///
754 /// # Example
755 ///
756 /// ```
757 /// use ib_matcher::regex::{cp::Regex, Match};
758 ///
759 /// let re = Regex::new("foo[0-9]+")?;
760 /// assert_eq!(Some(Match::must(0, 0..8)), re.find("foo12345"));
761 ///
762 /// # Ok::<(), Box<dyn std::error::Error>>(())
763 /// ```
764 #[inline]
765 pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
766 let input = input.into();
767 let mut guard = self.pool.get();
768 self.try_find(&mut guard, input).unwrap()
769 }
770
771 /// Executes a leftmost forward search and writes the spans of capturing
772 /// groups that participated in a match into the provided [`Captures`]
773 /// value. If no match was found, then [`Captures::is_match`] is guaranteed
774 /// to return `false`.
775 ///
776 /// # Example
777 ///
778 /// ```
779 /// use ib_matcher::regex::{cp::Regex, Span};
780 ///
781 /// let re = Regex::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
782 /// let mut caps = re.create_captures();
783 ///
784 /// re.captures("2010-03-14", &mut caps);
785 /// assert!(caps.is_match());
786 /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
787 /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
788 /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
789 ///
790 /// # Ok::<(), Box<dyn std::error::Error>>(())
791 /// ```
792 #[inline]
793 pub fn captures<'h, I: Into<Input<'h>>>(
794 &self,
795 input: I,
796 caps: &mut Captures,
797 ) -> Result<(), MatchError> {
798 let input = input.into();
799 let mut guard = self.pool.get();
800 self.try_captures(&mut guard, input, caps)
801 }
802
803 /// Returns an iterator over all non-overlapping leftmost matches in
804 /// the given haystack. If no match exists, then the iterator yields no
805 /// elements.
806 ///
807 /// # Example
808 ///
809 /// ```
810 /// use ib_matcher::regex::{cp::Regex, Match};
811 ///
812 /// let re = Regex::new("foo[0-9]+")?;
813 /// let haystack = "foo1 foo12 foo123";
814 /// let matches: Vec<Match> = re.find_iter(haystack).collect();
815 /// assert_eq!(matches, vec![
816 /// Match::must(0, 0..4),
817 /// Match::must(0, 5..10),
818 /// Match::must(0, 11..17),
819 /// ]);
820 /// # Ok::<(), Box<dyn std::error::Error>>(())
821 /// ```
    #[inline]
    pub fn find_iter<'h, I: Into<Input<'h>>>(
        &'h self,
        input: I,
    ) -> impl Iterator<Item = Match> + 'h {
        let input = input.into();
        // The pool guard has to live as long as the returned iterator, yet the
        // iterator also needs mutable access to the cache behind it. The guard
        // is therefore put in an `UnsafeCell`: the mutable reference is taken
        // through the cell's raw pointer, and the cell itself is moved into
        // the closure below so it is dropped together with the iterator.
        let guard = UnsafeCell::new(self.pool.get());
        // NOTE(review): the reference handed to `try_find_iter` outlives the
        // move of `guard` into the closure; this presumably relies on the
        // cache not living inline in the guard. Confirm against `Pool`.
        self.try_find_iter(unsafe { &mut *guard.get() }, input).map(move |r| {
            // Referencing `guard` forces the closure to capture (own) it.
            let _guard = &guard;
            // Search errors surface as a panic in this convenience API.
            r.unwrap()
        })
    }
834
835 /// Returns an iterator over all non-overlapping `Captures` values. If no
836 /// match exists, then the iterator yields no elements.
837 ///
838 /// This yields the same matches as [`Regex::find_iter`], but it includes
839 /// the spans of all capturing groups that participate in each match.
840 ///
841 /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
842 /// how to correctly iterate over all matches in a haystack while avoiding
843 /// the creation of a new `Captures` value for every match. (Which you are
844 /// forced to do with an `Iterator`.)
845 ///
846 /// # Example
847 ///
848 /// ```
849 /// use ib_matcher::regex::{cp::Regex, Span};
850 ///
851 /// let re = Regex::new("foo(?P<numbers>[0-9]+)")?;
852 ///
853 /// let haystack = "foo1 foo12 foo123";
854 /// let matches: Vec<Span> = re
855 /// .captures_iter(haystack)
856 /// // The unwrap is OK since 'numbers' matches if the pattern matches.
857 /// .map(|caps| caps.get_group_by_name("numbers").unwrap())
858 /// .collect();
859 /// assert_eq!(matches, vec![
860 /// Span::from(3..4),
861 /// Span::from(8..10),
862 /// Span::from(14..17),
863 /// ]);
864 /// # Ok::<(), Box<dyn std::error::Error>>(())
865 /// ```
    #[inline]
    pub fn captures_iter<'h, I: Into<Input<'h>>>(
        &'h self,
        input: I,
    ) -> impl Iterator<Item = Captures> + 'h {
        let input = input.into();
        // The pool guard has to live as long as the returned iterator, yet the
        // iterator also needs mutable access to the cache behind it. The guard
        // is therefore put in an `UnsafeCell`: the mutable reference is taken
        // through the cell's raw pointer, and the cell itself is moved into
        // the closure below so it is dropped together with the iterator.
        let guard = UnsafeCell::new(self.pool.get());
        // NOTE(review): the reference handed to `try_captures_iter` outlives
        // the move of `guard` into the closure; this presumably relies on the
        // cache not living inline in the guard. Confirm against `Pool`.
        self.try_captures_iter(unsafe { &mut *guard.get() }, input).map(
            move |r| {
                // Referencing `guard` forces the closure to capture (own) it.
                let _guard = &guard;
                // Search errors surface as a panic in this convenience API.
                r.unwrap()
            },
        )
    }
880}
881
882impl Deref for Regex<'_> {
883 type Target = BoundedBacktracker;
884
885 fn deref(&self) -> &Self::Target {
886 unsafe { self.imp.re.assume_init_ref() }
887 }
888}
889
890#[cfg(test)]
891mod tests {
892 use regex_automata::Match;
893 use regex_syntax::hir::Look;
894
895 use crate::{
896 matcher::{PinyinMatchConfig, RomajiMatchConfig},
897 pinyin::PinyinNotation,
898 };
899
900 use super::*;
901
902 #[test]
903 fn literal() {
904 let re = Regex::builder()
905 .ib(MatchConfig::builder()
906 .pinyin(PinyinMatchConfig::notations(
907 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
908 ))
909 .build())
910 .build("pyss")
911 .unwrap();
912
913 let mut cache = re.create_cache();
914 assert_eq!(
915 re.try_find(&mut cache, "pyss").unwrap(),
916 Some(Match::must(0, 0..4)),
917 );
918 assert_eq!(
919 re.try_find(&mut cache, "apyss").unwrap(),
920 Some(Match::must(0, 1..5)),
921 );
922 assert_eq!(
923 re.try_find(&mut cache, "拼音搜索").unwrap(),
924 Some(Match::must(0, 0..12)),
925 );
926
927 assert_eq!(re.find("pyss"), Some(Match::must(0, 0..4)),);
928 }
929
930 #[test]
931 fn case() {
932 let re = Regex::builder()
933 .syntax(util::syntax::Config::new().case_insensitive(true))
934 .build(r"δ")
935 .unwrap();
936 assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ"));
937 }
938
939 #[test]
940 fn alt() {
941 let pinyin = PinyinMatchConfig::notations(
942 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
943 );
944
945 let re = Regex::builder().build("samwise|sam").unwrap();
946 assert_eq!(Some(Match::must(0, 0..3)), re.find("sam"));
947
948 let re = Regex::builder()
949 .ib(MatchConfig::builder().pinyin(pinyin.shallow_clone()).build())
950 .build("samwise|pyss")
951 .unwrap();
952 assert_eq!(Some(Match::must(0, 0..12)), re.find("拼音搜索"));
953 }
954
955 #[test]
956 fn wildcard() {
957 let re = Regex::builder()
958 .ib(MatchConfig::builder()
959 .pinyin(PinyinMatchConfig::notations(
960 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
961 ))
962 .romaji(RomajiMatchConfig::default())
963 .build())
964 .build("raki.suta")
965 .unwrap();
966
967 assert_eq!(re.max_haystack_len(), 0x1111111111111110);
968 let mut cache = re.create_cache();
969 assert_eq!(cache.memory_usage(), 0);
970 assert_eq!(
971 re.try_find(&mut cache, "¥らき☆すた").unwrap(),
972 Some(Match::must(0, 3..18)),
973 );
974 // 2 * 16 + (alignup(16 * (18+1) / 8, 8) = 40)
975 assert_eq!(cache.memory_usage(), 72);
976
977 let re = Regex::builder()
978 .ib(MatchConfig::builder()
979 .pinyin(PinyinMatchConfig::notations(
980 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
981 ))
982 .build())
983 .build("p.*y.*s.*s")
984 .unwrap();
985 let mut cache = re.create_cache();
986 assert_eq!(
987 re.try_find(&mut cache, "拼a音b搜c索d").unwrap(),
988 Some(Match::must(0, 0..15)),
989 );
990 }
991
992 #[test]
993 fn mix_lang() {
994 let pinyin = PinyinMatchConfig::notations(
995 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
996 );
997 let romaji = RomajiMatchConfig::default();
998
999 let re = Regex::builder()
1000 .ib(MatchConfig::builder()
1001 .pinyin(pinyin.shallow_clone())
1002 .romaji(romaji.shallow_clone())
1003 .build())
1004 .build("pysousuosousounofuri-ren")
1005 .unwrap();
1006 let mut cache = re.create_cache();
1007 assert_eq!(
1008 re.try_find(&mut cache, "拼音搜索葬送のフリーレン").unwrap(),
1009 None
1010 );
1011
1012 let re = Regex::builder()
1013 .ib(MatchConfig::builder()
1014 .pinyin(pinyin.shallow_clone())
1015 .romaji(romaji.shallow_clone())
1016 .mix_lang(true)
1017 .build())
1018 .build("pysousuosousounofuri-ren")
1019 .unwrap();
1020 assert_eq!(
1021 re.find("拼音搜索葬送のフリーレン"),
1022 Some(Match::must(0, 0..36)),
1023 );
1024
1025 let re = Regex::builder()
1026 .ib(MatchConfig::builder()
1027 .pinyin(pinyin.shallow_clone())
1028 .romaji(romaji.shallow_clone())
1029 .build())
1030 .build("(pysousuo)(sousounofuri-ren)")
1031 .unwrap();
1032 let mut cache = re.create_cache();
1033 assert_eq!(
1034 re.try_find(&mut cache, "拼音搜索葬送のフリーレン").unwrap(),
1035 Some(Match::must(0, 0..36)),
1036 );
1037
1038 let re = Regex::builder()
1039 .ib(MatchConfig::builder()
1040 .pinyin(pinyin.shallow_clone())
1041 .romaji(romaji.shallow_clone())
1042 .build())
1043 .build("pysousuo.*?sousounofuri-ren")
1044 .unwrap();
1045 let mut cache = re.create_cache();
1046 assert_eq!(
1047 re.try_find(&mut cache, "拼音搜索⭐葬送のフリーレン").unwrap(),
1048 Some(Match::must(0, 0..39)),
1049 );
1050 }
1051
1052 #[test]
1053 fn look() {
1054 // (?Rm)^foo$
1055 let hir1 = Hir::concat(vec![
1056 Hir::look(Look::StartCRLF),
1057 Hir::literal("foo".as_bytes()),
1058 Hir::look(Look::EndCRLF),
1059 ]);
1060 // (?Rm)^bar$
1061 let hir2 = Hir::concat(vec![
1062 Hir::look(Look::StartCRLF),
1063 Hir::literal("bar".as_bytes()),
1064 Hir::look(Look::EndCRLF),
1065 ]);
1066 let re =
1067 Regex::builder().build_many_from_hir(vec![hir1, hir2]).unwrap();
1068 let hay = "\r\nfoo\r\nbar";
1069 let got: Vec<Match> = re.find_iter(hay).collect();
1070 let expected = vec![Match::must(0, 2..5), Match::must(1, 7..10)];
1071 assert_eq!(expected, got);
1072 }
1073
1074 #[cfg(feature = "regex-callback")]
1075 #[test]
1076 fn callback() {
1077 use std::{cell::RefCell, rc::Rc};
1078
1079 let re = Regex::builder()
1080 .callback("ascii", |input, at, push| {
1081 let haystack = &input.haystack()[at..];
1082 if haystack.get(0).is_some_and(|c| c.is_ascii()) {
1083 push(1);
1084 }
1085 })
1086 .build(r"(ascii)+\d(ascii)+")
1087 .unwrap();
1088 assert_eq!(re.find("that4U this4me"), Some(Match::must(0, 8..16)));
1089
1090 let count = Rc::new(RefCell::new(0));
1091 let re = Regex::builder()
1092 .callback("open_quote", {
1093 let count = count.clone();
1094 move |input, at, push| {
1095 if at < 2 || input.haystack()[at - 2] != b'\\' {
1096 let mut count = count.borrow_mut();
1097 *count += 1;
1098 push(0);
1099 }
1100 }
1101 })
1102 .callback("close_quote", move |input, at, push| {
1103 if at < 2 || input.haystack()[at - 2] != b'\\' {
1104 let mut count = count.borrow_mut();
1105 if *count > 0 {
1106 push(0);
1107 }
1108 *count -= 1;
1109 }
1110 })
1111 // '([^'\\]+?|\\')*'
1112 .build(r"'(open_quote).*?'(close_quote)")
1113 .unwrap();
1114 let hay = r"'one' 'two\'three' 'four'";
1115 assert_eq!(
1116 re.find_iter(hay).map(|m| &hay[m.span()]).collect::<Vec<_>>(),
1117 vec!["'one'", r"'two\'three'", "'four'"]
1118 );
1119
1120 let re = Regex::builder()
1121 .callback("lookahead_is_ascii", |input, at, push| {
1122 let haystack = &input.haystack()[at..];
1123 if haystack.get(0).is_some_and(|c| c.is_ascii()) {
1124 push(0);
1125 }
1126 })
1127 .build(r"(?-u)[\x00-\x7f]+?\d(lookahead_is_ascii)")
1128 .unwrap();
1129 let hay = "that4U,this4me1plz";
1130 assert_eq!(
1131 re.find_iter(hay).map(|m| &hay[m.span()]).collect::<Vec<_>>(),
1132 vec![",this4", "me1"]
1133 );
1134 }
1135}