ib_matcher/regex/cp/
regex.rs

1use std::{
2    cell::UnsafeCell,
3    marker::PhantomPinned,
4    mem::{transmute, MaybeUninit},
5    ops::Deref,
6    sync::Arc,
7};
8
9use bon::bon;
10use itertools::Itertools;
11use regex_syntax::hir::Hir;
12
13#[cfg(feature = "regex-callback")]
14use crate::regex::nfa::Callback;
15use crate::{
16    matcher::{pattern::Pattern, IbMatcher, MatchConfig},
17    regex::{
18        nfa::{
19            backtrack::{self, BoundedBacktracker},
20            thompson::{self},
21            NFA,
22        },
23        util::{self, captures::Captures, pool::Pool, prefilter::PrefilterIb},
24        Input, Match, MatchError,
25    },
26    syntax::regex::hir,
27};
28
29pub use crate::regex::nfa::{
30    backtrack::{Cache, Config, TryCapturesMatches, TryFindMatches},
31    thompson::BuildError,
32};
33
34/// A compiled regular expression for searching Unicode haystacks.
35///
36/// A `Regex` can be used to search haystacks, split haystacks into substrings
37/// or replace substrings in a haystack with a different substring. All
38/// searching is done with an implicit `(?s:.)*?` at the beginning and end of
/// a pattern. To force an expression to match the whole string (or a prefix
40/// or a suffix), you can use anchored search or an anchor like `^` or `$` (or `\A` and `\z`).
41/**
42# Overview
43
44The most important methods are as follows:
45
46* [`Regex::new`] compiles a regex using the default configuration. A
47[`Builder`] permits setting a non-default configuration. (For example,
48case insensitive matching, verbose mode and others.)
49* [`Regex::is_match`] reports whether a match exists in a particular haystack.
50* [`Regex::find`] reports the byte offsets of a match in a haystack, if one
51exists. [`Regex::find_iter`] returns an iterator over all such matches.
52* [`Regex::captures`] returns a [`Captures`], which reports both the byte
53offsets of a match in a haystack and the byte offsets of each matching capture
54group from the regex in the haystack.
55[`Regex::captures_iter`] returns an iterator over all such matches.
56*/
57/// # Example
58///
59/// ```
60/// use ib_matcher::regex::cp::Regex;
61///
62/// let re = Regex::new(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")?;
63/// assert!(re.is_match("2010-03-14"));
64///
65/// # Ok::<(), Box<dyn std::error::Error>>(())
66/// ```
67/**
68With `IbMatcher`'s Chinese pinyin and Japanese romaji matching:
69```
70// cargo add ib-matcher --features regex,pinyin,romaji
71use ib_matcher::{
72    matcher::{MatchConfig, PinyinMatchConfig, RomajiMatchConfig},
73    regex::{cp::Regex, Match},
74};
75
76let config = MatchConfig::builder()
77    .pinyin(PinyinMatchConfig::default())
78    .romaji(RomajiMatchConfig::default())
79    .build();
80
81let re = Regex::builder()
82    .ib(config.shallow_clone())
83    .build("raki.suta")
84    .unwrap();
85assert_eq!(re.find("「らき☆すた」"), Some(Match::must(0, 3..18)));
86
87let re = Regex::builder()
88    .ib(config.shallow_clone())
89    .build("pysou.*?(any|every)thing")
90    .unwrap();
91assert_eq!(re.find("拼音搜索Everything"), Some(Match::must(0, 0..22)));
92
93let config = MatchConfig::builder()
94    .pinyin(PinyinMatchConfig::default())
95    .romaji(RomajiMatchConfig::default())
96    .mix_lang(true)
97    .build();
98let re = Regex::builder()
99    .ib(config.shallow_clone())
100    .build("(?x)^zangsounofuri-?ren # Mixing pinyin and romaji")
101    .unwrap();
102assert_eq!(re.find("葬送のフリーレン"), Some(Match::must(0, 0..24)));
103```
104*/
105/// For more examples and the syntax, see [`crate::regex`].
106///
107/// # Case insensitivity
108/// To enable case insensitivity:
109/// ```
110/// use ib_matcher::{matcher::{PinyinMatchConfig, PlainMatchConfig, MatchConfig}, regex::cp::Regex};
111///
112/// let re = Regex::builder().ib(MatchConfig::default()).build("foo").unwrap();
113/// assert!(re.is_match("FOO"));
114///
115/// // Alternatively, with `case_insensitive()`:
116/// let re = Regex::builder()
117///     .ib(MatchConfig::builder()
118///         .case_insensitive(true)
119///         .pinyin(PinyinMatchConfig::default())
120///         .build())
121///     .build("pyss")
122///     .unwrap();
123/// assert!(re.is_match("PY搜索"));
124/// ```
/// Note that enabling `syntax.case_insensitive` currently breaks `ib` (i.e. pinyin and romaji matching). For now, only set [`MatchConfigBuilder::case_insensitive`](crate::matcher::MatchConfigBuilder::case_insensitive) ([`PlainMatchConfigBuilder::case_insensitive`](crate::matcher::PlainMatchConfigBuilder::case_insensitive)) instead.
126///
127/// If you need case insensitive character classes, you need to write `(?i:[a-z])` instead at the moment.
128///
129/**
130# Custom matching callbacks
131Custom matching callbacks can be used to implement ad hoc look-around, backreferences, balancing groups/recursion/subroutines, combining domain-specific parsers, etc.
132
133Basic usage:
134```
135// cargo add ib-matcher --features regex,regex-callback
136use ib_matcher::regex::cp::Regex;
137
138let re = Regex::builder()
139    .callback("ascii", |input, at, push| {
140        let haystack = &input.haystack()[at..];
141        if haystack.len() > 0 && haystack[0].is_ascii() {
142            push(1);
143        }
144    })
145    .build(r"(ascii)+\d(ascii)+")
146    .unwrap();
147let hay = "that4U this4me";
148assert_eq!(&hay[re.find(hay).unwrap().span()], " this4me");
149```
150
151## Look-around
152```
153use ib_matcher::regex::cp::Regex;
154
155let re = Regex::builder()
156    .callback("lookahead_is_ascii", |input, at, push| {
157        let haystack = &input.haystack()[at..];
158        if haystack.len() > 0 && haystack[0].is_ascii() {
159            push(0);
160        }
161    })
162    .build(r"[\x00-\x7f]+?\d(lookahead_is_ascii)")
163    .unwrap();
164let hay = "that4U,this4me1plz";
165assert_eq!(
166    re.find_iter(hay).map(|m| &hay[m.span()]).collect::<Vec<_>>(),
167    vec![",this4", "me1"]
168);
169```
170
171## Balancing groups
172```
173use std::{cell::RefCell, rc::Rc};
174use ib_matcher::regex::cp::Regex;
175
176let count = Rc::new(RefCell::new(0));
177let re = Regex::builder()
178    .callback("open_quote", {
179        let count = count.clone();
180        move |input, at, push| {
181            if at < 2 || input.haystack()[at - 2] != b'\\' {
182                let mut count = count.borrow_mut();
183                *count += 1;
184                push(0);
185            }
186        }
187    })
188    .callback("close_quote", move |input, at, push| {
189        if at < 2 || input.haystack()[at - 2] != b'\\' {
190            let mut count = count.borrow_mut();
191            if *count > 0 {
192                push(0);
193            }
194            *count -= 1;
195        }
196    })
197    .build(r"'(open_quote).*?'(close_quote)")
198    .unwrap();
199let hay = r"'one' 'two\'three' 'four'";
200assert_eq!(
201    re.find_iter(hay).map(|m| &hay[m.span()]).collect::<Vec<_>>(),
202    vec!["'one'", r"'two\'three'", "'four'"]
203);
204```
205(In this simple example, just using `'([^'\\]+?|\\')*'` is actually enough, but there are more complex cases where balancing groups (or recursion/subroutines) are necessary.)
206*/
207/// # Synchronization and cloning
208///
209/// In order to make the `Regex` API convenient, most of the routines hide
210/// the fact that a `Cache` is needed at all. To achieve this, a [memory
211/// pool](automata::util::pool::Pool) is used internally to retrieve `Cache`
212/// values in a thread safe way that also permits reuse. This in turn implies
213/// that every such search call requires some form of synchronization. Usually
214/// this synchronization is fast enough to not notice, but in some cases, it
215/// can be a bottleneck. This typically occurs when all of the following are
216/// true:
217///
218/// * The same `Regex` is shared across multiple threads simultaneously,
219/// usually via a [`util::lazy::Lazy`](automata::util::lazy::Lazy) or something
220/// similar from the `once_cell` or `lazy_static` crates.
221/// * The primary unit of work in each thread is a regex search.
222/// * Searches are run on very short haystacks.
223///
224/// This particular case can lead to high contention on the pool used by a
225/// `Regex` internally, which can in turn increase latency to a noticeable
226/// effect. This cost can be mitigated in one of the following ways:
227///
228/// * Use a distinct copy of a `Regex` in each thread, usually by cloning it.
229/// Cloning a `Regex` _does not_ do a deep copy of its read-only component.
230/// But it does lead to each `Regex` having its own memory pool, which in
231/// turn eliminates the problem of contention. In general, this technique should
232/// not result in any additional memory usage when compared to sharing the same
233/// `Regex` across multiple threads simultaneously.
234/// * Use lower level APIs, like [`Regex::try_find`], which permit passing
235/// a `Cache` explicitly. In this case, it is up to you to determine how best
236/// to provide a `Cache`. For example, you might put a `Cache` in thread-local
237/// storage if your use case allows for it.
238///
239/// Overall, this is an issue that happens rarely in practice, but it can
240/// happen.
241///
242/// # Warning: spin-locks may be used in alloc-only mode
243///
244/// When this crate is built without the `std` feature and the high level APIs
245/// on a `Regex` are used, then a spin-lock will be used to synchronize access
246/// to an internal pool of `Cache` values. This may be undesirable because
247/// a spin-lock is [effectively impossible to implement correctly in user
248/// space][spinlocks-are-bad]. That is, more concretely, the spin-lock could
249/// result in a deadlock.
250///
251/// [spinlocks-are-bad]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
252///
253/// If one wants to avoid the use of spin-locks when the `std` feature is
254/// disabled, then you must use APIs that accept a `Cache` value explicitly.
255/// For example, [`Regex::try_find`].
pub struct Regex<'a> {
    /// The actual regex implementation.
    ///
    /// Held behind an `Arc`, so cloning a `Regex` only clones this handle
    /// rather than the implementation itself.
    imp: Arc<RegexI<'a>>,
    /// A thread safe pool of caches.
    ///
    /// For the higher level search APIs, a `Cache` is automatically plucked
    /// from this pool before running a search. The lower level `with` methods
    /// permit the caller to provide their own cache, thereby bypassing
    /// accesses to this pool.
    ///
    /// Note that we put this outside the `Arc` so that cloning a `Regex`
    /// results in creating a fresh `CachePool`. This in turn permits callers
    /// to clone regexes into separate threads where each such regex gets
    /// the pool's "thread owner" optimization. Otherwise, if one shares the
    /// `Regex` directly, then the pool will go through a slower mutex path for
    /// all threads except for the "owner."
    pool: Pool<Cache>,
}
274
275/// The internal implementation of `Regex`, split out so that it can be wrapped
276/// in an `Arc`.
struct RegexI<'a> {
    /// The core matching engine.
    ///
    /// Wrapped in `MaybeUninit` because the builder has to allocate this struct
    /// (so that `config` gets its final address) before the engine can be
    /// built; the engine is written into this slot afterwards. The `Drop` impl
    /// assumes the slot has been written.
    re: MaybeUninit<BoundedBacktracker>,
    /// [`IbMatcher`]s in [`NFA`] states may have references to this config due to `shallow_clone()`, i.e. self-references.
    /// We must keep it alive and not move it.
    /// That's also the main reason why we wrap it into `Arc` (the core part of `BoundedBacktracker` is already `Arc`ed).
    config: MatchConfig<'a>,
    /// Makes the type `!Unpin`, signalling that it must not be moved once the
    /// self-references into `config` exist.
    _pin: PhantomPinned,
}
286
287/// `Cache::new` doesn't really need `&BoundedBacktracker`, so...
288fn create_cache() -> Cache {
289    Cache::new(unsafe { &*(8 as *const _) })
290}
291
292#[bon]
293impl<'a> Regex<'a> {
294    pub fn new(pattern: &str) -> Result<Self, BuildError> {
295        Self::builder().build(pattern)
296    }
297
298    pub fn config() -> thompson::Config {
299        thompson::Config::new()
300    }
301
302    /// Return a builder for configuring the construction of a `Regex`.
303    ///
304    /// This is a convenience routine to avoid needing to import the
305    /// [`Builder`] type in common cases.
306    ///
307    /// # Example: change the line terminator
308    ///
309    /// This example shows how to enable multi-line mode by default and change
310    /// the line terminator to the NUL byte:
311    ///
312    /// ```
313    /// use ib_matcher::regex::{cp::Regex, util::{syntax, look::LookMatcher}, Match};
314    ///
315    /// let mut lookm = LookMatcher::new();
316    /// lookm.set_line_terminator(b'\x00');
317    /// let re = Regex::builder()
318    ///     .syntax(syntax::Config::new().multi_line(true))
319    ///     .configure(Regex::config().look_matcher(lookm))
320    ///     .build(r"^foo$")?;
321    /// let hay = "\x00foo\x00";
322    /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
323    ///
324    /// # Ok::<(), Box<dyn std::error::Error>>(())
325    /// ```
326    #[builder(builder_type = Builder, finish_fn(name = build_many_from_hir, doc {
327    /// Builds a `Regex` directly from many `Hir` expressions.
328    ///
329    /// This is useful if you needed to parse pattern strings into `Hir`
330    /// expressions for other reasons (such as analysis or transformations).
331    /// This routine permits building a `Regex` directly from the `Hir`
332    /// expressions instead of first converting the `Hir` expressions back to
333    /// pattern strings.
334    ///
335    /// When using this method, any options set via [`Builder::syntax`] are
336    /// ignored. Namely, the syntax options only apply when parsing a pattern
337    /// string, which isn't relevant here.
338    ///
339    /// If there was a problem building the underlying regex matcher for the
340    /// given `Hir` expressions, then an error is returned.
341    ///
342    /// Note that unlike [`Builder::build_many`], this can only fail as a
343    /// result of building the underlying matcher. In that case, there is
344    /// no single `Hir` expression that can be isolated as a reason for the
345    /// failure. So if this routine fails, it's not possible to determine which
346    /// `Hir` expression caused the failure.
347    ///
348    /// # Example
349    ///
350    /// This example shows how one can hand-construct multiple `Hir`
351    /// expressions and build a single regex from them without doing any
352    /// parsing at all.
353    ///
354    /// ```
355    /// use ib_matcher::{
356    ///     regex::{cp::Regex, Match},
357    ///     syntax::regex::hir::{Hir, Look},
358    /// };
359    ///
360    /// // (?Rm)^foo$
361    /// let hir1 = Hir::concat(vec![
362    ///     Hir::look(Look::StartCRLF),
363    ///     Hir::literal("foo".as_bytes()),
364    ///     Hir::look(Look::EndCRLF),
365    /// ]);
366    /// // (?Rm)^bar$
367    /// let hir2 = Hir::concat(vec![
368    ///     Hir::look(Look::StartCRLF),
369    ///     Hir::literal("bar".as_bytes()),
370    ///     Hir::look(Look::EndCRLF),
371    /// ]);
372    /// let re = Regex::builder()
373    ///     .build_many_from_hir(vec![hir1, hir2])?;
374    /// let hay = "\r\nfoo\r\nbar";
375    /// let got: Vec<Match> = re.find_iter(hay).collect();
376    /// let expected = vec![
377    ///     Match::must(0, 2..5),
378    ///     Match::must(1, 7..10),
379    /// ];
380    /// assert_eq!(expected, got);
381    ///
382    /// Ok::<(), Box<dyn std::error::Error>>(())
383    /// ```
384    }))]
385    pub fn builder(
386        #[builder(field)] syntax: util::syntax::Config,
387        #[cfg(feature = "regex-callback")]
388        #[builder(field)]
389        callbacks: Vec<(String, Callback)>,
390        #[builder(finish_fn)] hirs: Vec<Hir>,
391        /// Thompson NFA config. Named `configure` to be compatible with [`regex_automata::meta::Builder`]. Although some fields are not supported and `utf8_empty` is named as `utf8` instead.
392        #[builder(default)]
393        configure: thompson::Config,
394        /// [`IbMatcher`] config.
395        #[builder(default = MatchConfig::builder().case_insensitive(false).build())]
396        ib: MatchConfig<'a>,
397        /// `IbMatcher` pattern parser.
398        ///
399        /// ### Example
400        /// ```
401        /// use ib_matcher::{regex::cp::Regex, matcher::{MatchConfig, pattern::Pattern}};
402        ///
403        /// let re = Regex::builder()
404        ///     .ib(MatchConfig::builder().pinyin(Default::default()).build())
405        ///     .ib_parser(&mut |pattern| Pattern::parse_ev(pattern).call())
406        ///     .build("pinyin;py")
407        ///     .unwrap();
408        /// assert!(re.is_match("拼音搜索"));
409        /// assert!(re.is_match("pinyin") == false);
410        /// ```
411        /// See [`crate::syntax::ev`] for more details.
412        mut ib_parser: Option<&mut dyn FnMut(&str) -> Pattern<str>>,
413        #[builder(default = backtrack::Config::new().visited_capacity(usize::MAX / 8))]
414        mut backtrack: backtrack::Config,
415    ) -> Result<Self, BuildError> {
416        _ = syntax;
417        #[cfg(test)]
418        dbg!(&hirs);
419
420        let mut imp = Arc::new(RegexI {
421            re: MaybeUninit::uninit(),
422            config: {
423                let mut config = ib;
424                config.starts_with = true;
425                config
426            },
427            _pin: PhantomPinned,
428        });
429
430        let case_insensitive =
431            imp.config.plain.as_ref().is_some_and(|p| p.case_insensitive);
432        #[cfg(feature = "perf-literal-substring")]
433        #[allow(unused_mut)]
434        let mut first_byte = hir::literal::extract_first_byte(&hirs);
435
436        // Copy-and-patch NFA
437        let (hirs, literals) = hir::fold::fold_literal_utf8(hirs.into_iter());
438        let mut nfa: NFA = thompson::Compiler::new()
439            .configure(configure)
440            .build_many_from_hir(&hirs)?
441            .into();
442        let count = literals.len();
443        #[cfg(feature = "regex-callback")]
444        let count = {
445            let mut count = count;
446            for (literal, callback) in callbacks {
447                for i in literals.iter().positions(|l| l == &literal) {
448                    #[cfg(feature = "perf-literal-substring")]
449                    first_byte.take_if(|b| literal.as_bytes()[0] == *b);
450
451                    nfa.patch_first_byte(i as u8, |next| {
452                        crate::regex::nfa::State::Callback {
453                            callback: callback.clone(),
454                            next,
455                        }
456                    });
457                    count -= 1;
458                }
459            }
460            count
461        };
462        nfa.patch_bytes_to_matchers(literals.len() as u8, count, |b| {
463            let pattern = literals[b as usize].as_str();
464            let pattern = if let Some(ib_parser) = ib_parser.as_mut() {
465                ib_parser(pattern)
466            } else {
467                pattern.into()
468            };
469
470            // `shallow_clone()` requires `config` cannot be moved
471            let config: MatchConfig<'static> =
472                unsafe { transmute(imp.config.shallow_clone()) };
473            IbMatcher::with_config(pattern, config)
474        });
475        #[cfg(test)]
476        dbg!(&nfa);
477
478        // Engine
479        #[cfg(feature = "perf-literal-substring")]
480        if let Some(b) = first_byte {
481            backtrack.pre_ib =
482                Some(PrefilterIb::byte2_or_non_ascii(b, case_insensitive));
483        }
484        let re = BoundedBacktracker::builder()
485            .configure(backtrack)
486            .build_from_nfa(nfa)?;
487        unsafe { Arc::get_mut(&mut imp).unwrap_unchecked().re.write(re) };
488
489        Ok(Self { imp, pool: Pool::new(create_cache) })
490    }
491}
492
493impl<'a, S: builder::State> Builder<'a, '_, S> {
494    /// Configure the syntax options when parsing a pattern string while
495    /// building a `Regex`.
496    ///
497    /// These options _only_ apply when [`Builder::build`] or [`Builder::build_many`]
498    /// are used. The other build methods accept `Hir` values, which have
499    /// already been parsed.
500    ///
501    /// # Example
502    ///
503    /// This example shows how to enable case insensitive mode.
504    ///
505    /// ```
506    /// use ib_matcher::regex::{cp::Regex, util::syntax, Match};
507    ///
508    /// let re = Regex::builder()
509    ///     .syntax(syntax::Config::new().case_insensitive(true))
510    ///     .build(r"δ")?;
511    /// assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ"));
512    ///
513    /// Ok::<(), Box<dyn std::error::Error>>(())
514    /// ```
515    pub fn syntax(mut self, syntax: util::syntax::Config) -> Self {
516        self.syntax = syntax;
517        self
518    }
519
520    /// Add a [custom matching callback](Regex#custom-matching-callbacks).
521    #[cfg(feature = "regex-callback")]
522    pub fn callback(
523        mut self,
524        literal: impl Into<String>,
525        callback: impl Fn(&Input, usize, &mut dyn FnMut(usize)) + 'static,
526    ) -> Self {
527        self.callbacks.push((literal.into(), Arc::new(callback)));
528        self
529    }
530
531    /// Builds a `Regex` from a single pattern string.
532    ///
533    /// If there was a problem parsing the pattern or a problem turning it into
534    /// a regex matcher, then an error is returned.
535    ///
536    /// # Example
537    ///
538    /// This example shows how to configure syntax options.
539    ///
540    /// ```
541    /// use ib_matcher::regex::{cp::Regex, util::syntax, Match};
542    ///
543    /// let re = Regex::builder()
544    ///     .syntax(syntax::Config::new().crlf(true).multi_line(true))
545    ///     .build(r"^foo$")?;
546    /// let hay = "\r\nfoo\r\n";
547    /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
548    ///
549    /// # Ok::<(), Box<dyn std::error::Error>>(())
550    /// ```
551    pub fn build(self, pattern: &str) -> Result<Regex<'a>, BuildError>
552    where
553        S: builder::IsComplete,
554    {
555        self.build_many(&[pattern])
556    }
557
558    /// Builds a `Regex` from many pattern strings.
559    ///
560    /// If there was a problem parsing any of the patterns or a problem turning
561    /// them into a regex matcher, then an error is returned.
562    ///
563    /// # Example: zero patterns is valid
564    ///
565    /// Building a regex with zero patterns results in a regex that never
566    /// matches anything. Because this routine is generic, passing an empty
567    /// slice usually requires a turbo-fish (or something else to help type
568    /// inference).
569    ///
570    /// ```
571    /// use ib_matcher::regex::{cp::Regex, util::syntax, Match};
572    ///
573    /// let re = Regex::builder()
574    ///     .build_many::<&str>(&[])?;
575    /// assert_eq!(None, re.find(""));
576    ///
577    /// # Ok::<(), Box<dyn std::error::Error>>(())
578    /// ```
579    pub fn build_many<P: AsRef<str>>(
580        self,
581        patterns: &[P],
582    ) -> Result<Regex<'a>, BuildError>
583    where
584        S: builder::IsComplete,
585    {
586        // Bypass case_fold_char()
587        // case_insensitive class and (?i) will be broken
588        // .case_insensitive(false)
589        let syntax = self.syntax;
590
591        // Parse
592        let hirs = patterns
593            .into_iter()
594            .map(|pattern| {
595                let pattern = pattern.as_ref();
596                regex_automata::util::syntax::parse_with(pattern, &syntax)
597                    .map_err(|_| {
598                        // Shit
599                        thompson::Compiler::new()
600                            .syntax(syntax)
601                            .build(pattern)
602                            .unwrap_err()
603                    })
604            })
605            .try_collect()?;
606        self.build_many_from_hir(hirs)
607    }
608
609    /// Builds a `Regex` directly from an `Hir` expression.
610    ///
611    /// This is useful if you needed to parse a pattern string into an `Hir`
612    /// for other reasons (such as analysis or transformations). This routine
613    /// permits building a `Regex` directly from the `Hir` expression instead
614    /// of first converting the `Hir` back to a pattern string.
615    ///
616    /// When using this method, any options set via [`Builder::syntax`] are
617    /// ignored. Namely, the syntax options only apply when parsing a pattern
618    /// string, which isn't relevant here.
619    ///
620    /// If there was a problem building the underlying regex matcher for the
621    /// given `Hir`, then an error is returned.
622    ///
623    /// # Example
624    ///
625    /// This example shows how one can hand-construct an `Hir` expression and
626    /// build a regex from it without doing any parsing at all.
627    ///
628    /// ```
629    /// use ib_matcher::{
630    ///     regex::{cp::Regex, Match},
631    ///     syntax::regex::hir::{Hir, Look},
632    /// };
633    ///
634    /// // (?Rm)^foo$
635    /// let hir = Hir::concat(vec![
636    ///     Hir::look(Look::StartCRLF),
637    ///     Hir::literal("foo".as_bytes()),
638    ///     Hir::look(Look::EndCRLF),
639    /// ]);
640    /// let re = Regex::builder()
641    ///     .build_from_hir(hir)?;
642    /// let hay = "\r\nfoo\r\n";
643    /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
644    ///
645    /// Ok::<(), Box<dyn std::error::Error>>(())
646    /// ```
647    pub fn build_from_hir(self, hir: Hir) -> Result<Regex<'a>, BuildError>
648    where
649        S: builder::IsComplete,
650    {
651        self.build_many_from_hir(vec![hir])
652    }
653}
654
655impl Clone for Regex<'_> {
656    fn clone(&self) -> Self {
657        Regex { imp: self.imp.clone(), pool: Pool::new(create_cache) }
658    }
659}
660
661impl Drop for RegexI<'_> {
662    fn drop(&mut self) {
663        unsafe { self.re.assume_init_drop() };
664    }
665}
666
667/// High level convenience routines for using a regex to search a haystack.
668impl<'a> Regex<'a> {
669    /// Returns true if and only if this regex matches the given haystack.
670    ///
671    /// This routine may short circuit if it knows that scanning future input
672    /// will never lead to a different result. (Consider how this might make
673    /// a difference given the regex `a+` on the haystack `aaaaaaaaaaaaaaa`.
674    /// This routine _may_ stop after it sees the first `a`, but routines like
675    /// `find` need to continue searching because `+` is greedy by default.)
676    ///
677    /// # Example
678    ///
679    /// ```
680    /// use ib_matcher::regex::cp::Regex;
681    ///
682    /// let re = Regex::new("foo[0-9]+bar")?;
683    ///
684    /// assert!(re.is_match("foo12345bar"));
685    /// assert!(!re.is_match("foobar"));
686    ///
687    /// # Ok::<(), Box<dyn std::error::Error>>(())
688    /// ```
689    ///
690    /// # Example: consistency with search APIs
691    ///
692    /// `is_match` is guaranteed to return `true` whenever `find` returns a
693    /// match. This includes searches that are executed entirely within a
694    /// codepoint:
695    ///
696    /// ```
697    /// use ib_matcher::regex::{cp::Regex, Input};
698    ///
699    /// let re = Regex::new("a*")?;
700    ///
701    /// // This doesn't match because the default configuration bans empty
702    /// // matches from splitting a codepoint.
703    /// assert!(!re.is_match(Input::new("☃").span(1..2)));
704    /// assert_eq!(None, re.find(Input::new("☃").span(1..2)));
705    ///
706    /// # Ok::<(), Box<dyn std::error::Error>>(())
707    /// ```
708    ///
709    /// Notice that when UTF-8 mode is disabled, then the above reports a
710    /// match because the restriction against zero-width matches that split a
711    /// codepoint has been lifted:
712    ///
713    /// ```
714    /// use ib_matcher::regex::{cp::Regex, Input, Match};
715    ///
716    /// let re = Regex::builder()
717    ///     .configure(Regex::config().utf8(false))
718    ///     .build("a*")?;
719    ///
720    /// assert!(re.is_match(Input::new("☃").span(1..2)));
721    /// assert_eq!(
722    ///     Some(Match::must(0, 1..1)),
723    ///     re.find(Input::new("☃").span(1..2)),
724    /// );
725    ///
726    /// # Ok::<(), Box<dyn std::error::Error>>(())
727    /// ```
728    ///
729    /// A similar idea applies when using line anchors with CRLF mode enabled,
730    /// which prevents them from matching between a `\r` and a `\n`.
731    ///
732    /// ```
733    /// use ib_matcher::regex::{cp::Regex, Input, Match};
734    ///
735    /// let re = Regex::new(r"(?Rm:$)")?;
736    /// assert!(!re.is_match(Input::new("\r\n").span(1..1)));
737    /// // A regular line anchor, which only considers \n as a
738    /// // line terminator, will match.
739    /// let re = Regex::new(r"(?m:$)")?;
740    /// assert!(re.is_match(Input::new("\r\n").span(1..1)));
741    ///
742    /// # Ok::<(), Box<dyn std::error::Error>>(())
743    /// ```
744    #[inline]
745    pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
746        let input = input.into().earliest(true);
747        let mut guard = self.pool.get();
748        self.try_is_match(&mut guard, input).unwrap()
749    }
750
751    /// Executes a leftmost search and returns the first match that is found,
752    /// if one exists.
753    ///
754    /// # Example
755    ///
756    /// ```
757    /// use ib_matcher::regex::{cp::Regex, Match};
758    ///
759    /// let re = Regex::new("foo[0-9]+")?;
760    /// assert_eq!(Some(Match::must(0, 0..8)), re.find("foo12345"));
761    ///
762    /// # Ok::<(), Box<dyn std::error::Error>>(())
763    /// ```
764    #[inline]
765    pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
766        let input = input.into();
767        let mut guard = self.pool.get();
768        self.try_find(&mut guard, input).unwrap()
769    }
770
771    /// Executes a leftmost forward search and writes the spans of capturing
772    /// groups that participated in a match into the provided [`Captures`]
773    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
774    /// to return `false`.
775    ///
776    /// # Example
777    ///
778    /// ```
779    /// use ib_matcher::regex::{cp::Regex, Span};
780    ///
781    /// let re = Regex::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
782    /// let mut caps = re.create_captures();
783    ///
784    /// re.captures("2010-03-14", &mut caps);
785    /// assert!(caps.is_match());
786    /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
787    /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
788    /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
789    ///
790    /// # Ok::<(), Box<dyn std::error::Error>>(())
791    /// ```
792    #[inline]
793    pub fn captures<'h, I: Into<Input<'h>>>(
794        &self,
795        input: I,
796        caps: &mut Captures,
797    ) -> Result<(), MatchError> {
798        let input = input.into();
799        let mut guard = self.pool.get();
800        self.try_captures(&mut guard, input, caps)
801    }
802
803    /// Returns an iterator over all non-overlapping leftmost matches in
804    /// the given haystack. If no match exists, then the iterator yields no
805    /// elements.
806    ///
807    /// # Example
808    ///
809    /// ```
810    /// use ib_matcher::regex::{cp::Regex, Match};
811    ///
812    /// let re = Regex::new("foo[0-9]+")?;
813    /// let haystack = "foo1 foo12 foo123";
814    /// let matches: Vec<Match> = re.find_iter(haystack).collect();
815    /// assert_eq!(matches, vec![
816    ///     Match::must(0, 0..4),
817    ///     Match::must(0, 5..10),
818    ///     Match::must(0, 11..17),
819    /// ]);
820    /// # Ok::<(), Box<dyn std::error::Error>>(())
821    /// ```
822    #[inline]
823    pub fn find_iter<'h, I: Into<Input<'h>>>(
824        &'h self,
825        input: I,
826    ) -> impl Iterator<Item = Match> + 'h {
827        let input = input.into();
828        let guard = UnsafeCell::new(self.pool.get());
829        self.try_find_iter(unsafe { &mut *guard.get() }, input).map(move |r| {
830            let _guard = &guard;
831            r.unwrap()
832        })
833    }
834
835    /// Returns an iterator over all non-overlapping `Captures` values. If no
836    /// match exists, then the iterator yields no elements.
837    ///
838    /// This yields the same matches as [`Regex::find_iter`], but it includes
839    /// the spans of all capturing groups that participate in each match.
840    ///
841    /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
842    /// how to correctly iterate over all matches in a haystack while avoiding
843    /// the creation of a new `Captures` value for every match. (Which you are
844    /// forced to do with an `Iterator`.)
845    ///
846    /// # Example
847    ///
848    /// ```
849    /// use ib_matcher::regex::{cp::Regex, Span};
850    ///
851    /// let re = Regex::new("foo(?P<numbers>[0-9]+)")?;
852    ///
853    /// let haystack = "foo1 foo12 foo123";
854    /// let matches: Vec<Span> = re
855    ///     .captures_iter(haystack)
856    ///     // The unwrap is OK since 'numbers' matches if the pattern matches.
857    ///     .map(|caps| caps.get_group_by_name("numbers").unwrap())
858    ///     .collect();
859    /// assert_eq!(matches, vec![
860    ///     Span::from(3..4),
861    ///     Span::from(8..10),
862    ///     Span::from(14..17),
863    /// ]);
864    /// # Ok::<(), Box<dyn std::error::Error>>(())
865    /// ```
866    #[inline]
867    pub fn captures_iter<'h, I: Into<Input<'h>>>(
868        &'h self,
869        input: I,
870    ) -> impl Iterator<Item = Captures> + 'h {
871        let input = input.into();
872        let guard = UnsafeCell::new(self.pool.get());
873        self.try_captures_iter(unsafe { &mut *guard.get() }, input).map(
874            move |r| {
875                let _guard = &guard;
876                r.unwrap()
877            },
878        )
879    }
880}
881
882impl Deref for Regex<'_> {
883    type Target = BoundedBacktracker;
884
885    fn deref(&self) -> &Self::Target {
886        unsafe { self.imp.re.assume_init_ref() }
887    }
888}
889
#[cfg(test)]
mod tests {
    use regex_automata::Match;
    use regex_syntax::hir::Look;

    use crate::{
        matcher::{PinyinMatchConfig, RomajiMatchConfig},
        pinyin::PinyinNotation,
    };

    use super::*;

    /// A literal pattern is found in ASCII haystacks and, with pinyin
    /// matching enabled, in a Chinese haystack as well.
    #[test]
    fn literal() {
        let notations =
            PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter;
        let ib = MatchConfig::builder()
            .pinyin(PinyinMatchConfig::notations(notations))
            .build();
        let re = Regex::builder().ib(ib).build("pyss").unwrap();

        // Explicit-cache API, reusing a single cache for every search.
        let mut cache = re.create_cache();
        let cases = [("pyss", 0..4), ("apyss", 1..5), ("拼音搜索", 0..12)];
        for (haystack, span) in cases {
            assert_eq!(
                re.try_find(&mut cache, haystack).unwrap(),
                Some(Match::must(0, span)),
            );
        }

        // The pool-backed convenience API must agree.
        assert_eq!(re.find("pyss"), Some(Match::must(0, 0..4)),);
    }

    /// Case-insensitive syntax applies to non-ASCII letters too.
    #[test]
    fn case() {
        let syntax = util::syntax::Config::new().case_insensitive(true);
        let re = Regex::builder().syntax(syntax).build(r"δ").unwrap();

        let found = re.find(r"Δ");
        assert_eq!(Some(Match::must(0, 0..2)), found);
    }

    /// Alternations work both without and with pinyin matching.
    #[test]
    fn alt() {
        let pinyin = PinyinMatchConfig::notations(
            PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
        );

        let plain = Regex::builder().build("samwise|sam").unwrap();
        assert_eq!(Some(Match::must(0, 0..3)), plain.find("sam"));

        let ib = MatchConfig::builder().pinyin(pinyin.shallow_clone()).build();
        let with_pinyin =
            Regex::builder().ib(ib).build("samwise|pyss").unwrap();
        assert_eq!(
            Some(Match::must(0, 0..12)),
            with_pinyin.find("拼音搜索")
        );
    }

    /// `.` and `.*` wildcards between language-aware literals, plus the
    /// cache memory accounting around a search.
    #[test]
    fn wildcard() {
        let notations =
            PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter;
        let ib = MatchConfig::builder()
            .pinyin(PinyinMatchConfig::notations(notations))
            .romaji(RomajiMatchConfig::default())
            .build();
        let re = Regex::builder().ib(ib).build("raki.suta").unwrap();

        assert_eq!(re.max_haystack_len(), 0x1111111111111110);
        let mut cache = re.create_cache();
        // Nothing has been allocated before the first search.
        assert_eq!(cache.memory_usage(), 0);
        let found = re.try_find(&mut cache, "¥らき☆すた").unwrap();
        assert_eq!(found, Some(Match::must(0, 3..18)));
        // 2 * 16 + (alignup(16 * (18+1) / 8, 8) = 40)
        assert_eq!(cache.memory_usage(), 72);

        let notations =
            PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter;
        let ib = MatchConfig::builder()
            .pinyin(PinyinMatchConfig::notations(notations))
            .build();
        let re = Regex::builder().ib(ib).build("p.*y.*s.*s").unwrap();
        let mut cache = re.create_cache();
        let found = re.try_find(&mut cache, "拼a音b搜c索d").unwrap();
        assert_eq!(found, Some(Match::must(0, 0..15)));
    }

    /// A single literal only spans two languages when `mix_lang` is on;
    /// separate groups or a wildcard in between work without it.
    #[test]
    fn mix_lang() {
        let pinyin = PinyinMatchConfig::notations(
            PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
        );
        let romaji = RomajiMatchConfig::default();

        // One literal, `mix_lang` left at its default: no match.
        let ib = MatchConfig::builder()
            .pinyin(pinyin.shallow_clone())
            .romaji(romaji.shallow_clone())
            .build();
        let re = Regex::builder()
            .ib(ib)
            .build("pysousuosousounofuri-ren")
            .unwrap();
        let mut cache = re.create_cache();
        let found =
            re.try_find(&mut cache, "拼音搜索葬送のフリーレン").unwrap();
        assert_eq!(found, None);

        // Same literal with `mix_lang` enabled: the whole haystack matches.
        let ib = MatchConfig::builder()
            .pinyin(pinyin.shallow_clone())
            .romaji(romaji.shallow_clone())
            .mix_lang(true)
            .build();
        let re = Regex::builder()
            .ib(ib)
            .build("pysousuosousounofuri-ren")
            .unwrap();
        assert_eq!(
            re.find("拼音搜索葬送のフリーレン"),
            Some(Match::must(0, 0..36)),
        );

        // The two languages split into separate groups.
        let ib = MatchConfig::builder()
            .pinyin(pinyin.shallow_clone())
            .romaji(romaji.shallow_clone())
            .build();
        let re = Regex::builder()
            .ib(ib)
            .build("(pysousuo)(sousounofuri-ren)")
            .unwrap();
        let mut cache = re.create_cache();
        let found =
            re.try_find(&mut cache, "拼音搜索葬送のフリーレン").unwrap();
        assert_eq!(found, Some(Match::must(0, 0..36)));

        // The two languages separated by a lazy wildcard.
        let ib = MatchConfig::builder()
            .pinyin(pinyin.shallow_clone())
            .romaji(romaji.shallow_clone())
            .build();
        let re = Regex::builder()
            .ib(ib)
            .build("pysousuo.*?sousounofuri-ren")
            .unwrap();
        let mut cache = re.create_cache();
        let found =
            re.try_find(&mut cache, "拼音搜索⭐葬送のフリーレン").unwrap();
        assert_eq!(found, Some(Match::must(0, 0..39)));
    }

    /// CRLF-aware line anchors across a multi-pattern regex built from HIR.
    #[test]
    fn look() {
        // Builds the HIR for `(?Rm)^<word>$`.
        let anchored_line = |word: &str| {
            Hir::concat(vec![
                Hir::look(Look::StartCRLF),
                Hir::literal(word.as_bytes()),
                Hir::look(Look::EndCRLF),
            ])
        };
        // (?Rm)^foo$ and (?Rm)^bar$
        let patterns = vec![anchored_line("foo"), anchored_line("bar")];
        let re = Regex::builder().build_many_from_hir(patterns).unwrap();

        let hay = "\r\nfoo\r\nbar";
        let expected = vec![Match::must(0, 2..5), Match::must(1, 7..10)];
        let got: Vec<Match> = re.find_iter(hay).collect();
        assert_eq!(expected, got);
    }

    /// User callbacks referenced by name from inside the pattern.
    #[cfg(feature = "regex-callback")]
    #[test]
    fn callback() {
        use std::{cell::RefCell, rc::Rc};

        // A callback that pushes 1 when the next byte is ASCII.
        let re = Regex::builder()
            .callback("ascii", |input, at, push| {
                let rest = &input.haystack()[at..];
                if rest.first().is_some_and(|byte| byte.is_ascii()) {
                    push(1);
                }
            })
            .build(r"(ascii)+\d(ascii)+")
            .unwrap();
        assert_eq!(re.find("that4U this4me"), Some(Match::must(0, 8..16)));

        // Two callbacks sharing a counter of currently open quotes.
        let open_quotes = Rc::new(RefCell::new(0));
        let opened = Rc::clone(&open_quotes);
        let re = Regex::builder()
            .callback("open_quote", move |input, at, push| {
                // The byte two positions back is the one before the quote.
                let escaped = at >= 2 && input.haystack()[at - 2] == b'\\';
                if !escaped {
                    let mut open = opened.borrow_mut();
                    *open += 1;
                    push(0);
                }
            })
            .callback("close_quote", move |input, at, push| {
                let escaped = at >= 2 && input.haystack()[at - 2] == b'\\';
                if !escaped {
                    let mut open = open_quotes.borrow_mut();
                    if *open > 0 {
                        push(0);
                    }
                    *open -= 1;
                }
            })
            // '([^'\\]+?|\\')*'
            .build(r"'(open_quote).*?'(close_quote)")
            .unwrap();
        let hay = r"'one' 'two\'three' 'four'";
        let quoted: Vec<_> =
            re.find_iter(hay).map(|m| &hay[m.span()]).collect();
        assert_eq!(quoted, vec!["'one'", r"'two\'three'", "'four'"]);

        // A callback that pushes 0 when the next byte is ASCII.
        let re = Regex::builder()
            .callback("lookahead_is_ascii", |input, at, push| {
                let rest = &input.haystack()[at..];
                if rest.first().is_some_and(|byte| byte.is_ascii()) {
                    push(0);
                }
            })
            .build(r"(?-u)[\x00-\x7f]+?\d(lookahead_is_ascii)")
            .unwrap();
        let hay = "that4U,this4me1plz";
        let pieces: Vec<_> =
            re.find_iter(hay).map(|m| &hay[m.span()]).collect();
        assert_eq!(pieces, vec![",this4", "me1"]);
    }
}