ib_matcher/regex/lita/
regex.rs

1use std::sync::Arc;
2
3use bon::bon;
4use regex_automata::{
5    dfa::{self, dense},
6    util::{captures::GroupInfo, primitives::NonMaxUsize},
7    PatternID,
8};
9use regex_syntax::hir::{Hir, HirKind};
10
11use crate::{
12    matcher::{
13        self, config::IbMatcherWithConfig, pattern::Pattern, MatchConfig,
14    },
15    regex::{
16        cp,
17        nfa::{backtrack, thompson},
18        util::{self, captures::Captures},
19        Input, Match, MatchError,
20    },
21    syntax::regex::hir,
22};
23
24pub use crate::regex::nfa::{backtrack::Config, thompson::BuildError};
25
26/// A compiled regular expression for searching Unicode haystacks.
27///
28/// A `Regex` can be used to search haystacks, split haystacks into substrings
29/// or replace substrings in a haystack with a different substring. All
/// searching is done with an implicit `(?s:.)*?` at the beginning and end of
/// a pattern. To force an expression to match the whole string (or a prefix
/// or a suffix), you can use an anchored search or an anchor like `^` or `$` (or `\A` and `\z`).
33/**
34# Overview
35
36The most important methods are as follows:
37
38* [`Regex::new`] compiles a regex using the default configuration. A
39[`Builder`] permits setting a non-default configuration. (For example,
40case insensitive matching, verbose mode and others.)
41* [`Regex::is_match`] reports whether a match exists in a particular haystack.
42* [`Regex::find`] reports the byte offsets of a match in a haystack, if one
43exists. [`Regex::find_iter`] returns an iterator over all such matches.
44* [`Regex::captures`] returns a [`Captures`], which reports both the byte
45offsets of a match in a haystack and the byte offsets of each matching capture
46group from the regex in the haystack.
47[`Regex::captures_iter`] returns an iterator over all such matches.
48*/
49/// # Example
50///
51/// ```
52/// use ib_matcher::regex::lita::Regex;
53///
54/// let re = Regex::new(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")?;
55/// assert!(re.is_match("2010-03-14"));
56///
57/// # Ok::<(), Box<dyn std::error::Error>>(())
58/// ```
59/**
60With `IbMatcher`'s Chinese pinyin and Japanese romaji matching:
61```
62// cargo add ib-matcher --features regex,pinyin,romaji
63use ib_matcher::{
64    matcher::{MatchConfig, PinyinMatchConfig, RomajiMatchConfig},
65    regex::{lita::Regex, Match},
66};
67
68let config = MatchConfig::builder()
69    .pinyin(PinyinMatchConfig::default())
70    .romaji(RomajiMatchConfig::default())
71    .build();
72
73let re = Regex::builder()
74    .ib(config.shallow_clone())
75    .build("raki.suta")
76    .unwrap();
77assert_eq!(re.find("「らき☆すた」"), Some(Match::must(0, 3..18)));
78
79let re = Regex::builder()
80    .ib(config.shallow_clone())
81    .build("pysou.*?(any|every)thing")
82    .unwrap();
83assert_eq!(re.find("拼音搜索Everything"), Some(Match::must(0, 0..22)));
84
85let config = MatchConfig::builder()
86    .pinyin(PinyinMatchConfig::default())
87    .romaji(RomajiMatchConfig::default())
88    .mix_lang(true)
89    .build();
90let re = Regex::builder()
91    .ib(config.shallow_clone())
92    .build("(?x)^zangsounofuri-?ren # Mixing pinyin and romaji")
93    .unwrap();
94assert_eq!(re.find("葬送のフリーレン"), Some(Match::must(0, 0..24)));
95```
96*/
97/// For more examples and the syntax, see [`crate::regex`].
98///
99/// # Case insensitivity
100/// To enable case insensitivity:
101/// ```
102/// use ib_matcher::{matcher::{PinyinMatchConfig, PlainMatchConfig, MatchConfig}, regex::lita::Regex};
103///
104/// let re = Regex::builder().ib(MatchConfig::default()).build("foo").unwrap();
105/// assert!(re.is_match("FOO"));
106///
107/// // Alternatively, with `case_insensitive()`:
108/// let re = Regex::builder()
109///     .ib(MatchConfig::builder()
110///         .case_insensitive(true)
111///         .pinyin(PinyinMatchConfig::default())
112///         .build())
113///     .build("pyss")
114///     .unwrap();
115/// assert!(re.is_match("PY搜索"));
116/// ```
/// Note that enabling `syntax.case_insensitive` currently prevents `ib` (i.e. pinyin and romaji matching) from working. For now, you should only set [`MatchConfigBuilder::case_insensitive`](crate::matcher::MatchConfigBuilder::case_insensitive) ([`PlainMatchConfigBuilder::case_insensitive`](crate::matcher::PlainMatchConfigBuilder::case_insensitive)).
118///
119/// If you need case insensitive character classes, you need to write `(?i:[a-z])` instead at the moment.
120///
121/// # Synchronization and cloning
122///
123/// In order to make the `Regex` API convenient, most of the routines hide
124/// the fact that a `Cache` is needed at all. To achieve this, a [memory
125/// pool](automata::util::pool::Pool) is used internally to retrieve `Cache`
126/// values in a thread safe way that also permits reuse. This in turn implies
127/// that every such search call requires some form of synchronization. Usually
128/// this synchronization is fast enough to not notice, but in some cases, it
129/// can be a bottleneck. This typically occurs when all of the following are
130/// true:
131///
132/// * The same `Regex` is shared across multiple threads simultaneously,
133/// usually via a [`util::lazy::Lazy`](automata::util::lazy::Lazy) or something
134/// similar from the `once_cell` or `lazy_static` crates.
135/// * The primary unit of work in each thread is a regex search.
136/// * Searches are run on very short haystacks.
137///
138/// This particular case can lead to high contention on the pool used by a
139/// `Regex` internally, which can in turn increase latency to a noticeable
140/// effect. This cost can be mitigated in one of the following ways:
141///
142/// * Use a distinct copy of a `Regex` in each thread, usually by cloning it.
143/// Cloning a `Regex` _does not_ do a deep copy of its read-only component.
144/// But it does lead to each `Regex` having its own memory pool, which in
145/// turn eliminates the problem of contention. In general, this technique should
146/// not result in any additional memory usage when compared to sharing the same
147/// `Regex` across multiple threads simultaneously.
148/// * Use lower level APIs, like [`Regex::try_find`], which permit passing
149/// a `Cache` explicitly. In this case, it is up to you to determine how best
150/// to provide a `Cache`. For example, you might put a `Cache` in thread-local
151/// storage if your use case allows for it.
152///
153/// Overall, this is an issue that happens rarely in practice, but it can
154/// happen.
155///
156/// # Warning: spin-locks may be used in alloc-only mode
157///
158/// When this crate is built without the `std` feature and the high level APIs
159/// on a `Regex` are used, then a spin-lock will be used to synchronize access
160/// to an internal pool of `Cache` values. This may be undesirable because
161/// a spin-lock is [effectively impossible to implement correctly in user
162/// space][spinlocks-are-bad]. That is, more concretely, the spin-lock could
163/// result in a deadlock.
164///
165/// [spinlocks-are-bad]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
166///
167/// If one wants to avoid the use of spin-locks when the `std` feature is
168/// disabled, then you must use APIs that accept a `Cache` value explicitly.
169/// For example, [`Regex::try_find`].
170#[derive(Clone)]
171pub struct Regex<'a> {
172    /// The actual regex implementation.
173    imp: RegexI<'a>,
174}
175
176#[derive(Clone)]
177enum RegexI<'a> {
178    Ib(Arc<IbMatcherWithConfig<'a>>),
179    Cp { dfa: dfa::regex::Regex, cp: cp::Regex<'a> },
180}
181
182#[bon]
183impl<'a> Regex<'a> {
184    pub fn new(pattern: &str) -> Result<Self, BuildError> {
185        Self::builder().build(pattern)
186    }
187
188    pub fn config() -> thompson::Config {
189        thompson::Config::new()
190    }
191
192    /// Return a builder for configuring the construction of a `Regex`.
193    ///
194    /// This is a convenience routine to avoid needing to import the
195    /// [`Builder`] type in common cases.
196    ///
197    /// # Example: change the line terminator
198    ///
199    /// This example shows how to enable multi-line mode by default and change
200    /// the line terminator to the NUL byte:
201    ///
202    /// ```
203    /// use ib_matcher::regex::{lita::Regex, util::{syntax, look::LookMatcher}, Match};
204    ///
205    /// let mut lookm = LookMatcher::new();
206    /// lookm.set_line_terminator(b'\x00');
207    /// let re = Regex::builder()
208    ///     .syntax(syntax::Config::new().multi_line(true))
209    ///     .thompson(Regex::config().look_matcher(lookm))
210    ///     .build(r"^foo$")?;
211    /// let hay = "\x00foo\x00";
212    /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
213    ///
214    /// # Ok::<(), Box<dyn std::error::Error>>(())
215    /// ```
216    #[builder(builder_type = Builder, finish_fn(name = build_from_hir, doc {
217    /// Builds a `Regex` directly from an `Hir` expression.
218    ///
219    /// This is useful if you needed to parse a pattern string into an `Hir`
220    /// for other reasons (such as analysis or transformations). This routine
221    /// permits building a `Regex` directly from the `Hir` expression instead
222    /// of first converting the `Hir` back to a pattern string.
223    ///
224    /// When using this method, any options set via [`Builder::syntax`] are
225    /// ignored. Namely, the syntax options only apply when parsing a pattern
226    /// string, which isn't relevant here.
227    ///
228    /// If there was a problem building the underlying regex matcher for the
229    /// given `Hir`, then an error is returned.
230    ///
231    /// # Example
232    ///
233    /// This example shows how one can hand-construct an `Hir` expression and
234    /// build a regex from it without doing any parsing at all.
235    ///
236    /// ```
237    /// use ib_matcher::{
238    ///     regex::{lita::Regex, Match},
239    ///     syntax::regex::hir::{Hir, Look},
240    /// };
241    ///
242    /// // (?Rm)^foo$
243    /// let hir = Hir::concat(vec![
244    ///     Hir::look(Look::StartCRLF),
245    ///     Hir::literal("foo".as_bytes()),
246    ///     Hir::look(Look::EndCRLF),
247    /// ]);
248    /// let re = Regex::builder()
249    ///     .build_from_hir(hir)?;
250    /// let hay = "\r\nfoo\r\n";
251    /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
252    ///
253    /// Ok::<(), Box<dyn std::error::Error>>(())
254    /// ```
255    }))]
256    pub fn builder(
257        #[builder(field)] syntax: util::syntax::Config,
258        #[builder(finish_fn)] hir: Hir,
259        /// If the provided `hir` is Unicode-aware, providing a ASCII-aware-only `Hir` as `hir_ascii` can improve performance.
260        ///
261        /// The second `bool` is whether the provided `hir_ascii` is case insensitive:
262        /// - If it's `false` but `ib.case_insensitive` is `true`, then `hir_ascii` will be converted to case insensitive. (Used by glob)
263        /// - If it's `true` but `ib.case_insensitive` is `false`, `build()` will panic.
264        hir_ascii: Option<(Hir, bool)>,
265        #[builder(default)] dfa_dense: dfa::dense::Config,
266        /// Thompson NFA config. Named `configure` to be compatible with [`regex_automata::meta::Builder`]. Although some fields are not supported and `utf8_empty` is named as `utf8` instead.
267        #[builder(default)]
268        thompson: thompson::Config,
269        /// [`IbMatcher`] config.
270        #[builder(default = MatchConfig::builder().case_insensitive(false).build())]
271        mut ib: MatchConfig<'a>,
272        /// `IbMatcher` pattern parser.
273        ///
274        /// ### Example
275        /// ```
276        /// use ib_matcher::{regex::lita::Regex, matcher::{MatchConfig, pattern::Pattern}};
277        ///
278        /// let re = Regex::builder()
279        ///     .ib(MatchConfig::builder().pinyin(Default::default()).build())
280        ///     .ib_parser(&mut |pattern| Pattern::parse_ev(pattern).call())
281        ///     .build("pinyin;py")
282        ///     .unwrap();
283        /// assert!(re.is_match("拼音搜索"));
284        /// assert!(re.is_match("pinyin") == false);
285        /// ```
286        /// See [`crate::syntax::ev`] for more details.
287        mut ib_parser: Option<&mut dyn FnMut(&str) -> Pattern<str>>,
288        #[builder(default = backtrack::Config::new().visited_capacity(usize::MAX / 8))]
289        backtrack: backtrack::Config,
290    ) -> Result<Self, BuildError> {
291        _ = syntax;
292        #[cfg(test)]
293        dbg!(&hir);
294
295        let imp = match hir.kind() {
296            // TODO: Look::{Start,End} optimization
297            HirKind::Literal(literal) => {
298                let pattern = str::from_utf8(&literal.0).unwrap();
299                let pattern = if let Some(ib_parser) = ib_parser.as_mut() {
300                    ib_parser(pattern)
301                } else {
302                    pattern.into()
303                };
304                RegexI::Ib(IbMatcherWithConfig::with_config(pattern, ib))
305            }
306            _ => {
307                let dfa = {
308                    // We can always forcefully disable captures because DFAs do not
309                    // support them.
310                    let thompson = thompson
311                        .clone()
312                        .which_captures(thompson::WhichCaptures::None);
313
314                    let mut compiler = thompson::Compiler::new();
315                    let hir_buf;
316                    let (mut hir, hir_case_insensitive) = hir_ascii
317                        .as_ref()
318                        .map(|(hir, case)| (hir, *case))
319                        .unwrap_or((&hir, false));
320                    if let Some(plain) = &ib.plain {
321                        debug_assert!(
322                            !(hir_case_insensitive && !plain.case_insensitive)
323                        );
324                        if !hir_case_insensitive && plain.case_insensitive {
325                            hir_buf = hir::case::hir_to_ascii_case_insensitive(
326                                hir.clone(),
327                            );
328                            hir = &hir_buf;
329                        }
330                    }
331
332                    let forward_nfa = compiler
333                        .configure(thompson.clone())
334                        .build_from_hir(hir)?;
335                    // TODO: prefilter
336                    // TODO: minimize?
337                    // TODO: quit vs is_ascii?
338                    let forward = dense::Builder::new()
339                        .configure(dfa_dense.clone())
340                        .build_from_nfa(&forward_nfa)
341                        .unwrap();
342
343                    let reverse_nfa = compiler
344                        .configure(thompson.reverse(true))
345                        .build_from_hir(hir)?;
346                    let reverse = dense::Builder::new()
347                        .configure(
348                            dfa_dense
349                                .prefilter(None)
350                                .specialize_start_states(false)
351                                .start_kind(dfa::StartKind::Anchored)
352                                .match_kind(regex_automata::MatchKind::All),
353                        )
354                        .build_from_nfa(&reverse_nfa)
355                        .unwrap();
356
357                    dfa::regex::Regex::builder()
358                        .build_from_dfas(forward, reverse)
359                };
360                if let Some(plain) = ib.plain.as_mut() {
361                    // -3.3%
362                    plain.maybe_ascii = false;
363                }
364                let cp = cp::Regex::builder()
365                    .syntax(syntax)
366                    .configure(thompson)
367                    .ib(ib)
368                    .maybe_ib_parser(ib_parser)
369                    .backtrack(backtrack)
370                    .build_from_hir(hir)?;
371                RegexI::Cp { dfa, cp }
372            }
373        };
374
375        Ok(Self { imp })
376    }
377
378    /// Create a new empty set of capturing groups that is guaranteed to be
379    /// valid for the search APIs on this `BoundedBacktracker`.
380    ///
381    /// A `Captures` value created for a specific `BoundedBacktracker` cannot
382    /// be used with any other `BoundedBacktracker`.
383    ///
384    /// This is a convenience function for [`Captures::all`]. See the
385    /// [`Captures`] documentation for an explanation of its alternative
386    /// constructors that permit the `BoundedBacktracker` to do less work
387    /// during a search, and thus might make it faster.
388    pub fn create_captures(&self) -> Captures {
389        match &self.imp {
390            RegexI::Ib(_) => Captures::matches(GroupInfo::empty()),
391            RegexI::Cp { dfa: _, cp } => cp.create_captures(),
392        }
393    }
394}
395
396impl<'a, S: builder::State> Builder<'a, '_, S> {
397    /// Configure the syntax options when parsing a pattern string while
398    /// building a `Regex`.
399    ///
400    /// These options _only_ apply when [`Builder::build`] or [`Builder::build_many`]
401    /// are used. The other build methods accept `Hir` values, which have
402    /// already been parsed.
403    ///
404    /// # Example
405    ///
406    /// This example shows how to enable case insensitive mode.
407    ///
408    /// ```
409    /// use ib_matcher::regex::{lita::Regex, util::syntax, Match};
410    ///
411    /// let re = Regex::builder()
412    ///     .syntax(syntax::Config::new().case_insensitive(true))
413    ///     .build(r"δ")?;
414    /// assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ"));
415    ///
416    /// Ok::<(), Box<dyn std::error::Error>>(())
417    /// ```
418    pub fn syntax(mut self, syntax: util::syntax::Config) -> Self {
419        self.syntax = syntax;
420        self
421    }
422
423    /// Builds a `Regex` from a single pattern string.
424    ///
425    /// If there was a problem parsing the pattern or a problem turning it into
426    /// a regex matcher, then an error is returned.
427    ///
428    /// # Example
429    ///
430    /// This example shows how to configure syntax options.
431    ///
432    /// ```
433    /// use ib_matcher::regex::{lita::Regex, util::syntax, Match};
434    ///
435    /// let re = Regex::builder()
436    ///     .syntax(syntax::Config::new().crlf(true).multi_line(true))
437    ///     .build(r"^foo$")?;
438    /// let hay = "\r\nfoo\r\n";
439    /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
440    ///
441    /// # Ok::<(), Box<dyn std::error::Error>>(())
442    /// ```
443    pub fn build(self, pattern: &str) -> Result<Regex<'a>, BuildError>
444    where
445        S::HirAscii: builder::IsUnset,
446    {
447        let syntax = self.syntax;
448
449        // Parse
450        let pattern = pattern.as_ref();
451        let parse_with = |syntax| {
452            regex_automata::util::syntax::parse_with(pattern, &syntax).map_err(
453                |_| {
454                    // Shit
455                    thompson::Compiler::new()
456                        .syntax(syntax)
457                        .build(pattern)
458                        .unwrap_err()
459                },
460            )
461        };
462        let hir_ascii = parse_with(
463            syntax
464                // TODO: case_insensitive
465                .unicode(false)
466                // ASCII must be valid UTF-8
467                .utf8(false),
468        )?;
469        let hir = parse_with(syntax)?;
470        self.hir_ascii((hir_ascii, false)).build_from_hir(hir)
471    }
472}
473
474/// High level convenience routines for using a regex to search a haystack.
475impl<'a> Regex<'a> {
476    /// Returns true if and only if this regex matches the given haystack.
477    ///
478    /// This routine may short circuit if it knows that scanning future input
479    /// will never lead to a different result. (Consider how this might make
480    /// a difference given the regex `a+` on the haystack `aaaaaaaaaaaaaaa`.
481    /// This routine _may_ stop after it sees the first `a`, but routines like
482    /// `find` need to continue searching because `+` is greedy by default.)
483    ///
484    /// # Example
485    ///
486    /// ```
487    /// use ib_matcher::regex::lita::Regex;
488    ///
489    /// let re = Regex::new("foo[0-9]+bar")?;
490    ///
491    /// assert!(re.is_match("foo12345bar"));
492    /// assert!(!re.is_match("foobar"));
493    ///
494    /// # Ok::<(), Box<dyn std::error::Error>>(())
495    /// ```
496    ///
497    /// # Example: consistency with search APIs
498    ///
499    /// `is_match` is guaranteed to return `true` whenever `find` returns a
500    /// match. This includes searches that are executed entirely within a
501    /// codepoint:
502    ///
503    /// ```
504    /// use ib_matcher::regex::{lita::Regex, Input};
505    ///
506    /// let re = Regex::new("a*")?;
507    ///
508    /// // This doesn't match because the default configuration bans empty
509    /// // matches from splitting a codepoint.
510    /// assert!(!re.is_match(Input::new("☃").span(1..2)));
511    /// assert_eq!(None, re.find(Input::new("☃").span(1..2)));
512    ///
513    /// # Ok::<(), Box<dyn std::error::Error>>(())
514    /// ```
515    ///
516    /// Notice that when UTF-8 mode is disabled, then the above reports a
517    /// match because the restriction against zero-width matches that split a
518    /// codepoint has been lifted:
519    ///
520    /// ```
521    /// use ib_matcher::regex::{lita::Regex, Input, Match};
522    ///
523    /// let re = Regex::builder()
524    ///     .thompson(Regex::config().utf8(false))
525    ///     .build("a*")?;
526    ///
527    /// assert!(re.is_match(Input::new("☃").span(1..2)));
528    /// assert_eq!(
529    ///     Some(Match::must(0, 1..1)),
530    ///     re.find(Input::new("☃").span(1..2)),
531    /// );
532    ///
533    /// # Ok::<(), Box<dyn std::error::Error>>(())
534    /// ```
535    ///
536    /// A similar idea applies when using line anchors with CRLF mode enabled,
537    /// which prevents them from matching between a `\r` and a `\n`.
538    ///
539    /// ```
540    /// use ib_matcher::regex::{lita::Regex, Input, Match};
541    ///
542    /// let re = Regex::new(r"(?Rm:$)")?;
543    /// assert!(!re.is_match(Input::new("\r\n").span(1..1)));
544    /// // A regular line anchor, which only considers \n as a
545    /// // line terminator, will match.
546    /// let re = Regex::new(r"(?m:$)")?;
547    /// assert!(re.is_match(Input::new("\r\n").span(1..1)));
548    ///
549    /// # Ok::<(), Box<dyn std::error::Error>>(())
550    /// ```
551    #[inline]
552    pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
553        let input = input.into().earliest(true);
554        match &self.imp {
555            RegexI::Ib(matcher) => {
556                matcher.is_match(matcher::input::Input::from_regex(&input))
557            }
558            RegexI::Cp { dfa, cp } => {
559                if input.haystack().is_ascii() {
560                    dfa.is_match(input)
561                } else {
562                    cp.is_match(input)
563                }
564            }
565        }
566    }
567
568    /// Executes a leftmost search and returns the first match that is found,
569    /// if one exists.
570    ///
571    /// # Example
572    ///
573    /// ```
574    /// use ib_matcher::regex::{lita::Regex, Match};
575    ///
576    /// let re = Regex::new("foo[0-9]+")?;
577    /// assert_eq!(Some(Match::must(0, 0..8)), re.find("foo12345"));
578    ///
579    /// # Ok::<(), Box<dyn std::error::Error>>(())
580    /// ```
581    #[inline]
582    pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
583        let input = input.into();
584        match &self.imp {
585            RegexI::Ib(matcher) => matcher
586                .find(matcher::input::Input::from_regex(&input))
587                .map(|m| m.offset(input.start()).into()),
588            RegexI::Cp { dfa, cp } => {
589                if input.haystack().is_ascii() {
590                    dfa.find(input)
591                } else {
592                    cp.find(input)
593                }
594            }
595        }
596    }
597
598    /// Executes a leftmost forward search and writes the spans of capturing
599    /// groups that participated in a match into the provided [`Captures`]
600    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
601    /// to return `false`.
602    ///
603    /// # Example
604    ///
605    /// ```
606    /// use ib_matcher::regex::{lita::Regex, Span};
607    ///
608    /// let re = Regex::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
609    /// let mut caps = re.create_captures();
610    ///
611    /// re.captures("2010-03-14", &mut caps);
612    /// assert!(caps.is_match());
613    /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
614    /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
615    /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
616    ///
617    /// # Ok::<(), Box<dyn std::error::Error>>(())
618    /// ```
619    #[inline]
620    pub fn captures<'h, I: Into<Input<'h>>>(
621        &self,
622        input: I,
623        caps: &mut Captures,
624    ) -> Result<(), MatchError> {
625        let input = input.into();
626        match &self.imp {
627            RegexI::Ib(matcher) => {
628                let slots = caps.slots_mut();
629                if let Some(m) =
630                    matcher.find(matcher::input::Input::from_regex(&input))
631                {
632                    let m = m.offset(input.start());
633                    slots[0] = NonMaxUsize::new(m.start());
634                    slots[1] = NonMaxUsize::new(m.end());
635                    caps.set_pattern(Some(PatternID::ZERO));
636                } else {
637                    caps.set_pattern(None);
638                }
639                Ok(())
640            }
641            RegexI::Cp { dfa, cp } => {
642                if input.haystack().is_ascii() && !dfa.is_match(input.clone())
643                {
644                    caps.set_pattern(None);
645                    return Ok(());
646                }
647                cp.captures(input, caps)
648            }
649        }
650    }
651}
652
#[cfg(test)]
mod tests {
    use regex_automata::Match;

    use crate::{
        matcher::{PinyinMatchConfig, RomajiMatchConfig},
        pinyin::PinyinNotation,
        syntax::glob,
    };

    use super::*;

    /// The expected result of a successful search: pattern 0 at `span`.
    fn found(span: std::ops::Range<usize>) -> Option<Match> {
        Some(Match::must(0, span))
    }

    /// An empty pattern matches the empty string at offset 0, whether it comes
    /// from a pattern string or from a glob-produced `Hir`.
    #[test]
    fn empty() {
        let haystacks = ["pyss", "apyss", "拼音搜索"];

        let from_pattern = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(PinyinMatchConfig::default())
                .build())
            .build("")
            .unwrap();
        for hay in haystacks {
            assert_eq!(from_pattern.find(hay), found(0..0));
        }

        let from_glob = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(PinyinMatchConfig::default())
                .is_pattern_partial(true)
                .analyze(true)
                .build())
            .build_from_hir(
                glob::parse_wildcard_path()
                    .separator(glob::PathSeparator::Windows)
                    .call(""),
            )
            .unwrap();
        for hay in haystacks {
            assert_eq!(from_glob.find(hay), found(0..0));
        }
    }

    /// A literal-only pattern, with and without a custom `ib_parser`.
    #[test]
    fn literal() {
        let pinyin_re = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(PinyinMatchConfig::notations(
                    PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
                ))
                .build())
            .build("pyss")
            .unwrap();

        assert_eq!(pinyin_re.find("pyss"), found(0..4));
        assert_eq!(pinyin_re.find("apyss"), found(1..5));
        assert_eq!(pinyin_re.find("拼音搜索"), found(0..12));

        // Searching the first haystack again must give the same answer.
        assert_eq!(pinyin_re.find("pyss"), found(0..4));

        let glob_re = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(PinyinMatchConfig::default())
                .is_pattern_partial(true)
                .analyze(true)
                .build())
            .ib_parser(&mut |pattern| Pattern::parse_ev(&pattern).call())
            .build_from_hir(
                glob::parse_wildcard_path()
                    .separator(glob::PathSeparator::Windows)
                    .call("abcdef"),
            )
            .unwrap();
        assert_eq!(glob_re.find("pyss"), None);
        assert_eq!(glob_re.find("abcdef"), found(0..6));
        assert_eq!(glob_re.find("0abcdef"), found(1..7));
        assert_eq!(glob_re.find("#文档"), None);
        assert_eq!(glob_re.find("$$"), None);
    }

    /// Case insensitivity, set through the syntax config or through `ib`.
    #[test]
    fn case() {
        let syntax_ci = Regex::builder()
            .syntax(util::syntax::Config::new().case_insensitive(true))
            .build(r"δ")
            .unwrap();
        assert_eq!(syntax_ci.find(r"Δ"), found(0..2));

        let ib_ci = Regex::builder()
            .ib(MatchConfig::builder().build())
            .build("pro.*m")
            .unwrap();
        // The first haystack ends with a non-ASCII character, the second one
        // is pure ASCII.
        assert!(ib_ci
            .is_match(r"C:\Program Files\Everything 1.5a\Everything64.exe？"));
        assert!(
            ib_ci.is_match(r"C:\Program Files\Everything 1.5a\Everything64.exe")
        );

        let glob_ci = Regex::builder()
            .ib(MatchConfig::builder().build())
            .build_from_hir(
                glob::parse_wildcard_path()
                    .separator(glob::PathSeparator::Windows)
                    .call(r"pro*m"),
            )
            .unwrap();
        assert!(
            glob_ci.is_match(r"C:\Program Files\Everything 1.5a\Everything64.exe")
        );
    }

    /// Alternations, plain and with a pinyin branch.
    #[test]
    fn alt() {
        let pinyin = PinyinMatchConfig::notations(
            PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
        );

        let plain = Regex::builder().build("samwise|sam").unwrap();
        assert_eq!(plain.find("sam"), found(0..3));

        let with_pinyin = Regex::builder()
            .ib(MatchConfig::builder().pinyin(pinyin.shallow_clone()).build())
            .build("samwise|pyss")
            .unwrap();
        assert_eq!(with_pinyin.find("拼音搜索"), found(0..12));
    }

    /// `.` and `.*` between romaji / pinyin literals.
    #[test]
    fn wildcard() {
        let romaji_re = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(PinyinMatchConfig::notations(
                    PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
                ))
                .romaji(RomajiMatchConfig::default())
                .build())
            .build("raki.suta")
            .unwrap();

        assert_eq!(romaji_re.find("￥らき☆すた"), found(3..18));

        let pinyin_re = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(PinyinMatchConfig::notations(
                    PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
                ))
                .build())
            .build("p.*y.*s.*s")
            .unwrap();

        assert_eq!(pinyin_re.find("拼a音b搜c索d"), found(0..15));
    }

    /// Pinyin and romaji in one pattern, with and without `mix_lang`.
    #[test]
    fn mix_lang() {
        let pinyin = PinyinMatchConfig::notations(
            PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
        );
        let romaji = RomajiMatchConfig::default();

        // One literal spanning both languages does not match unless
        // `mix_lang` is enabled.
        let unmixed = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(pinyin.shallow_clone())
                .romaji(romaji.shallow_clone())
                .build())
            .build("pysousuosousounofuri-ren")
            .unwrap();
        assert_eq!(unmixed.find("拼音搜索葬送のフリーレン"), None);

        let mixed = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(pinyin.shallow_clone())
                .romaji(romaji.shallow_clone())
                .mix_lang(true)
                .build())
            .build("pysousuosousounofuri-ren")
            .unwrap();
        assert_eq!(mixed.find("拼音搜索葬送のフリーレン"), found(0..36));

        // Splitting the pattern into one literal per language works without
        // `mix_lang`.
        let grouped = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(pinyin.shallow_clone())
                .romaji(romaji.shallow_clone())
                .build())
            .build("(pysousuo)(sousounofuri-ren)")
            .unwrap();
        assert_eq!(grouped.find("拼音搜索葬送のフリーレン"), found(0..36));

        let separated = Regex::builder()
            .ib(MatchConfig::builder()
                .pinyin(pinyin.shallow_clone())
                .romaji(romaji.shallow_clone())
                .build())
            .build("pysousuo.*?sousounofuri-ren")
            .unwrap();
        assert_eq!(
            separated.find("拼音搜索⭐葬送のフリーレン"),
            found(0..39),
        );
    }
}
ib_matcher/regex/lita/regex.rs

ib_matcher/regex/lita/
regex.rs