ib_matcher/regex/lita/regex.rs
1use std::sync::Arc;
2
3use bon::bon;
4use regex_automata::{
5 dfa::{self, dense},
6 util::{captures::GroupInfo, primitives::NonMaxUsize},
7 PatternID,
8};
9use regex_syntax::hir::{Hir, HirKind};
10
11use crate::{
12 matcher::{
13 self, config::IbMatcherWithConfig, pattern::Pattern, MatchConfig,
14 },
15 regex::{
16 cp,
17 nfa::{backtrack, thompson},
18 util::{self, captures::Captures},
19 Input, Match, MatchError,
20 },
21 syntax::regex::hir,
22};
23
24pub use crate::regex::nfa::{backtrack::Config, thompson::BuildError};
25
26/// A compiled regular expression for searching Unicode haystacks.
27///
28/// A `Regex` can be used to search haystacks, split haystacks into substrings
29/// or replace substrings in a haystack with a different substring. All
30/// searching is done with an implicit `(?s:.)*?` at the beginning and end of
/// a pattern. To force an expression to match the whole string (or a prefix
32/// or a suffix), you can use anchored search or an anchor like `^` or `$` (or `\A` and `\z`).
33/**
34# Overview
35
36The most important methods are as follows:
37
38* [`Regex::new`] compiles a regex using the default configuration. A
39[`Builder`] permits setting a non-default configuration. (For example,
40case insensitive matching, verbose mode and others.)
41* [`Regex::is_match`] reports whether a match exists in a particular haystack.
42* [`Regex::find`] reports the byte offsets of a match in a haystack, if one
43exists. [`Regex::find_iter`] returns an iterator over all such matches.
44* [`Regex::captures`] returns a [`Captures`], which reports both the byte
45offsets of a match in a haystack and the byte offsets of each matching capture
46group from the regex in the haystack.
47[`Regex::captures_iter`] returns an iterator over all such matches.
48*/
49/// # Example
50///
51/// ```
52/// use ib_matcher::regex::lita::Regex;
53///
54/// let re = Regex::new(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")?;
55/// assert!(re.is_match("2010-03-14"));
56///
57/// # Ok::<(), Box<dyn std::error::Error>>(())
58/// ```
59/**
60With `IbMatcher`'s Chinese pinyin and Japanese romaji matching:
61```
62// cargo add ib-matcher --features regex,pinyin,romaji
63use ib_matcher::{
64 matcher::{MatchConfig, PinyinMatchConfig, RomajiMatchConfig},
65 regex::{lita::Regex, Match},
66};
67
68let config = MatchConfig::builder()
69 .pinyin(PinyinMatchConfig::default())
70 .romaji(RomajiMatchConfig::default())
71 .build();
72
73let re = Regex::builder()
74 .ib(config.shallow_clone())
75 .build("raki.suta")
76 .unwrap();
77assert_eq!(re.find("「らき☆すた」"), Some(Match::must(0, 3..18)));
78
79let re = Regex::builder()
80 .ib(config.shallow_clone())
81 .build("pysou.*?(any|every)thing")
82 .unwrap();
83assert_eq!(re.find("拼音搜索Everything"), Some(Match::must(0, 0..22)));
84
85let config = MatchConfig::builder()
86 .pinyin(PinyinMatchConfig::default())
87 .romaji(RomajiMatchConfig::default())
88 .mix_lang(true)
89 .build();
90let re = Regex::builder()
91 .ib(config.shallow_clone())
92 .build("(?x)^zangsounofuri-?ren # Mixing pinyin and romaji")
93 .unwrap();
94assert_eq!(re.find("葬送のフリーレン"), Some(Match::must(0, 0..24)));
95```
96*/
97/// For more examples and the syntax, see [`crate::regex`].
98///
99/// # Case insensitivity
100/// To enable case insensitivity:
101/// ```
102/// use ib_matcher::{matcher::{PinyinMatchConfig, PlainMatchConfig, MatchConfig}, regex::lita::Regex};
103///
104/// let re = Regex::builder().ib(MatchConfig::default()).build("foo").unwrap();
105/// assert!(re.is_match("FOO"));
106///
107/// // Alternatively, with `case_insensitive()`:
108/// let re = Regex::builder()
109/// .ib(MatchConfig::builder()
110/// .case_insensitive(true)
111/// .pinyin(PinyinMatchConfig::default())
112/// .build())
113/// .build("pyss")
114/// .unwrap();
115/// assert!(re.is_match("PY搜索"));
116/// ```
/// Note that enabling `syntax.case_insensitive` currently stops `ib` (i.e. pinyin and romaji matching) from working. Set only [`MatchConfigBuilder::case_insensitive`](crate::matcher::MatchConfigBuilder::case_insensitive) ([`PlainMatchConfigBuilder::case_insensitive`](crate::matcher::PlainMatchConfigBuilder::case_insensitive)) instead.
118///
119/// If you need case insensitive character classes, you need to write `(?i:[a-z])` instead at the moment.
120///
121/// # Synchronization and cloning
122///
123/// In order to make the `Regex` API convenient, most of the routines hide
124/// the fact that a `Cache` is needed at all. To achieve this, a [memory
125/// pool](automata::util::pool::Pool) is used internally to retrieve `Cache`
126/// values in a thread safe way that also permits reuse. This in turn implies
127/// that every such search call requires some form of synchronization. Usually
128/// this synchronization is fast enough to not notice, but in some cases, it
129/// can be a bottleneck. This typically occurs when all of the following are
130/// true:
131///
132/// * The same `Regex` is shared across multiple threads simultaneously,
133/// usually via a [`util::lazy::Lazy`](automata::util::lazy::Lazy) or something
134/// similar from the `once_cell` or `lazy_static` crates.
135/// * The primary unit of work in each thread is a regex search.
136/// * Searches are run on very short haystacks.
137///
138/// This particular case can lead to high contention on the pool used by a
139/// `Regex` internally, which can in turn increase latency to a noticeable
140/// effect. This cost can be mitigated in one of the following ways:
141///
142/// * Use a distinct copy of a `Regex` in each thread, usually by cloning it.
143/// Cloning a `Regex` _does not_ do a deep copy of its read-only component.
144/// But it does lead to each `Regex` having its own memory pool, which in
145/// turn eliminates the problem of contention. In general, this technique should
146/// not result in any additional memory usage when compared to sharing the same
147/// `Regex` across multiple threads simultaneously.
148/// * Use lower level APIs, like [`Regex::try_find`], which permit passing
149/// a `Cache` explicitly. In this case, it is up to you to determine how best
150/// to provide a `Cache`. For example, you might put a `Cache` in thread-local
151/// storage if your use case allows for it.
152///
153/// Overall, this is an issue that happens rarely in practice, but it can
154/// happen.
155///
156/// # Warning: spin-locks may be used in alloc-only mode
157///
158/// When this crate is built without the `std` feature and the high level APIs
159/// on a `Regex` are used, then a spin-lock will be used to synchronize access
160/// to an internal pool of `Cache` values. This may be undesirable because
161/// a spin-lock is [effectively impossible to implement correctly in user
162/// space][spinlocks-are-bad]. That is, more concretely, the spin-lock could
163/// result in a deadlock.
164///
165/// [spinlocks-are-bad]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
166///
167/// If one wants to avoid the use of spin-locks when the `std` feature is
168/// disabled, then you must use APIs that accept a `Cache` value explicitly.
169/// For example, [`Regex::try_find`].
#[derive(Clone)]
pub struct Regex<'a> {
    /// The actual regex implementation.
    ///
    /// Which variant is stored is decided once, at build time, from the kind
    /// of the pattern's `Hir`; every search method dispatches on it.
    imp: RegexI<'a>,
}
175
/// The matching strategy behind a [`Regex`].
#[derive(Clone)]
enum RegexI<'a> {
    /// The whole pattern is a single literal, so searches are delegated
    /// directly to an `IbMatcherWithConfig` without any regex engine.
    Ib(Arc<IbMatcherWithConfig<'a>>),
    /// Any other pattern.
    ///
    /// `dfa` answers `is_match`/`find` when the haystack is entirely ASCII
    /// (and lets `captures` reject ASCII haystacks without a match early);
    /// `cp` handles everything else, including all capture extraction.
    Cp { dfa: dfa::regex::Regex, cp: cp::Regex<'a> },
}
181
182#[bon]
183impl<'a> Regex<'a> {
184 pub fn new(pattern: &str) -> Result<Self, BuildError> {
185 Self::builder().build(pattern)
186 }
187
    /// Returns a fresh Thompson NFA configuration (`thompson::Config::new()`).
    ///
    /// Handy as a starting point for the builder's `thompson` setter, e.g.
    /// `Regex::builder().thompson(Regex::config().utf8(false))`.
    pub fn config() -> thompson::Config {
        thompson::Config::new()
    }
191
192 /// Return a builder for configuring the construction of a `Regex`.
193 ///
194 /// This is a convenience routine to avoid needing to import the
195 /// [`Builder`] type in common cases.
196 ///
197 /// # Example: change the line terminator
198 ///
199 /// This example shows how to enable multi-line mode by default and change
200 /// the line terminator to the NUL byte:
201 ///
202 /// ```
203 /// use ib_matcher::regex::{lita::Regex, util::{syntax, look::LookMatcher}, Match};
204 ///
205 /// let mut lookm = LookMatcher::new();
206 /// lookm.set_line_terminator(b'\x00');
207 /// let re = Regex::builder()
208 /// .syntax(syntax::Config::new().multi_line(true))
209 /// .thompson(Regex::config().look_matcher(lookm))
210 /// .build(r"^foo$")?;
211 /// let hay = "\x00foo\x00";
212 /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
213 ///
214 /// # Ok::<(), Box<dyn std::error::Error>>(())
215 /// ```
216 #[builder(builder_type = Builder, finish_fn(name = build_from_hir, doc {
217 /// Builds a `Regex` directly from an `Hir` expression.
218 ///
219 /// This is useful if you needed to parse a pattern string into an `Hir`
220 /// for other reasons (such as analysis or transformations). This routine
221 /// permits building a `Regex` directly from the `Hir` expression instead
222 /// of first converting the `Hir` back to a pattern string.
223 ///
224 /// When using this method, any options set via [`Builder::syntax`] are
225 /// ignored. Namely, the syntax options only apply when parsing a pattern
226 /// string, which isn't relevant here.
227 ///
228 /// If there was a problem building the underlying regex matcher for the
229 /// given `Hir`, then an error is returned.
230 ///
231 /// # Example
232 ///
233 /// This example shows how one can hand-construct an `Hir` expression and
234 /// build a regex from it without doing any parsing at all.
235 ///
236 /// ```
237 /// use ib_matcher::{
238 /// regex::{lita::Regex, Match},
239 /// syntax::regex::hir::{Hir, Look},
240 /// };
241 ///
242 /// // (?Rm)^foo$
243 /// let hir = Hir::concat(vec![
244 /// Hir::look(Look::StartCRLF),
245 /// Hir::literal("foo".as_bytes()),
246 /// Hir::look(Look::EndCRLF),
247 /// ]);
248 /// let re = Regex::builder()
249 /// .build_from_hir(hir)?;
250 /// let hay = "\r\nfoo\r\n";
251 /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
252 ///
253 /// Ok::<(), Box<dyn std::error::Error>>(())
254 /// ```
255 }))]
    pub fn builder(
        #[builder(field)] syntax: util::syntax::Config,
        #[builder(finish_fn)] hir: Hir,
        /// If the provided `hir` is Unicode-aware, additionally providing an ASCII-only `Hir` as `hir_ascii` can improve performance: it is the one compiled into the DFA.
        ///
        /// The second `bool` is whether the provided `hir_ascii` is already case insensitive:
        /// - If it's `false` but `ib`'s plain matching is case insensitive, then `hir_ascii` will be converted to ASCII case insensitive. (Used by glob)
        /// - If it's `true` but `ib`'s plain matching is case sensitive, a `debug_assert!` fails, i.e. building panics in builds with debug assertions enabled.
        hir_ascii: Option<(Hir, bool)>,
        #[builder(default)] dfa_dense: dfa::dense::Config,
        /// Thompson NFA config. Some fields are not supported, and `utf8_empty` is named `utf8` instead.
        // NOTE(review): this doc used to say the setter is named `configure`
        // for compatibility with `regex_automata::meta::Builder`, but the
        // parameter (and the doc examples) use `thompson`; `configure` is the
        // name used on `cp::Regex::builder()` below.
        #[builder(default)]
        thompson: thompson::Config,
        /// [`IbMatcher`] config.
        #[builder(default = MatchConfig::builder().case_insensitive(false).build())]
        mut ib: MatchConfig<'a>,
        /// `IbMatcher` pattern parser.
        ///
        /// ### Example
        /// ```
        /// use ib_matcher::{regex::lita::Regex, matcher::{MatchConfig, pattern::Pattern}};
        ///
        /// let re = Regex::builder()
        ///     .ib(MatchConfig::builder().pinyin(Default::default()).build())
        ///     .ib_parser(&mut |pattern| Pattern::parse_ev(pattern).call())
        ///     .build("pinyin;py")
        ///     .unwrap();
        /// assert!(re.is_match("拼音搜索"));
        /// assert!(re.is_match("pinyin") == false);
        /// ```
        /// See [`crate::syntax::ev`] for more details.
        mut ib_parser: Option<&mut dyn FnMut(&str) -> Pattern<str>>,
        #[builder(default = backtrack::Config::new().visited_capacity(usize::MAX / 8))]
        backtrack: backtrack::Config,
    ) -> Result<Self, BuildError> {
        // No-op discard: `syntax` is not needed to compile an already-parsed
        // `Hir`, it is only forwarded to `cp::Regex::builder()` below.
        _ = syntax;
        #[cfg(test)]
        dbg!(&hir);

        let imp = match hir.kind() {
            // TODO: Look::{Start,End} optimization
            // A pattern that is one plain literal needs no regex engine at
            // all: hand the text to the `IbMatcher` directly.
            HirKind::Literal(literal) => {
                // NOTE(review): panics if the literal bytes are not valid
                // UTF-8, which a hand-built `Hir` could contain.
                let pattern = str::from_utf8(&literal.0).unwrap();
                // Let the caller-supplied parser interpret the literal if
                // there is one, otherwise use the text as-is.
                let pattern = if let Some(ib_parser) = ib_parser.as_mut() {
                    ib_parser(pattern)
                } else {
                    pattern.into()
                };
                RegexI::Ib(IbMatcherWithConfig::with_config(pattern, ib))
            }
            _ => {
                // Forward + reverse dense DFAs; the search methods use them
                // for haystacks that are entirely ASCII.
                let dfa = {
                    // We can always forcefully disable captures because DFAs do not
                    // support them.
                    let thompson = thompson
                        .clone()
                        .which_captures(thompson::WhichCaptures::None);

                    let mut compiler = thompson::Compiler::new();
                    // Storage for the case-folded HIR, declared up front so
                    // that `hir` can keep borrowing from it after the `if`.
                    let hir_buf;
                    // Prefer the ASCII-only HIR when one was supplied; fall
                    // back to the main HIR, treated as case sensitive.
                    let (mut hir, hir_case_insensitive) = hir_ascii
                        .as_ref()
                        .map(|(hir, case)| (hir, *case))
                        .unwrap_or((&hir, false));
                    if let Some(plain) = &ib.plain {
                        // A case-insensitive `hir_ascii` combined with
                        // case-sensitive plain matching is rejected (debug
                        // builds only).
                        debug_assert!(
                            !(hir_case_insensitive && !plain.case_insensitive)
                        );
                        if !hir_case_insensitive && plain.case_insensitive {
                            hir_buf = hir::case::hir_to_ascii_case_insensitive(
                                hir.clone(),
                            );
                            hir = &hir_buf;
                        }
                    }

                    let forward_nfa = compiler
                        .configure(thompson.clone())
                        .build_from_hir(hir)?;
                    // TODO: prefilter
                    // TODO: minimize?
                    // TODO: quit vs is_ascii?
                    // NOTE(review): a dense DFA build failure panics here
                    // instead of being returned as a `BuildError`.
                    let forward = dense::Builder::new()
                        .configure(dfa_dense.clone())
                        .build_from_nfa(&forward_nfa)
                        .unwrap();

                    let reverse_nfa = compiler
                        .configure(thompson.reverse(true))
                        .build_from_hir(hir)?;
                    // The reverse DFA is anchored, matches with
                    // `MatchKind::All` and has no prefilter; presumably these
                    // are the settings `dfa::regex::Regex` expects for
                    // locating match starts (see its `build_from_dfas` docs).
                    // NOTE(review): same panic-on-failure caveat as above.
                    let reverse = dense::Builder::new()
                        .configure(
                            dfa_dense
                                .prefilter(None)
                                .specialize_start_states(false)
                                .start_kind(dfa::StartKind::Anchored)
                                .match_kind(regex_automata::MatchKind::All),
                        )
                        .build_from_nfa(&reverse_nfa)
                        .unwrap();

                    dfa::regex::Regex::builder()
                        .build_from_dfas(forward, reverse)
                };
                if let Some(plain) = ib.plain.as_mut() {
                    // -3.3%
                    // (presumably a measured benchmark delta for turning off
                    // the matcher's own ASCII handling, since the DFA covers
                    // most all-ASCII searches; TODO confirm)
                    plain.maybe_ascii = false;
                }
                // The general-purpose engine, built from the main (possibly
                // Unicode-aware) HIR with the caller's full configuration.
                let cp = cp::Regex::builder()
                    .syntax(syntax)
                    .configure(thompson)
                    .ib(ib)
                    .maybe_ib_parser(ib_parser)
                    .backtrack(backtrack)
                    .build_from_hir(hir)?;
                RegexI::Cp { dfa, cp }
            }
        };

        Ok(Self { imp })
    }
377
378 /// Create a new empty set of capturing groups that is guaranteed to be
379 /// valid for the search APIs on this `BoundedBacktracker`.
380 ///
381 /// A `Captures` value created for a specific `BoundedBacktracker` cannot
382 /// be used with any other `BoundedBacktracker`.
383 ///
384 /// This is a convenience function for [`Captures::all`]. See the
385 /// [`Captures`] documentation for an explanation of its alternative
386 /// constructors that permit the `BoundedBacktracker` to do less work
387 /// during a search, and thus might make it faster.
388 pub fn create_captures(&self) -> Captures {
389 match &self.imp {
390 RegexI::Ib(_) => Captures::matches(GroupInfo::empty()),
391 RegexI::Cp { dfa: _, cp } => cp.create_captures(),
392 }
393 }
394}
395
396impl<'a, S: builder::State> Builder<'a, '_, S> {
397 /// Configure the syntax options when parsing a pattern string while
398 /// building a `Regex`.
399 ///
400 /// These options _only_ apply when [`Builder::build`] or [`Builder::build_many`]
401 /// are used. The other build methods accept `Hir` values, which have
402 /// already been parsed.
403 ///
404 /// # Example
405 ///
406 /// This example shows how to enable case insensitive mode.
407 ///
408 /// ```
409 /// use ib_matcher::regex::{lita::Regex, util::syntax, Match};
410 ///
411 /// let re = Regex::builder()
412 /// .syntax(syntax::Config::new().case_insensitive(true))
413 /// .build(r"δ")?;
414 /// assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ"));
415 ///
416 /// Ok::<(), Box<dyn std::error::Error>>(())
417 /// ```
    pub fn syntax(mut self, syntax: util::syntax::Config) -> Self {
        // `syntax` is declared as a custom builder field (`#[builder(field)]`
        // on `Regex::builder`), so it is stored directly on the builder and
        // read back by `build` when it parses the pattern string.
        self.syntax = syntax;
        self
    }
422
423 /// Builds a `Regex` from a single pattern string.
424 ///
425 /// If there was a problem parsing the pattern or a problem turning it into
426 /// a regex matcher, then an error is returned.
427 ///
428 /// # Example
429 ///
430 /// This example shows how to configure syntax options.
431 ///
432 /// ```
433 /// use ib_matcher::regex::{lita::Regex, util::syntax, Match};
434 ///
435 /// let re = Regex::builder()
436 /// .syntax(syntax::Config::new().crlf(true).multi_line(true))
437 /// .build(r"^foo$")?;
438 /// let hay = "\r\nfoo\r\n";
439 /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
440 ///
441 /// # Ok::<(), Box<dyn std::error::Error>>(())
442 /// ```
443 pub fn build(self, pattern: &str) -> Result<Regex<'a>, BuildError>
444 where
445 S::HirAscii: builder::IsUnset,
446 {
447 let syntax = self.syntax;
448
449 // Parse
450 let pattern = pattern.as_ref();
451 let parse_with = |syntax| {
452 regex_automata::util::syntax::parse_with(pattern, &syntax).map_err(
453 |_| {
454 // Shit
455 thompson::Compiler::new()
456 .syntax(syntax)
457 .build(pattern)
458 .unwrap_err()
459 },
460 )
461 };
462 let hir_ascii = parse_with(
463 syntax
464 // TODO: case_insensitive
465 .unicode(false)
466 // ASCII must be valid UTF-8
467 .utf8(false),
468 )?;
469 let hir = parse_with(syntax)?;
470 self.hir_ascii((hir_ascii, false)).build_from_hir(hir)
471 }
472}
473
474/// High level convenience routines for using a regex to search a haystack.
475impl<'a> Regex<'a> {
476 /// Returns true if and only if this regex matches the given haystack.
477 ///
478 /// This routine may short circuit if it knows that scanning future input
479 /// will never lead to a different result. (Consider how this might make
480 /// a difference given the regex `a+` on the haystack `aaaaaaaaaaaaaaa`.
481 /// This routine _may_ stop after it sees the first `a`, but routines like
482 /// `find` need to continue searching because `+` is greedy by default.)
483 ///
484 /// # Example
485 ///
486 /// ```
487 /// use ib_matcher::regex::lita::Regex;
488 ///
489 /// let re = Regex::new("foo[0-9]+bar")?;
490 ///
491 /// assert!(re.is_match("foo12345bar"));
492 /// assert!(!re.is_match("foobar"));
493 ///
494 /// # Ok::<(), Box<dyn std::error::Error>>(())
495 /// ```
496 ///
497 /// # Example: consistency with search APIs
498 ///
499 /// `is_match` is guaranteed to return `true` whenever `find` returns a
500 /// match. This includes searches that are executed entirely within a
501 /// codepoint:
502 ///
503 /// ```
504 /// use ib_matcher::regex::{lita::Regex, Input};
505 ///
506 /// let re = Regex::new("a*")?;
507 ///
508 /// // This doesn't match because the default configuration bans empty
509 /// // matches from splitting a codepoint.
510 /// assert!(!re.is_match(Input::new("☃").span(1..2)));
511 /// assert_eq!(None, re.find(Input::new("☃").span(1..2)));
512 ///
513 /// # Ok::<(), Box<dyn std::error::Error>>(())
514 /// ```
515 ///
516 /// Notice that when UTF-8 mode is disabled, then the above reports a
517 /// match because the restriction against zero-width matches that split a
518 /// codepoint has been lifted:
519 ///
520 /// ```
521 /// use ib_matcher::regex::{lita::Regex, Input, Match};
522 ///
523 /// let re = Regex::builder()
524 /// .thompson(Regex::config().utf8(false))
525 /// .build("a*")?;
526 ///
527 /// assert!(re.is_match(Input::new("☃").span(1..2)));
528 /// assert_eq!(
529 /// Some(Match::must(0, 1..1)),
530 /// re.find(Input::new("☃").span(1..2)),
531 /// );
532 ///
533 /// # Ok::<(), Box<dyn std::error::Error>>(())
534 /// ```
535 ///
536 /// A similar idea applies when using line anchors with CRLF mode enabled,
537 /// which prevents them from matching between a `\r` and a `\n`.
538 ///
539 /// ```
540 /// use ib_matcher::regex::{lita::Regex, Input, Match};
541 ///
542 /// let re = Regex::new(r"(?Rm:$)")?;
543 /// assert!(!re.is_match(Input::new("\r\n").span(1..1)));
544 /// // A regular line anchor, which only considers \n as a
545 /// // line terminator, will match.
546 /// let re = Regex::new(r"(?m:$)")?;
547 /// assert!(re.is_match(Input::new("\r\n").span(1..1)));
548 ///
549 /// # Ok::<(), Box<dyn std::error::Error>>(())
550 /// ```
551 #[inline]
552 pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
553 let input = input.into().earliest(true);
554 match &self.imp {
555 RegexI::Ib(matcher) => {
556 matcher.is_match(matcher::input::Input::from_regex(&input))
557 }
558 RegexI::Cp { dfa, cp } => {
559 if input.haystack().is_ascii() {
560 dfa.is_match(input)
561 } else {
562 cp.is_match(input)
563 }
564 }
565 }
566 }
567
568 /// Executes a leftmost search and returns the first match that is found,
569 /// if one exists.
570 ///
571 /// # Example
572 ///
573 /// ```
574 /// use ib_matcher::regex::{lita::Regex, Match};
575 ///
576 /// let re = Regex::new("foo[0-9]+")?;
577 /// assert_eq!(Some(Match::must(0, 0..8)), re.find("foo12345"));
578 ///
579 /// # Ok::<(), Box<dyn std::error::Error>>(())
580 /// ```
581 #[inline]
582 pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
583 let input = input.into();
584 match &self.imp {
585 RegexI::Ib(matcher) => matcher
586 .find(matcher::input::Input::from_regex(&input))
587 .map(|m| m.offset(input.start()).into()),
588 RegexI::Cp { dfa, cp } => {
589 if input.haystack().is_ascii() {
590 dfa.find(input)
591 } else {
592 cp.find(input)
593 }
594 }
595 }
596 }
597
598 /// Executes a leftmost forward search and writes the spans of capturing
599 /// groups that participated in a match into the provided [`Captures`]
600 /// value. If no match was found, then [`Captures::is_match`] is guaranteed
601 /// to return `false`.
602 ///
603 /// # Example
604 ///
605 /// ```
606 /// use ib_matcher::regex::{lita::Regex, Span};
607 ///
608 /// let re = Regex::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
609 /// let mut caps = re.create_captures();
610 ///
611 /// re.captures("2010-03-14", &mut caps);
612 /// assert!(caps.is_match());
613 /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
614 /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
615 /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
616 ///
617 /// # Ok::<(), Box<dyn std::error::Error>>(())
618 /// ```
619 #[inline]
620 pub fn captures<'h, I: Into<Input<'h>>>(
621 &self,
622 input: I,
623 caps: &mut Captures,
624 ) -> Result<(), MatchError> {
625 let input = input.into();
626 match &self.imp {
627 RegexI::Ib(matcher) => {
628 let slots = caps.slots_mut();
629 if let Some(m) =
630 matcher.find(matcher::input::Input::from_regex(&input))
631 {
632 let m = m.offset(input.start());
633 slots[0] = NonMaxUsize::new(m.start());
634 slots[1] = NonMaxUsize::new(m.end());
635 caps.set_pattern(Some(PatternID::ZERO));
636 } else {
637 caps.set_pattern(None);
638 }
639 Ok(())
640 }
641 RegexI::Cp { dfa, cp } => {
642 if input.haystack().is_ascii() && !dfa.is_match(input.clone())
643 {
644 caps.set_pattern(None);
645 return Ok(());
646 }
647 cp.captures(input, caps)
648 }
649 }
650 }
651}
652
653#[cfg(test)]
654mod tests {
655 use regex_automata::Match;
656
657 use crate::{
658 matcher::{PinyinMatchConfig, RomajiMatchConfig},
659 pinyin::PinyinNotation,
660 syntax::glob,
661 };
662
663 use super::*;
664
665 #[test]
666 fn empty() {
667 let re = Regex::builder()
668 .ib(MatchConfig::builder()
669 .pinyin(PinyinMatchConfig::default())
670 .build())
671 .build("")
672 .unwrap();
673 assert_eq!(re.find("pyss"), Some(Match::must(0, 0..0)));
674 assert_eq!(re.find("apyss"), Some(Match::must(0, 0..0)));
675 assert_eq!(re.find("拼音搜索"), Some(Match::must(0, 0..0)));
676
677 let re = Regex::builder()
678 .ib(MatchConfig::builder()
679 .pinyin(PinyinMatchConfig::default())
680 .is_pattern_partial(true)
681 .analyze(true)
682 .build())
683 .build_from_hir(
684 glob::parse_wildcard_path()
685 .separator(glob::PathSeparator::Windows)
686 .call(""),
687 )
688 .unwrap();
689 assert_eq!(re.find("pyss"), Some(Match::must(0, 0..0)));
690 assert_eq!(re.find("apyss"), Some(Match::must(0, 0..0)));
691 assert_eq!(re.find("拼音搜索"), Some(Match::must(0, 0..0)));
692 }
693
694 #[test]
695 fn literal() {
696 let re = Regex::builder()
697 .ib(MatchConfig::builder()
698 .pinyin(PinyinMatchConfig::notations(
699 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
700 ))
701 .build())
702 .build("pyss")
703 .unwrap();
704
705 assert_eq!(re.find("pyss"), Some(Match::must(0, 0..4)));
706 assert_eq!(re.find("apyss"), Some(Match::must(0, 1..5)));
707 assert_eq!(re.find("拼音搜索"), Some(Match::must(0, 0..12)));
708
709 assert_eq!(re.find("pyss"), Some(Match::must(0, 0..4)));
710
711 let re = Regex::builder()
712 .ib(MatchConfig::builder()
713 .pinyin(PinyinMatchConfig::default())
714 .is_pattern_partial(true)
715 .analyze(true)
716 .build())
717 .ib_parser(&mut |pattern| Pattern::parse_ev(&pattern).call())
718 .build_from_hir(
719 glob::parse_wildcard_path()
720 .separator(glob::PathSeparator::Windows)
721 .call("abcdef"),
722 )
723 .unwrap();
724 assert_eq!(re.find("pyss"), None);
725 assert_eq!(re.find("abcdef"), Some(Match::must(0, 0..6)));
726 assert_eq!(re.find("0abcdef"), Some(Match::must(0, 1..7)));
727 assert_eq!(re.find("#文档"), None);
728 assert_eq!(re.find("$$"), None);
729 }
730
731 #[test]
732 fn case() {
733 let re = Regex::builder()
734 .syntax(util::syntax::Config::new().case_insensitive(true))
735 .build(r"δ")
736 .unwrap();
737 assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ"));
738
739 let re = Regex::builder()
740 .ib(MatchConfig::builder().build())
741 .build("pro.*m")
742 .unwrap();
743 assert!(re
744 .is_match(r"C:\Program Files\Everything 1.5a\Everything64.exe?"));
745 assert!(
746 re.is_match(r"C:\Program Files\Everything 1.5a\Everything64.exe")
747 );
748
749 let re = Regex::builder()
750 .ib(MatchConfig::builder().build())
751 .build_from_hir(
752 glob::parse_wildcard_path()
753 .separator(glob::PathSeparator::Windows)
754 .call(r"pro*m"),
755 )
756 .unwrap();
757 assert!(
758 re.is_match(r"C:\Program Files\Everything 1.5a\Everything64.exe")
759 );
760 }
761
762 #[test]
763 fn alt() {
764 let pinyin = PinyinMatchConfig::notations(
765 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
766 );
767
768 let re = Regex::builder().build("samwise|sam").unwrap();
769 assert_eq!(Some(Match::must(0, 0..3)), re.find("sam"));
770
771 let re = Regex::builder()
772 .ib(MatchConfig::builder().pinyin(pinyin.shallow_clone()).build())
773 .build("samwise|pyss")
774 .unwrap();
775 assert_eq!(Some(Match::must(0, 0..12)), re.find("拼音搜索"));
776 }
777
778 #[test]
779 fn wildcard() {
780 let re = Regex::builder()
781 .ib(MatchConfig::builder()
782 .pinyin(PinyinMatchConfig::notations(
783 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
784 ))
785 .romaji(RomajiMatchConfig::default())
786 .build())
787 .build("raki.suta")
788 .unwrap();
789
790 assert_eq!(re.find("¥らき☆すた"), Some(Match::must(0, 3..18)));
791
792 let re = Regex::builder()
793 .ib(MatchConfig::builder()
794 .pinyin(PinyinMatchConfig::notations(
795 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
796 ))
797 .build())
798 .build("p.*y.*s.*s")
799 .unwrap();
800
801 assert_eq!(re.find("拼a音b搜c索d"), Some(Match::must(0, 0..15)));
802 }
803
804 #[test]
805 fn mix_lang() {
806 let pinyin = PinyinMatchConfig::notations(
807 PinyinNotation::Ascii | PinyinNotation::AsciiFirstLetter,
808 );
809 let romaji = RomajiMatchConfig::default();
810
811 let re = Regex::builder()
812 .ib(MatchConfig::builder()
813 .pinyin(pinyin.shallow_clone())
814 .romaji(romaji.shallow_clone())
815 .build())
816 .build("pysousuosousounofuri-ren")
817 .unwrap();
818
819 assert_eq!(re.find("拼音搜索葬送のフリーレン"), None);
820
821 let re = Regex::builder()
822 .ib(MatchConfig::builder()
823 .pinyin(pinyin.shallow_clone())
824 .romaji(romaji.shallow_clone())
825 .mix_lang(true)
826 .build())
827 .build("pysousuosousounofuri-ren")
828 .unwrap();
829 assert_eq!(
830 re.find("拼音搜索葬送のフリーレン"),
831 Some(Match::must(0, 0..36)),
832 );
833
834 let re = Regex::builder()
835 .ib(MatchConfig::builder()
836 .pinyin(pinyin.shallow_clone())
837 .romaji(romaji.shallow_clone())
838 .build())
839 .build("(pysousuo)(sousounofuri-ren)")
840 .unwrap();
841
842 assert_eq!(
843 re.find("拼音搜索葬送のフリーレン"),
844 Some(Match::must(0, 0..36)),
845 );
846
847 let re = Regex::builder()
848 .ib(MatchConfig::builder()
849 .pinyin(pinyin.shallow_clone())
850 .romaji(romaji.shallow_clone())
851 .build())
852 .build("pysousuo.*?sousounofuri-ren")
853 .unwrap();
854
855 assert_eq!(
856 re.find("拼音搜索⭐葬送のフリーレン"),
857 Some(Match::must(0, 0..39)),
858 );
859 }
860}