fancy_regex/
lib.rs

1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*!
22An implementation of regexes, supporting a relatively rich set of features, including backreferences
23and lookaround.
24
It builds on top of the excellent [regex] crate. If you are not familiar with it, make sure you
read its documentation first; you may find that you don't even need fancy-regex.
27
If your regex, or a part of it, does not use any special features, the matching is delegated to the
regex crate. That means it has linear runtime. But if you use "fancy" features such as
backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex
can be slow and take exponential time to run because of what is called "catastrophic backtracking".
This depends on the regex and the input.
33
34# Usage
35
36The API should feel very similar to the regex crate, and involves compiling a regex and then using
37it to find matches in text.
38
39## Example: Matching text
40
41An example with backreferences to check if a text consists of two identical words:
42
43```rust
44use fancy_regex::Regex;
45
46let re = Regex::new(r"^(\w+) (\1)$").unwrap();
47let result = re.is_match("foo foo");
48
49assert!(result.is_ok());
50let did_match = result.unwrap();
51assert!(did_match);
52```
53
54Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the
55entire input text.
56
57## Example: Finding the position of matches
58
59```rust
60use fancy_regex::Regex;
61
62let re = Regex::new(r"(\d)\1").unwrap();
63let result = re.find("foo 22");
64
65assert!(result.is_ok(), "execution was successful");
66let match_option = result.unwrap();
67
68assert!(match_option.is_some(), "found a match");
69let m = match_option.unwrap();
70
71assert_eq!(m.start(), 4);
72assert_eq!(m.end(), 6);
73assert_eq!(m.as_str(), "22");
74```
75
76## Example: Capturing groups
77
78```rust
79use fancy_regex::Regex;
80
81let re = Regex::new(r"(?<!AU)\$(\d+)").unwrap();
82let result = re.captures("AU$10, $20");
83
84let captures = result.expect("Error running regex").expect("No match found");
85let group = captures.get(1).expect("No group");
86assert_eq!(group.as_str(), "20");
87```
88
89## Example: Splitting text
90
91```rust
92use fancy_regex::Regex;
93
94let re = Regex::new(r"[ \t]+").unwrap();
95let target = "a b \t  c\td    e";
96let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
97assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
98
99let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
100assert_eq!(fields, vec!["a", "b", "c\td    e"]);
101```
102
103# Syntax
104
105The regex syntax is based on the [regex] crate's, with some additional supported syntax.
106
107Escapes:
108
109`\h`
110: hex digit (`[0-9A-Fa-f]`) \
111`\H`
112: not hex digit (`[^0-9A-Fa-f]`) \
113`\e`
114: escape control character (`\x1B`) \
115`\K`
116: keep text matched so far out of the overall match ([docs](https://www.regular-expressions.info/keep.html))\
117`\G`
118: anchor to where the previous match ended ([docs](https://www.regular-expressions.info/continue.html))\
119`\Z`
120: anchor to the end of the text before any trailing newlines\
121`\O`
122: any character including newline
123
124Backreferences:
125
126`\1`
127: match the exact string that the first capture group matched \
128`\2`
129: backref to the second capture group, etc
130
131Named capture groups:
132
133`(?<name>exp)`
134: match *exp*, creating capture group named *name* \
135`\k<name>`
136: match the exact string that the capture group named *name* matched \
137`(?P<name>exp)`
138: same as `(?<name>exp)` for compatibility with Python, etc. \
139`(?P=name)`
140: same as `\k<name>` for compatibility with Python, etc.
141
142Look-around assertions for matching without changing the current position:
143
144`(?=exp)`
145: look-ahead, succeeds if *exp* matches to the right of the current position \
146`(?!exp)`
147: negative look-ahead, succeeds if *exp* doesn't match to the right \
148`(?<=exp)`
149: look-behind, succeeds if *exp* matches to the left of the current position \
150`(?<!exp)`
151: negative look-behind, succeeds if *exp* doesn't match to the left
152
153Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
154
155```
156# use fancy_regex::Regex;
157let re = Regex::new(r"^a(?>bc|b)c$").unwrap();
158assert!(re.is_match("abcc").unwrap());
159// Doesn't match because `|b` is never tried because of the atomic group
160assert!(!re.is_match("abc").unwrap());
161```
162
163Conditionals - if/then/else:
164
165`(?(1))`
166: continue only if first capture group matched \
167`(?(<name>))`
168: continue only if capture group named *name* matched \
169`(?(1)true_branch|false_branch)`
170: if the first capture group matched then execute the true_branch regex expression, else execute false_branch ([docs](https://www.regular-expressions.info/conditional.html)) \
171`(?(condition)true_branch|false_branch)`
172: if the condition matches then execute the true_branch regex expression, else execute false_branch from the point just before the condition was evaluated
173
174[regex]: https://crates.io/crates/regex
175*/
176
177#![deny(missing_docs)]
178#![deny(missing_debug_implementations)]
179#![cfg_attr(not(feature = "std"), no_std)]
180
181extern crate alloc;
182
183use alloc::borrow::Cow;
184use alloc::boxed::Box;
185use alloc::string::{String, ToString};
186use alloc::sync::Arc;
187use alloc::vec;
188use alloc::vec::Vec;
189
190use core::convert::TryFrom;
191use core::fmt;
192use core::fmt::{Debug, Formatter};
193use core::ops::{Index, Range};
194use core::str::FromStr;
195use regex_automata::meta::Regex as RaRegex;
196use regex_automata::util::captures::Captures as RaCaptures;
197use regex_automata::util::syntax::Config as SyntaxConfig;
198use regex_automata::Input as RaInput;
199
200mod analyze;
201mod compile;
202mod error;
203mod expand;
204mod optimize;
205mod parse;
206mod parse_flags;
207mod replacer;
208mod vm;
209
210use crate::analyze::analyze;
211use crate::analyze::can_compile_as_anchored;
212use crate::compile::compile;
213use crate::optimize::optimize;
214use crate::parse::{ExprTree, NamedGroups, Parser};
215use crate::parse_flags::*;
216use crate::vm::{Prog, OPTION_SKIPPED_EMPTY_MATCH};
217
218pub use crate::error::{CompileError, Error, ParseError, Result, RuntimeError};
219pub use crate::expand::Expander;
220pub use crate::replacer::{NoExpand, Replacer, ReplacerRef};
221
// NOTE(review): not referenced in the visible part of this file; presumably an upper bound on
// nesting/recursion depth — confirm at its use sites.
const MAX_RECURSION: usize = 64;
223
224// the public API
225
/// A builder for a [`Regex`] to allow configuring options.
///
/// Create one with [`RegexBuilder::new`], adjust it through the setter methods (each returns
/// `&mut Self` so calls can be chained), then call [`RegexBuilder::build`] to compile the regex.
#[derive(Debug)]
pub struct RegexBuilder(RegexOptions);
229
/// A compiled regular expression.
///
/// Create one with [`Regex::new`] for default options, or through a [`RegexBuilder`] when
/// options need to be configured.
#[derive(Clone)]
pub struct Regex {
    /// The compiled form: either a delegated regex or a program for the VM (see `RegexImpl`).
    inner: RegexImpl,
    /// Named-group table taken from the parsed expression tree, held behind an `Arc`.
    named_groups: Arc<NamedGroups>,
}
236
// Separate enum because we don't want to expose any of this
#[derive(Clone)]
enum RegexImpl {
    // Do we want to box this? It's pretty big...
    /// Used when analysis reports the pattern as not "hard": matching is delegated to a
    /// `regex_automata` meta regex.
    Wrap {
        /// The delegate regex, compiled from the re-serialized pattern in `debug_pattern`.
        inner: RaRegex,
        /// The options the regex was built with; `options.pattern` is the original pattern.
        options: RegexOptions,
        /// Some optimizations avoid the VM, but need to use an extra capture group to represent the match boundaries
        explicit_capture_group_0: bool,
        /// The pattern string that was handed to the delegate (the expression tree written
        /// back out with this crate's own `to_str`).
        debug_pattern: String,
    },
    /// Used when analysis reports the pattern as "hard": the pattern is compiled to a program
    /// that is executed with `vm::run`.
    Fancy {
        /// The compiled VM program.
        prog: Prog,
        /// Set from the analysis result's `end_group` when the regex is built.
        n_groups: usize,
        /// The options the regex was built with; `options.pattern` is the original pattern.
        options: RegexOptions,
    },
}
254
/// A single match of a regex or group in an input text
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct Match<'t> {
    /// NOTE(review): presumably the whole searched text rather than only the matched part —
    /// confirm where `Match` values are constructed.
    text: &'t str,
    /// Byte offset at which the match starts.
    start: usize,
    /// Byte offset just past the end of the match (`start == end` means an empty match).
    end: usize,
}
262
/// An iterator over all non-overlapping matches for a particular string.
///
/// The iterator yields a `Result<Match>`. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
/// lifetime of the matched string.
#[derive(Debug)]
pub struct Matches<'r, 't> {
    /// The regex being run.
    re: &'r Regex,
    /// The text being searched.
    text: &'t str,
    /// Byte position at which the next search starts. It is set to `text.len() + 1` after an
    /// error so that every later call to `next` returns `None`.
    last_end: usize,
    /// End position of the most recently yielded match; `None` until a match has been yielded.
    last_match: Option<usize>,
}
277
278impl<'r, 't> Matches<'r, 't> {
279    /// Return the text being searched.
280    pub fn text(&self) -> &'t str {
281        self.text
282    }
283
284    /// Return the underlying regex.
285    pub fn regex(&self) -> &'r Regex {
286        self.re
287    }
288}
289
290impl<'r, 't> Iterator for Matches<'r, 't> {
291    type Item = Result<Match<'t>>;
292
293    /// Adapted from the `regex` crate. Calls `find_from_pos` repeatedly.
294    /// Ignores empty matches immediately after a match.
295    fn next(&mut self) -> Option<Self::Item> {
296        if self.last_end > self.text.len() {
297            return None;
298        }
299
300        let option_flags = if let Some(last_match) = self.last_match {
301            if self.last_end > last_match {
302                OPTION_SKIPPED_EMPTY_MATCH
303            } else {
304                0
305            }
306        } else {
307            0
308        };
309        let mat =
310            match self
311                .re
312                .find_from_pos_with_option_flags(self.text, self.last_end, option_flags)
313            {
314                Err(error) => {
315                    // Stop on first error: If an error is encountered, return it, and set the "last match position"
316                    // to the string length, so that the next next() call will return None, to prevent an infinite loop.
317                    self.last_end = self.text.len() + 1;
318                    return Some(Err(error));
319                }
320                Ok(None) => return None,
321                Ok(Some(mat)) => mat,
322            };
323
324        if mat.start == mat.end {
325            // This is an empty match. To ensure we make progress, start
326            // the next search at the smallest possible starting position
327            // of the next match following this one.
328            self.last_end = next_utf8(self.text, mat.end);
329            // Don't accept empty matches immediately following a match.
330            // Just move on to the next match.
331            if Some(mat.end) == self.last_match {
332                return self.next();
333            }
334        } else {
335            self.last_end = mat.end;
336        }
337
338        self.last_match = Some(mat.end);
339
340        Some(Ok(mat))
341    }
342}
343
/// An iterator that yields all non-overlapping capture groups matching a
/// particular regular expression.
///
/// Each item is a `Result<Captures>`. The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
/// lifetime of the matched string.
// Wraps a `Matches` value and reuses its regex, text and position bookkeeping fields.
#[derive(Debug)]
pub struct CaptureMatches<'r, 't>(Matches<'r, 't>);
353
354impl<'r, 't> CaptureMatches<'r, 't> {
355    /// Return the text being searched.
356    pub fn text(&self) -> &'t str {
357        self.0.text
358    }
359
360    /// Return the underlying regex.
361    pub fn regex(&self) -> &'r Regex {
362        self.0.re
363    }
364}
365
366impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
367    type Item = Result<Captures<'t>>;
368
369    /// Adapted from the `regex` crate. Calls `captures_from_pos` repeatedly.
370    /// Ignores empty matches immediately after a match.
371    fn next(&mut self) -> Option<Self::Item> {
372        if self.0.last_end > self.0.text.len() {
373            return None;
374        }
375
376        let captures = match self.0.re.captures_from_pos(self.0.text, self.0.last_end) {
377            Err(error) => {
378                // Stop on first error: If an error is encountered, return it, and set the "last match position"
379                // to the string length, so that the next next() call will return None, to prevent an infinite loop.
380                self.0.last_end = self.0.text.len() + 1;
381                return Some(Err(error));
382            }
383            Ok(None) => return None,
384            Ok(Some(captures)) => captures,
385        };
386
387        let mat = captures
388            .get(0)
389            .expect("`Captures` is expected to have entire match at 0th position");
390        if mat.start == mat.end {
391            self.0.last_end = next_utf8(self.0.text, mat.end);
392            if Some(mat.end) == self.0.last_match {
393                return self.next();
394            }
395        } else {
396            self.0.last_end = mat.end;
397        }
398
399        self.0.last_match = Some(mat.end);
400
401        Some(Ok(captures))
402    }
403}
404
/// A set of capture groups found for a regex.
#[derive(Debug)]
pub struct Captures<'t> {
    /// Engine-specific capture data (see `CapturesImpl`).
    inner: CapturesImpl<'t>,
    /// Named-group table. NOTE(review): presumably shared with the `Regex` that produced these
    /// captures — confirm where `Captures` is constructed.
    named_groups: Arc<NamedGroups>,
}
411
// Private representation of `Captures`, with one variant per `RegexImpl` variant name.
#[derive(Debug)]
enum CapturesImpl<'t> {
    /// Capture data as reported by `regex_automata`.
    Wrap {
        /// The text the capture locations refer to.
        text: &'t str,
        /// Capture group locations as reported by `regex_automata`.
        locations: RaCaptures,
        /// Some optimizations avoid the VM but need an extra capture group to represent the match boundaries.
        /// Therefore what is actually capture group 1 should be treated as capture group 0, and all other
        /// capture groups should have their index reduced by one as well to line up with what the pattern specifies.
        explicit_capture_group_0: bool,
    },
    /// Capture data for the `Fancy` case.
    Fancy {
        /// The text the saved positions refer to.
        text: &'t str,
        /// NOTE(review): presumably the positions saved by the VM, a start/end pair per group —
        /// the code reading this is outside this view; confirm.
        saves: Vec<usize>,
    },
}
427
/// Iterator for captured groups in order in which they appear in the regex.
///
/// `'c` is the lifetime of the borrowed [`Captures`] and `'t` is the lifetime of the
/// matched string.
#[derive(Debug)]
pub struct SubCaptureMatches<'c, 't> {
    /// The captures being iterated over.
    caps: &'c Captures<'t>,
    /// NOTE(review): presumably the index of the next group to yield — the iterator
    /// implementation is outside this view; confirm.
    i: usize,
}
434
/// An iterator over all substrings delimited by a regex.
///
/// This iterator yields `Result<&'h str>`, where each item is a substring of the
/// target string that is delimited by matches of the regular expression. It stops when there
/// are no more substrings to yield.
///
/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
/// lifetime of the target string being split.
///
/// This iterator can be created by the [`Regex::split`] method.
#[derive(Debug)]
pub struct Split<'r, 'h> {
    /// Iterator over the delimiter matches.
    matches: Matches<'r, 'h>,
    /// Byte offset in `target` where the next substring starts. It is set to
    /// `target.len() + 1` once the final substring has been returned.
    next_start: usize,
    /// The string being split.
    target: &'h str,
}
451
452impl<'r, 'h> Iterator for Split<'r, 'h> {
453    type Item = Result<&'h str>;
454
455    /// Returns the next substring that results from splitting the target string by the regex.
456    ///
457    /// If no more matches are found, returns the remaining part of the string,
458    /// or `None` if all substrings have been yielded.
459    fn next(&mut self) -> Option<Result<&'h str>> {
460        match self.matches.next() {
461            None => {
462                let len = self.target.len();
463                if self.next_start > len {
464                    // No more substrings to return
465                    None
466                } else {
467                    // Return the last part of the target string
468                    // Next call will return None
469                    let part = &self.target[self.next_start..len];
470                    self.next_start = len + 1;
471                    Some(Ok(part))
472                }
473            }
474            // Return the next substring
475            Some(Ok(m)) => {
476                let part = &self.target[self.next_start..m.start()];
477                self.next_start = m.end();
478                Some(Ok(part))
479            }
480            Some(Err(e)) => Some(Err(e)),
481        }
482    }
483}
484
// Marker impl: declares that `Split` keeps returning `None` after it has returned `None` once.
impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
486
/// An iterator over at most `N` substrings delimited by a regex.
///
/// This iterator yields `Result<&'h str>`, where each item is a substring of the
/// target that is delimited by matches of the regular expression. It stops either when
/// there are no more substrings to yield, or after `N` substrings have been yielded.
///
/// The `N`th substring is the remaining part of the target.
///
/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
/// lifetime of the target string being split.
///
/// This iterator can be created by the [`Regex::splitn`] method.
#[derive(Debug)]
pub struct SplitN<'r, 'h> {
    /// The underlying unlimited split iterator.
    splits: Split<'r, 'h>,
    /// How many more substrings may still be yielded; decremented on every call to `next`
    /// until it reaches zero.
    limit: usize,
}
504
505impl<'r, 'h> Iterator for SplitN<'r, 'h> {
506    type Item = Result<&'h str>;
507
508    /// Returns the next substring resulting from splitting the target by the regex,
509    /// limited to `N` splits.
510    ///
511    /// Returns `None` if no more matches are found or if the limit is reached after yielding
512    /// the remaining part of the target.
513    fn next(&mut self) -> Option<Result<&'h str>> {
514        if self.limit == 0 {
515            // Limit reached. No more substrings available.
516            return None;
517        }
518
519        // Decrement the limit for each split.
520        self.limit -= 1;
521        if self.limit > 0 {
522            return self.splits.next();
523        }
524
525        // Nth split
526        let len = self.splits.target.len();
527        if self.splits.next_start > len {
528            // No more substrings available.
529            None
530        } else {
531            // Return the remaining part of the target
532            let start = self.splits.next_start;
533            self.splits.next_start = len + 1;
534            Some(Ok(&self.splits.target[start..len]))
535        }
536    }
537
538    fn size_hint(&self) -> (usize, Option<usize>) {
539        (0, Some(self.limit))
540    }
541}
542
// Marker impl: declares that `SplitN` keeps returning `None` after it has returned `None` once.
impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
544
// Options collected by `RegexBuilder` and consumed when the regex is compiled.
#[derive(Clone, Debug)]
struct RegexOptions {
    /// The regex pattern as given by the caller.
    pattern: String,
    /// Syntax flags (case insensitivity, multi-line, ignore-whitespace, dot-matches-new-line,
    /// Unicode), stored as a `regex_automata` syntax configuration.
    syntaxc: SyntaxConfig,
    /// Backtracking limit configured via `RegexBuilder::backtrack_limit`; `1_000_000` by default.
    backtrack_limit: usize,
    /// Optional size limit configured via `RegexBuilder::delegate_size_limit`.
    delegate_size_limit: Option<usize>,
    /// Optional DFA size limit configured via `RegexBuilder::delegate_dfa_size_limit`.
    delegate_dfa_size_limit: Option<usize>,
    /// Whether Oniguruma-compatible behavior was requested via `RegexBuilder::oniguruma_mode`.
    oniguruma_mode: bool,
}
554
555impl RegexOptions {
556    fn get_flag_value(flag_value: bool, enum_value: u32) -> u32 {
557        if flag_value {
558            enum_value
559        } else {
560            0
561        }
562    }
563
564    fn compute_flags(&self) -> u32 {
565        let insensitive = Self::get_flag_value(self.syntaxc.get_case_insensitive(), FLAG_CASEI);
566        let multiline = Self::get_flag_value(self.syntaxc.get_multi_line(), FLAG_MULTI);
567        let whitespace =
568            Self::get_flag_value(self.syntaxc.get_ignore_whitespace(), FLAG_IGNORE_SPACE);
569        let dotnl = Self::get_flag_value(self.syntaxc.get_dot_matches_new_line(), FLAG_DOTNL);
570        let unicode = Self::get_flag_value(self.syntaxc.get_unicode(), FLAG_UNICODE);
571        let oniguruma_mode = Self::get_flag_value(self.oniguruma_mode, FLAG_ONIGURUMA_MODE);
572
573        insensitive | multiline | whitespace | dotnl | unicode | unicode | oniguruma_mode
574    }
575}
576
577impl Default for RegexOptions {
578    fn default() -> Self {
579        RegexOptions {
580            pattern: String::new(),
581            syntaxc: SyntaxConfig::default(),
582            backtrack_limit: 1_000_000,
583            delegate_size_limit: None,
584            delegate_dfa_size_limit: None,
585            oniguruma_mode: false,
586        }
587    }
588}
589
590impl RegexBuilder {
591    /// Create a new regex builder with a regex pattern.
592    ///
593    /// If the pattern is invalid, the call to `build` will fail later.
594    pub fn new(pattern: &str) -> Self {
595        let mut builder = RegexBuilder(RegexOptions::default());
596        builder.0.pattern = pattern.to_string();
597        builder
598    }
599
600    /// Build the `Regex`.
601    ///
602    /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
603    pub fn build(&self) -> Result<Regex> {
604        Regex::new_options(self.0.clone())
605    }
606
607    fn set_config(&mut self, func: impl Fn(SyntaxConfig) -> SyntaxConfig) -> &mut Self {
608        self.0.syntaxc = func(self.0.syntaxc);
609        self
610    }
611
612    /// Override default case insensitive
613    /// this is to enable/disable casing via builder instead of a flag within
614    /// the raw string provided to the regex builder
615    ///
616    /// Default is false
617    pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
618        self.set_config(|x| x.case_insensitive(yes))
619    }
620
621    /// Enable multi-line regex
622    pub fn multi_line(&mut self, yes: bool) -> &mut Self {
623        self.set_config(|x| x.multi_line(yes))
624    }
625
626    /// Allow ignore whitespace
627    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
628        self.set_config(|x| x.ignore_whitespace(yes))
629    }
630
631    /// Enable or disable the "dot matches any character" flag.
632    /// When this is enabled, `.` will match any character. When it's disabled, then `.` will match any character
633    /// except for a new line character.
634    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
635        self.set_config(|x| x.dot_matches_new_line(yes))
636    }
637
638    /// Enable verbose mode in the regular expression.
639    ///
640    /// The same as ignore_whitespace
641    ///
642    /// When enabled, verbose mode permits insigificant whitespace in many
643    /// places in the regular expression, as well as comments. Comments are
644    /// started using `#` and continue until the end of the line.
645    ///
646    /// By default, this is disabled. It may be selectively enabled in the
647    /// regular expression by using the `x` flag regardless of this setting.
648    pub fn verbose_mode(&mut self, yes: bool) -> &mut Self {
649        self.set_config(|x| x.ignore_whitespace(yes))
650    }
651
652    /// Enable or disable the Unicode flag (`u`) by default.
653    ///
654    /// By default this is **enabled**. It may alternatively be selectively
655    /// disabled in the regular expression itself via the `u` flag.
656    ///
657    /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
658    /// default), a regular expression will fail to parse if Unicode mode is
659    /// disabled and a sub-expression could possibly match invalid UTF-8.
660    ///
661    /// **WARNING**: Unicode mode can greatly increase the size of the compiled
662    /// DFA, which can noticeably impact both memory usage and compilation
663    /// time. This is especially noticeable if your regex contains character
664    /// classes like `\w` that are impacted by whether Unicode is enabled or
665    /// not. If Unicode is not necessary, you are encouraged to disable it.
666    pub fn unicode_mode(&mut self, yes: bool) -> &mut Self {
667        self.set_config(|x| x.unicode(yes))
668    }
669
670    /// Limit for how many times backtracking should be attempted for fancy regexes (where
671    /// backtracking is used). If this limit is exceeded, execution returns an error with
672    /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded).
673    /// This is for preventing a regex with catastrophic backtracking to run for too long.
674    ///
675    /// Default is `1_000_000` (1 million).
676    pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
677        self.0.backtrack_limit = limit;
678        self
679    }
680
681    /// Set the approximate size limit of the compiled regular expression.
682    ///
683    /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
684    /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
685    /// such the actual limit is closer to `<number of delegated regexes> * delegate_size_limit`.
686    pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
687        self.0.delegate_size_limit = Some(limit);
688        self
689    }
690
691    /// Set the approximate size of the cache used by the DFA.
692    ///
693    /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
694    /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
695    /// such the actual limit is closer to `<number of delegated regexes> *
696    /// delegate_dfa_size_limit`.
697    pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
698        self.0.delegate_dfa_size_limit = Some(limit);
699        self
700    }
701
702    /// Attempts to better match [Oniguruma](https://github.com/kkos/oniguruma)'s default behavior
703    ///
704    /// Currently this amounts to changing behavior with:
705    ///
706    /// # Left and right word bounds
707    ///
708    /// `fancy-regex` follows the default of other regex engines such as the `regex` crate itself
709    /// where `\<` and `\>` correspond to a _left_ and _right_ word-bound respectively. This
710    /// differs from Oniguruma's defaults which treat them as matching the literals `<` and `>`.
711    /// When this option is set using `\<` and `\>` in the pattern will match the literals
712    /// `<` and `>` instead of word bounds.
713    ///
714    /// ## Example
715    ///
716    /// ```
717    /// use fancy_regex::{Regex, RegexBuilder};
718    ///
719    /// let haystack = "turbo::<Fish>";
720    /// let regex = r"\<\w*\>";
721    ///
722    /// // By default `\<` and `\>` will match the start and end of a word boundary
723    /// let word_bounds_regex = Regex::new(regex).unwrap();
724    /// let word_bounds = word_bounds_regex.find(haystack).unwrap().unwrap();
725    /// assert_eq!(word_bounds.as_str(), "turbo");
726    ///
727    /// // With the option set they instead match the literal `<` and `>` characters
728    /// let literals_regex = RegexBuilder::new(regex).oniguruma_mode(true).build().unwrap();
729    /// let literals = literals_regex.find(haystack).unwrap().unwrap();
730    /// assert_eq!(literals.as_str(), "<Fish>");
731    /// ```
732    pub fn oniguruma_mode(&mut self, yes: bool) -> &mut Self {
733        self.0.oniguruma_mode = yes;
734        self
735    }
736}
737
738impl fmt::Debug for Regex {
739    /// Shows the original regular expression.
740    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
741        write!(f, "{}", self.as_str())
742    }
743}
744
745impl fmt::Display for Regex {
746    /// Shows the original regular expression
747    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
748        write!(f, "{}", self.as_str())
749    }
750}
751
752impl FromStr for Regex {
753    type Err = Error;
754
755    /// Attempts to parse a string into a regular expression
756    fn from_str(s: &str) -> Result<Regex> {
757        Regex::new(s)
758    }
759}
760
761impl Regex {
762    /// Parse and compile a regex with default options, see `RegexBuilder`.
763    ///
764    /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
765    pub fn new(re: &str) -> Result<Regex> {
766        let options = RegexOptions {
767            pattern: re.to_string(),
768            ..RegexOptions::default()
769        };
770        Self::new_options(options)
771    }
772
773    fn new_options(options: RegexOptions) -> Result<Regex> {
774        let mut tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags())?;
775
776        // try to optimize the expression tree
777        let requires_capture_group_fixup = optimize(&mut tree);
778        let info = analyze(&tree, requires_capture_group_fixup)?;
779
780        if !info.hard {
781            // easy case, wrap regex
782
783            // we do our own to_str because escapes are different
784            // NOTE: there is a good opportunity here to use Hir to avoid regex-automata re-parsing it
785            let mut re_cooked = String::new();
786            tree.expr.to_str(&mut re_cooked, 0);
787            let inner = compile::compile_inner(&re_cooked, &options)?;
788            return Ok(Regex {
789                inner: RegexImpl::Wrap {
790                    inner,
791                    options: RegexOptions {
792                        pattern: options.pattern,
793                        ..options
794                    },
795                    explicit_capture_group_0: requires_capture_group_fixup,
796                    debug_pattern: re_cooked,
797                },
798                named_groups: Arc::new(tree.named_groups),
799            });
800        }
801
802        let prog = compile(&info, can_compile_as_anchored(&tree.expr))?;
803        Ok(Regex {
804            inner: RegexImpl::Fancy {
805                prog,
806                n_groups: info.end_group,
807                options,
808            },
809            named_groups: Arc::new(tree.named_groups),
810        })
811    }
812
813    /// Returns the original string of this regex.
814    pub fn as_str(&self) -> &str {
815        match &self.inner {
816            RegexImpl::Wrap { options, .. } => &options.pattern,
817            RegexImpl::Fancy { options, .. } => &options.pattern,
818        }
819    }
820
821    /// Check if the regex matches the input text.
822    ///
823    /// # Example
824    ///
825    /// Test if some text contains the same word twice:
826    ///
827    /// ```rust
828    /// # use fancy_regex::Regex;
829    ///
830    /// let re = Regex::new(r"(\w+) \1").unwrap();
831    /// assert!(re.is_match("mirror mirror on the wall").unwrap());
832    /// ```
833    pub fn is_match(&self, text: &str) -> Result<bool> {
834        match &self.inner {
835            RegexImpl::Wrap { ref inner, .. } => Ok(inner.is_match(text)),
836            RegexImpl::Fancy {
837                ref prog, options, ..
838            } => {
839                let result = vm::run(prog, text, 0, 0, options)?;
840                Ok(result.is_some())
841            }
842        }
843    }
844
845    /// Returns an iterator for each successive non-overlapping match in `text`.
846    ///
847    /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures_iter()]
848    /// method.
849    ///
850    /// # Example
851    ///
852    /// Find all words followed by an exclamation point:
853    ///
854    /// ```rust
855    /// # use fancy_regex::Regex;
856    ///
857    /// let re = Regex::new(r"\w+(?=!)").unwrap();
858    /// let mut matches = re.find_iter("so fancy! even with! iterators!");
859    /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "fancy");
860    /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "with");
861    /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "iterators");
862    /// assert!(matches.next().is_none());
863    /// ```
864    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
865        Matches {
866            re: self,
867            text,
868            last_end: 0,
869            last_match: None,
870        }
871    }
872
873    /// Find the first match in the input text.
874    ///
875    /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures()]
876    /// method.
877    ///
878    /// # Example
879    ///
880    /// Find a word that is followed by an exclamation point:
881    ///
882    /// ```rust
883    /// # use fancy_regex::Regex;
884    ///
885    /// let re = Regex::new(r"\w+(?=!)").unwrap();
886    /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy");
887    /// ```
888    pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
889        self.find_from_pos(text, 0)
890    }
891
892    /// Returns the first match in `text`, starting from the specified byte position `pos`.
893    ///
894    /// # Examples
895    ///
896    /// Finding match starting at a position:
897    ///
898    /// ```
899    /// # use fancy_regex::Regex;
900    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
901    /// let text = "1 test 123\n2 foo";
902    /// let mat = re.find_from_pos(text, 7).unwrap().unwrap();
903    ///
904    /// assert_eq!(mat.start(), 11);
905    /// assert_eq!(mat.end(), 12);
906    /// ```
907    ///
908    /// Note that in some cases this is not the same as using the `find`
909    /// method and passing a slice of the string, see [Regex::captures_from_pos()] for details.
910    pub fn find_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Match<'t>>> {
911        self.find_from_pos_with_option_flags(text, pos, 0)
912    }
913
914    fn find_from_pos_with_option_flags<'t>(
915        &self,
916        text: &'t str,
917        pos: usize,
918        option_flags: u32,
919    ) -> Result<Option<Match<'t>>> {
920        match &self.inner {
921            RegexImpl::Wrap {
922                inner,
923                explicit_capture_group_0,
924                ..
925            } => {
926                if !*explicit_capture_group_0 {
927                    Ok(inner
928                        .search(&RaInput::new(text).span(pos..text.len()))
929                        .map(|m| Match::new(text, m.start(), m.end())))
930                } else {
931                    let mut locations = inner.create_captures();
932                    inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
933                    Ok(locations.is_match().then(|| {
934                        Match::new(
935                            text,
936                            locations.get_group(1).unwrap().start,
937                            locations.get_group(1).unwrap().end,
938                        )
939                    }))
940                }
941            }
942            RegexImpl::Fancy { prog, options, .. } => {
943                let result = vm::run(prog, text, pos, option_flags, options)?;
944                Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
945            }
946        }
947    }
948
949    /// Returns an iterator over all the non-overlapping capture groups matched in `text`.
950    ///
951    /// # Examples
952    ///
953    /// Finding all matches and capturing parts of each:
954    ///
955    /// ```rust
956    /// # use fancy_regex::Regex;
957    ///
958    /// let re = Regex::new(r"(\d{4})-(\d{2})").unwrap();
959    /// let text = "It was between 2018-04 and 2020-01";
960    /// let mut all_captures = re.captures_iter(text);
961    ///
962    /// let first = all_captures.next().unwrap().unwrap();
963    /// assert_eq!(first.get(1).unwrap().as_str(), "2018");
964    /// assert_eq!(first.get(2).unwrap().as_str(), "04");
965    /// assert_eq!(first.get(0).unwrap().as_str(), "2018-04");
966    ///
967    /// let second = all_captures.next().unwrap().unwrap();
968    /// assert_eq!(second.get(1).unwrap().as_str(), "2020");
969    /// assert_eq!(second.get(2).unwrap().as_str(), "01");
970    /// assert_eq!(second.get(0).unwrap().as_str(), "2020-01");
971    ///
972    /// assert!(all_captures.next().is_none());
973    /// ```
974    pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CaptureMatches<'r, 't> {
975        CaptureMatches(self.find_iter(text))
976    }
977
978    /// Returns the capture groups for the first match in `text`.
979    ///
980    /// If no match is found, then `Ok(None)` is returned.
981    ///
982    /// # Examples
983    ///
984    /// Finding matches and capturing parts of the match:
985    ///
986    /// ```rust
987    /// # use fancy_regex::Regex;
988    ///
989    /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
990    /// let text = "The date was 2018-04-07";
991    /// let captures = re.captures(text).unwrap().unwrap();
992    ///
993    /// assert_eq!(captures.get(1).unwrap().as_str(), "2018");
994    /// assert_eq!(captures.get(2).unwrap().as_str(), "04");
995    /// assert_eq!(captures.get(3).unwrap().as_str(), "07");
996    /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07");
997    /// ```
998    pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
999        self.captures_from_pos(text, 0)
1000    }
1001
1002    /// Returns the capture groups for the first match in `text`, starting from
1003    /// the specified byte position `pos`.
1004    ///
1005    /// # Examples
1006    ///
1007    /// Finding captures starting at a position:
1008    ///
1009    /// ```
1010    /// # use fancy_regex::Regex;
1011    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
1012    /// let text = "1 test 123\n2 foo";
1013    /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap();
1014    ///
1015    /// let group = captures.get(1).unwrap();
1016    /// assert_eq!(group.as_str(), "2");
1017    /// assert_eq!(group.start(), 11);
1018    /// assert_eq!(group.end(), 12);
1019    /// ```
1020    ///
1021    /// Note that in some cases this is not the same as using the `captures`
1022    /// method and passing a slice of the string, see the capture that we get
1023    /// when we do this:
1024    ///
1025    /// ```
1026    /// # use fancy_regex::Regex;
1027    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
1028    /// let text = "1 test 123\n2 foo";
1029    /// let captures = re.captures(&text[7..]).unwrap().unwrap();
1030    /// assert_eq!(captures.get(1).unwrap().as_str(), "123");
1031    /// ```
1032    ///
1033    /// This matched the number "123" because it's at the beginning of the text
1034    /// of the string slice.
1035    ///
1036    pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
1037        let named_groups = self.named_groups.clone();
1038        match &self.inner {
1039            RegexImpl::Wrap {
1040                inner,
1041                explicit_capture_group_0,
1042                ..
1043            } => {
1044                let mut locations = inner.create_captures();
1045                inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
1046                if locations.is_match() {
1047                    Ok(Some(Captures {
1048                        inner: CapturesImpl::Wrap {
1049                            text,
1050                            locations,
1051                            explicit_capture_group_0: *explicit_capture_group_0,
1052                        },
1053                        named_groups,
1054                    }))
1055                } else {
1056                    Ok(None)
1057                }
1058            }
1059            RegexImpl::Fancy {
1060                prog,
1061                n_groups,
1062                options,
1063                ..
1064            } => {
1065                let result = vm::run(prog, text, pos, 0, options)?;
1066                Ok(result.map(|mut saves| {
1067                    saves.truncate(n_groups * 2);
1068                    Captures {
1069                        inner: CapturesImpl::Fancy { text, saves },
1070                        named_groups,
1071                    }
1072                }))
1073            }
1074        }
1075    }
1076
1077    /// Returns the number of captures, including the implicit capture of the entire expression.
1078    pub fn captures_len(&self) -> usize {
1079        match &self.inner {
1080            RegexImpl::Wrap {
1081                inner,
1082                explicit_capture_group_0,
1083                ..
1084            } => inner.captures_len() - if *explicit_capture_group_0 { 1 } else { 0 },
1085            RegexImpl::Fancy { n_groups, .. } => *n_groups,
1086        }
1087    }
1088
1089    /// Returns an iterator over the capture names.
1090    pub fn capture_names(&self) -> CaptureNames<'_> {
1091        let mut names = Vec::new();
1092        names.resize(self.captures_len(), None);
1093        for (name, &i) in self.named_groups.iter() {
1094            names[i] = Some(name.as_str());
1095        }
1096        CaptureNames(names.into_iter())
1097    }
1098
1099    // for debugging only
1100    #[doc(hidden)]
1101    pub fn debug_print(&self, writer: &mut Formatter<'_>) -> fmt::Result {
1102        match &self.inner {
1103            RegexImpl::Wrap {
1104                debug_pattern,
1105                explicit_capture_group_0,
1106                ..
1107            } => {
1108                write!(
1109                    writer,
1110                    "wrapped Regex {:?}, explicit_capture_group_0: {:}",
1111                    debug_pattern, *explicit_capture_group_0
1112                )
1113            }
1114            RegexImpl::Fancy { prog, .. } => prog.debug_print(writer),
1115        }
1116    }
1117
1118    /// Replaces the leftmost-first match with the replacement provided.
1119    /// The replacement can be a regular string (where `$N` and `$name` are
1120    /// expanded to match capture groups) or a function that takes the matches'
1121    /// `Captures` and returns the replaced string.
1122    ///
1123    /// If no match is found, then a copy of the string is returned unchanged.
1124    ///
1125    /// # Replacement string syntax
1126    ///
/// All instances of `$name` in the replacement text are replaced with the
/// corresponding capture group `name`.
1129    ///
1130    /// `name` may be an integer corresponding to the index of the
1131    /// capture group (counted by order of opening parenthesis where `0` is the
1132    /// entire match) or it can be a name (consisting of letters, digits or
1133    /// underscores) corresponding to a named capture group.
1134    ///
1135    /// If `name` isn't a valid capture group (whether the name doesn't exist
1136    /// or isn't a valid index), then it is replaced with the empty string.
1137    ///
1138    /// The longest possible name is used. e.g., `$1a` looks up the capture
1139    /// group named `1a` and not the capture group at index `1`. To exert more
1140    /// precise control over the name, use braces, e.g., `${1}a`.
1141    ///
1142    /// To write a literal `$` use `$$`.
1143    ///
1144    /// # Examples
1145    ///
1146    /// Note that this function is polymorphic with respect to the replacement.
1147    /// In typical usage, this can just be a normal string:
1148    ///
1149    /// ```rust
1150    /// # use fancy_regex::Regex;
1151    /// let re = Regex::new("[^01]+").unwrap();
1152    /// assert_eq!(re.replace("1078910", ""), "1010");
1153    /// ```
1154    ///
1155    /// But anything satisfying the `Replacer` trait will work. For example,
1156    /// a closure of type `|&Captures| -> String` provides direct access to the
1157    /// captures corresponding to a match. This allows one to access
1158    /// capturing group matches easily:
1159    ///
1160    /// ```rust
1161    /// # use fancy_regex::{Regex, Captures};
1162    /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
1163    /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
1164    ///     format!("{} {}", &caps[2], &caps[1])
1165    /// });
1166    /// assert_eq!(result, "Bruce Springsteen");
1167    /// ```
1168    ///
1169    /// But this is a bit cumbersome to use all the time. Instead, a simple
1170    /// syntax is supported that expands `$name` into the corresponding capture
1171    /// group. Here's the last example, but using this expansion technique
1172    /// with named capture groups:
1173    ///
1174    /// ```rust
1175    /// # use fancy_regex::Regex;
1176    /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
1177    /// let result = re.replace("Springsteen, Bruce", "$first $last");
1178    /// assert_eq!(result, "Bruce Springsteen");
1179    /// ```
1180    ///
1181    /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
1182    /// would produce the same result. To write a literal `$` use `$$`.
1183    ///
1184    /// Sometimes the replacement string requires use of curly braces to
1185    /// delineate a capture group replacement and surrounding literal text.
1186    /// For example, if we wanted to join two words together with an
1187    /// underscore:
1188    ///
1189    /// ```rust
1190    /// # use fancy_regex::Regex;
1191    /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
1192    /// let result = re.replace("deep fried", "${first}_$second");
1193    /// assert_eq!(result, "deep_fried");
1194    /// ```
1195    ///
1196    /// Without the curly braces, the capture group name `first_` would be
1197    /// used, and since it doesn't exist, it would be replaced with the empty
1198    /// string.
1199    ///
1200    /// Finally, sometimes you just want to replace a literal string with no
/// regard for capturing group expansion. This can be done by wrapping a
/// string with `NoExpand`:
1203    ///
1204    /// ```rust
1205    /// # use fancy_regex::Regex;
1206    /// use fancy_regex::NoExpand;
1207    ///
1208    /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
1209    /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
1210    /// assert_eq!(result, "$2 $last");
1211    /// ```
1212    pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1213        self.replacen(text, 1, rep)
1214    }
1215
1216    /// Replaces all non-overlapping matches in `text` with the replacement
1217    /// provided. This is the same as calling `replacen` with `limit` set to
1218    /// `0`.
1219    ///
1220    /// See the documentation for `replace` for details on how to access
1221    /// capturing group matches in the replacement string.
1222    pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1223        self.replacen(text, 0, rep)
1224    }
1225
1226    /// Replaces at most `limit` non-overlapping matches in `text` with the
1227    /// replacement provided. If `limit` is 0, then all non-overlapping matches
1228    /// are replaced.
1229    ///
1230    /// Will panic if any errors are encountered. Use `try_replacen`, which this
1231    /// function unwraps, if you want to handle errors.
1232    ///
1233    /// See the documentation for `replace` for details on how to access
1234    /// capturing group matches in the replacement string.
1235    ///
1236    pub fn replacen<'t, R: Replacer>(&self, text: &'t str, limit: usize, rep: R) -> Cow<'t, str> {
1237        self.try_replacen(text, limit, rep).unwrap()
1238    }
1239
1240    /// Replaces at most `limit` non-overlapping matches in `text` with the
1241    /// replacement provided. If `limit` is 0, then all non-overlapping matches
1242    /// are replaced.
1243    ///
1244    /// Propagates any errors encountered, such as `RuntimeError::BacktrackLimitExceeded`.
1245    ///
1246    /// See the documentation for `replace` for details on how to access
1247    /// capturing group matches in the replacement string.
1248    pub fn try_replacen<'t, R: Replacer>(
1249        &self,
1250        text: &'t str,
1251        limit: usize,
1252        mut rep: R,
1253    ) -> Result<Cow<'t, str>> {
1254        // If we know that the replacement doesn't have any capture expansions,
1255        // then we can fast path. The fast path can make a tremendous
1256        // difference:
1257        //
1258        //   1) We use `find_iter` instead of `captures_iter`. Not asking for
1259        //      captures generally makes the regex engines faster.
1260        //   2) We don't need to look up all of the capture groups and do
1261        //      replacements inside the replacement string. We just push it
1262        //      at each match and be done with it.
1263        if let Some(rep) = rep.no_expansion() {
1264            let mut it = self.find_iter(text).enumerate().peekable();
1265            if it.peek().is_none() {
1266                return Ok(Cow::Borrowed(text));
1267            }
1268            let mut new = String::with_capacity(text.len());
1269            let mut last_match = 0;
1270            for (i, m) in it {
1271                let m = m?;
1272
1273                if limit > 0 && i >= limit {
1274                    break;
1275                }
1276                new.push_str(&text[last_match..m.start()]);
1277                new.push_str(&rep);
1278                last_match = m.end();
1279            }
1280            new.push_str(&text[last_match..]);
1281            return Ok(Cow::Owned(new));
1282        }
1283
1284        // The slower path, which we use if the replacement needs access to
1285        // capture groups.
1286        let mut it = self.captures_iter(text).enumerate().peekable();
1287        if it.peek().is_none() {
1288            return Ok(Cow::Borrowed(text));
1289        }
1290        let mut new = String::with_capacity(text.len());
1291        let mut last_match = 0;
1292        for (i, cap) in it {
1293            let cap = cap?;
1294
1295            if limit > 0 && i >= limit {
1296                break;
1297            }
1298            // unwrap on 0 is OK because captures only reports matches
1299            let m = cap.get(0).unwrap();
1300            new.push_str(&text[last_match..m.start()]);
1301            rep.replace_append(&cap, &mut new);
1302            last_match = m.end();
1303        }
1304        new.push_str(&text[last_match..]);
1305        Ok(Cow::Owned(new))
1306    }
1307
1308    /// Splits the string by matches of the regex.
1309    ///
1310    /// Returns an iterator over the substrings of the target string
1311    ///  that *aren't* matched by the regex.
1312    ///
1313    /// # Example
1314    ///
1315    /// To split a string delimited by arbitrary amounts of spaces or tabs:
1316    ///
1317    /// ```rust
1318    /// # use fancy_regex::Regex;
1319    /// let re = Regex::new(r"[ \t]+").unwrap();
1320    /// let target = "a b \t  c\td    e";
1321    /// let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
1322    /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
1323    /// ```
1324    pub fn split<'r, 'h>(&'r self, target: &'h str) -> Split<'r, 'h> {
1325        Split {
1326            matches: self.find_iter(target),
1327            next_start: 0,
1328            target,
1329        }
1330    }
1331
1332    /// Splits the string by matches of the regex at most `limit` times.
1333    ///
1334    /// Returns an iterator over the substrings of the target string
1335    /// that *aren't* matched by the regex.
1336    ///
/// The `limit`th substring is the remaining part of the target.
1338    ///
1339    /// # Example
1340    ///
1341    /// To split a string delimited by arbitrary amounts of spaces or tabs
1342    /// 3 times:
1343    ///
1344    /// ```rust
1345    /// # use fancy_regex::Regex;
1346    /// let re = Regex::new(r"[ \t]+").unwrap();
1347    /// let target = "a b \t  c\td    e";
1348    /// let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
1349    /// assert_eq!(fields, vec!["a", "b", "c\td    e"]);
1350    /// ```
1351    pub fn splitn<'r, 'h>(&'r self, target: &'h str, limit: usize) -> SplitN<'r, 'h> {
1352        SplitN {
1353            splits: self.split(target),
1354            limit,
1355        }
1356    }
1357}
1358
1359impl TryFrom<&str> for Regex {
1360    type Error = Error;
1361
1362    /// Attempts to parse a string into a regular expression
1363    fn try_from(s: &str) -> Result<Self> {
1364        Self::new(s)
1365    }
1366}
1367
1368impl TryFrom<String> for Regex {
1369    type Error = Error;
1370
1371    /// Attempts to parse a string into a regular expression
1372    fn try_from(s: String) -> Result<Self> {
1373        Self::new(&s)
1374    }
1375}
1376
1377impl<'t> Match<'t> {
1378    /// Returns the starting byte offset of the match in the text.
1379    #[inline]
1380    pub fn start(&self) -> usize {
1381        self.start
1382    }
1383
1384    /// Returns the ending byte offset of the match in the text.
1385    #[inline]
1386    pub fn end(&self) -> usize {
1387        self.end
1388    }
1389
1390    /// Returns the range over the starting and ending byte offsets of the match in text.
1391    #[inline]
1392    pub fn range(&self) -> Range<usize> {
1393        self.start..self.end
1394    }
1395
1396    /// Returns the matched text.
1397    #[inline]
1398    pub fn as_str(&self) -> &'t str {
1399        &self.text[self.start..self.end]
1400    }
1401
1402    /// Creates a new match from the given text and byte offsets.
1403    fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
1404        Match { text, start, end }
1405    }
1406}
1407
1408impl<'t> From<Match<'t>> for &'t str {
1409    fn from(m: Match<'t>) -> &'t str {
1410        m.as_str()
1411    }
1412}
1413
1414impl<'t> From<Match<'t>> for Range<usize> {
1415    fn from(m: Match<'t>) -> Range<usize> {
1416        m.range()
1417    }
1418}
1419
1420#[allow(clippy::len_without_is_empty)] // follow regex's API
1421impl<'t> Captures<'t> {
1422    /// Get the capture group by its index in the regex.
1423    ///
1424    /// If there is no match for that group or the index does not correspond to a group, `None` is
1425    /// returned. The index 0 returns the whole match.
1426    pub fn get(&self, i: usize) -> Option<Match<'t>> {
1427        match &self.inner {
1428            CapturesImpl::Wrap {
1429                text,
1430                locations,
1431                explicit_capture_group_0,
1432            } => locations
1433                .get_group(i + if *explicit_capture_group_0 { 1 } else { 0 })
1434                .map(|span| Match {
1435                    text,
1436                    start: span.start,
1437                    end: span.end,
1438                }),
1439            CapturesImpl::Fancy { text, ref saves } => {
1440                let slot = i * 2;
1441                if slot >= saves.len() {
1442                    return None;
1443                }
1444                let lo = saves[slot];
1445                if lo == usize::MAX {
1446                    return None;
1447                }
1448                let hi = saves[slot + 1];
1449                Some(Match {
1450                    text,
1451                    start: lo,
1452                    end: hi,
1453                })
1454            }
1455        }
1456    }
1457
1458    /// Returns the match for a named capture group.  Returns `None` the capture
1459    /// group did not match or if there is no group with the given name.
1460    pub fn name(&self, name: &str) -> Option<Match<'t>> {
1461        self.named_groups.get(name).and_then(|i| self.get(*i))
1462    }
1463
1464    /// Expands all instances of `$group` in `replacement` to the corresponding
1465    /// capture group `name`, and writes them to the `dst` buffer given.
1466    ///
1467    /// `group` may be an integer corresponding to the index of the
1468    /// capture group (counted by order of opening parenthesis where `\0` is the
1469    /// entire match) or it can be a name (consisting of letters, digits or
1470    /// underscores) corresponding to a named capture group.
1471    ///
1472    /// If `group` isn't a valid capture group (whether the name doesn't exist
1473    /// or isn't a valid index), then it is replaced with the empty string.
1474    ///
1475    /// The longest possible name is used. e.g., `$1a` looks up the capture
1476    /// group named `1a` and not the capture group at index `1`. To exert more
1477    /// precise control over the name, use braces, e.g., `${1}a`.
1478    ///
1479    /// To write a literal `$`, use `$$`.
1480    ///
1481    /// For more control over expansion, see [`Expander`].
1482    ///
1483    /// [`Expander`]: expand/struct.Expander.html
1484    pub fn expand(&self, replacement: &str, dst: &mut String) {
1485        Expander::default().append_expansion(dst, replacement, self);
1486    }
1487
1488    /// Iterate over the captured groups in order in which they appeared in the regex. The first
1489    /// capture corresponds to the whole match.
1490    pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
1491        SubCaptureMatches { caps: self, i: 0 }
1492    }
1493
1494    /// How many groups were captured. This is always at least 1 because group 0 returns the whole
1495    /// match.
1496    pub fn len(&self) -> usize {
1497        match &self.inner {
1498            CapturesImpl::Wrap {
1499                locations,
1500                explicit_capture_group_0,
1501                ..
1502            } => locations.group_len() - if *explicit_capture_group_0 { 1 } else { 0 },
1503            CapturesImpl::Fancy { saves, .. } => saves.len() / 2,
1504        }
1505    }
1506}
1507
1508/// Get a group by index.
1509///
1510/// `'t` is the lifetime of the matched text.
1511///
1512/// The text can't outlive the `Captures` object if this method is
1513/// used, because of how `Index` is defined (normally `a[i]` is part
1514/// of `a` and can't outlive it); to do that, use `get()` instead.
1515///
1516/// # Panics
1517///
1518/// If there is no group at the given index.
1519impl<'t> Index<usize> for Captures<'t> {
1520    type Output = str;
1521
1522    fn index(&self, i: usize) -> &str {
1523        self.get(i)
1524            .map(|m| m.as_str())
1525            .unwrap_or_else(|| panic!("no group at index '{}'", i))
1526    }
1527}
1528
1529/// Get a group by name.
1530///
1531/// `'t` is the lifetime of the matched text and `'i` is the lifetime
1532/// of the group name (the index).
1533///
1534/// The text can't outlive the `Captures` object if this method is
1535/// used, because of how `Index` is defined (normally `a[i]` is part
1536/// of `a` and can't outlive it); to do that, use `name` instead.
1537///
1538/// # Panics
1539///
1540/// If there is no group named by the given value.
1541impl<'t, 'i> Index<&'i str> for Captures<'t> {
1542    type Output = str;
1543
1544    fn index<'a>(&'a self, name: &'i str) -> &'a str {
1545        self.name(name)
1546            .map(|m| m.as_str())
1547            .unwrap_or_else(|| panic!("no group named '{}'", name))
1548    }
1549}
1550
1551impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
1552    type Item = Option<Match<'t>>;
1553
1554    fn next(&mut self) -> Option<Option<Match<'t>>> {
1555        if self.i < self.caps.len() {
1556            let result = self.caps.get(self.i);
1557            self.i += 1;
1558            Some(result)
1559        } else {
1560            None
1561        }
1562    }
1563}
1564
1565// TODO: might be nice to implement ExactSizeIterator etc for SubCaptures
1566
1567/// Regular expression AST. This is public for now but may change.
1568#[derive(Debug, PartialEq, Eq, Clone)]
1569pub enum Expr {
1570    /// An empty expression, e.g. the last branch in `(a|b|)`
1571    Empty,
1572    /// Any character, regex `.`
1573    Any {
1574        /// Whether it also matches newlines or not
1575        newline: bool,
1576    },
1577    /// An assertion
1578    Assertion(Assertion),
1579    /// The string as a literal, e.g. `a`
1580    Literal {
1581        /// The string to match
1582        val: String,
1583        /// Whether match is case-insensitive or not
1584        casei: bool,
1585    },
1586    /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of
1587    /// the literal `a` and `.` for any character
1588    Concat(Vec<Expr>),
1589    /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative
1590    /// where either the literal `a` or `b` must match
1591    Alt(Vec<Expr>),
1592    /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures"
1593    /// (remembers) the match
1594    Group(Box<Expr>),
1595    /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g.
1596    /// `(?=a)` means the next character must be `a` (but the match is not consumed)
1597    LookAround(Box<Expr>, LookAround),
1598    /// Repeat of an expression, e.g. `a*` or `a+` or `a{1,3}`
1599    Repeat {
1600        /// The expression that is being repeated
1601        child: Box<Expr>,
1602        /// The minimum number of repetitions
1603        lo: usize,
1604        /// The maximum number of repetitions (or `usize::MAX`)
1605        hi: usize,
1606        /// Greedy means as much as possible is matched, e.g. `.*b` would match all of `abab`.
1607        /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`.
1608        greedy: bool,
1609    },
1610    /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have
1611    /// to represent all the expressions in the AST, e.g. character classes.
1612    Delegate {
1613        /// The regex
1614        inner: String,
1615        /// How many characters the regex matches
1616        size: usize, // TODO: move into analysis result
1617        /// Whether the matching is case-insensitive or not
1618        casei: bool,
1619    },
1620    /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group
1621    /// and the whole regex matches either `abcabc` or `defdef`.
1622    Backref {
1623        /// The capture group number being referenced
1624        group: usize,
1625        /// Whether the matching is case-insensitive or not
1626        casei: bool,
1627    },
1628    /// Back reference to a capture group at the given specified relative recursion level.
1629    BackrefWithRelativeRecursionLevel {
1630        /// The capture group number being referenced
1631        group: usize,
1632        /// Relative recursion level
1633        relative_level: isize,
1634        /// Whether the matching is case-insensitive or not
1635        casei: bool,
1636    },
1637    /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and
1638    /// never backtrack and try `a`, even if matching fails after the atomic group.
1639    AtomicGroup(Box<Expr>),
1640    /// Keep matched text so far out of overall match
1641    KeepOut,
1642    /// Anchor to match at the position where the previous match ended
1643    ContinueFromPreviousMatchEnd,
1644    /// Conditional expression based on whether the numbered capture group matched or not
1645    BackrefExistsCondition(usize),
1646    /// If/Then/Else Condition. If there is no Then/Else, these will just be empty expressions.
1647    Conditional {
1648        /// The conditional expression to evaluate
1649        condition: Box<Expr>,
1650        /// What to execute if the condition is true
1651        true_branch: Box<Expr>,
1652        /// What to execute if the condition is false
1653        false_branch: Box<Expr>,
1654    },
1655    /// Subroutine call to the specified group number
1656    SubroutineCall(usize),
1657    /// Unresolved subroutine call to the specified group name
1658    UnresolvedNamedSubroutineCall {
1659        /// The capture group name
1660        name: String,
1661        /// The position in the original regex pattern where the subroutine call is made
1662        ix: usize,
1663    },
1664}
1665
/// The kinds of look-around assertion that a look-around expression can carry.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LookAround {
    /// Positive look-ahead, written `(?=a)`
    LookAhead,
    /// Negative look-ahead, written `(?!a)`
    LookAheadNeg,
    /// Positive look-behind, written `(?<=a)`
    LookBehind,
    /// Negative look-behind, written `(?<!a)`
    LookBehindNeg,
}
1678
/// An iterator over capture names in a [Regex].  The iterator
/// returns the name of each group, or [None] if the group has
/// no name.  Because capture group 0 cannot have a name, the
/// first item returned is always [None].
pub struct CaptureNames<'r>(vec::IntoIter<Option<&'r str>>);

impl Debug for CaptureNames<'_> {
    /// Writes the fixed placeholder `<CaptureNames>`; the names still to be yielded are not
    /// printed.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        f.write_str("<CaptureNames>")
    }
}

impl<'r> Iterator for CaptureNames<'r> {
    type Item = Option<&'r str>;

    /// Yields the name of the next group, or `Some(None)` when that group is unnamed.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next()
    }

    /// Forwards the bounds of the wrapped iterator so that consumers such as `collect` can
    /// reserve the right capacity instead of seeing the default `(0, None)`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }
}
1698
// Hand-rolled decimal formatting of `x` onto the end of `s`. Rolling our own looks silly, but
// the common one-digit case costs nothing more than a single `push`.
fn push_usize(s: &mut String, x: usize) {
    if x < 10 {
        s.push(char::from(b'0' + x as u8));
        return;
    }
    // Emit all leading digits first, then this value's last digit.
    push_usize(s, x / 10);
    s.push(char::from(b'0' + (x % 10) as u8));
}
1709
/// Reports whether `c` is one of the characters that `push_quoted` prefixes with a backslash.
fn is_special(c: char) -> bool {
    matches!(
        c,
        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$' | '#'
    )
}
1717
1718fn push_quoted(buf: &mut String, s: &str) {
1719    for c in s.chars() {
1720        if is_special(c) {
1721            buf.push('\\');
1722        }
1723        buf.push(c);
1724    }
1725}
1726
1727/// Escapes special characters in `text` with '\\'.  Returns a string which, when interpreted
1728/// as a regex, matches exactly `text`.
1729pub fn escape(text: &str) -> Cow<'_, str> {
1730    // Using bytes() is OK because all special characters are single bytes.
1731    match text.bytes().filter(|&b| is_special(b as char)).count() {
1732        0 => Cow::Borrowed(text),
1733        n => {
1734            // The capacity calculation is exact because '\\' is a single byte.
1735            let mut buf = String::with_capacity(text.len() + n);
1736            push_quoted(&mut buf, text);
1737            Cow::Owned(buf)
1738        }
1739    }
1740}
1741
/// The zero-width assertions that can appear in an expression.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Assertion {
    /// Start of input text
    StartText,
    /// End of input text
    EndText,
    /// Start of a line
    StartLine {
        /// CRLF mode
        crlf: bool,
    },
    /// End of a line
    EndLine {
        /// CRLF mode
        crlf: bool,
    },
    /// Left word boundary
    LeftWordBoundary,
    /// Right word boundary
    RightWordBoundary,
    /// Both word boundaries
    WordBoundary,
    /// Not word boundary
    NotWordBoundary,
}

impl Assertion {
    /// Returns true for the four word-boundary assertions and false for the text and line
    /// anchors.
    pub(crate) fn is_hard(&self) -> bool {
        match self {
            // these will make regex-automata use PikeVM
            Assertion::LeftWordBoundary
            | Assertion::RightWordBoundary
            | Assertion::WordBoundary
            | Assertion::NotWordBoundary => true,
            Assertion::StartText
            | Assertion::EndText
            | Assertion::StartLine { .. }
            | Assertion::EndLine { .. } => false,
        }
    }
}
1779
1780impl Expr {
1781    /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
1782    /// that are referenced by backrefs.
1783    pub fn parse_tree(re: &str) -> Result<ExprTree> {
1784        Parser::parse(re)
1785    }
1786
1787    /// Parse the regex and return an expression (AST)
1788    /// Flags should be bit based based on flags
1789    pub fn parse_tree_with_flags(re: &str, flags: u32) -> Result<ExprTree> {
1790        Parser::parse_with_flags(re, flags)
1791    }
1792
1793    /// Convert expression to a regex string in the regex crate's syntax.
1794    ///
1795    /// # Panics
1796    ///
1797    /// Panics for expressions that are hard, i.e. can not be handled by the regex crate.
1798    pub fn to_str(&self, buf: &mut String, precedence: u8) {
1799        match *self {
1800            Expr::Empty => (),
1801            Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
1802            Expr::Literal { ref val, casei } => {
1803                if casei {
1804                    buf.push_str("(?i:");
1805                }
1806                push_quoted(buf, val);
1807                if casei {
1808                    buf.push(')');
1809                }
1810            }
1811            Expr::Assertion(Assertion::StartText) => buf.push('^'),
1812            Expr::Assertion(Assertion::EndText) => buf.push('$'),
1813            Expr::Assertion(Assertion::StartLine { crlf: false }) => buf.push_str("(?m:^)"),
1814            Expr::Assertion(Assertion::EndLine { crlf: false }) => buf.push_str("(?m:$)"),
1815            Expr::Assertion(Assertion::StartLine { crlf: true }) => buf.push_str("(?Rm:^)"),
1816            Expr::Assertion(Assertion::EndLine { crlf: true }) => buf.push_str("(?Rm:$)"),
1817            Expr::Concat(ref children) => {
1818                if precedence > 1 {
1819                    buf.push_str("(?:");
1820                }
1821                for child in children {
1822                    child.to_str(buf, 2);
1823                }
1824                if precedence > 1 {
1825                    buf.push(')')
1826                }
1827            }
1828            Expr::Alt(ref children) => {
1829                if precedence > 0 {
1830                    buf.push_str("(?:");
1831                }
1832                for (i, child) in children.iter().enumerate() {
1833                    if i != 0 {
1834                        buf.push('|');
1835                    }
1836                    child.to_str(buf, 1);
1837                }
1838                if precedence > 0 {
1839                    buf.push(')');
1840                }
1841            }
1842            Expr::Group(ref child) => {
1843                buf.push('(');
1844                child.to_str(buf, 0);
1845                buf.push(')');
1846            }
1847            Expr::Repeat {
1848                ref child,
1849                lo,
1850                hi,
1851                greedy,
1852            } => {
1853                if precedence > 2 {
1854                    buf.push_str("(?:");
1855                }
1856                child.to_str(buf, 3);
1857                match (lo, hi) {
1858                    (0, 1) => buf.push('?'),
1859                    (0, usize::MAX) => buf.push('*'),
1860                    (1, usize::MAX) => buf.push('+'),
1861                    (lo, hi) => {
1862                        buf.push('{');
1863                        push_usize(buf, lo);
1864                        if lo != hi {
1865                            buf.push(',');
1866                            if hi != usize::MAX {
1867                                push_usize(buf, hi);
1868                            }
1869                        }
1870                        buf.push('}');
1871                    }
1872                }
1873                if !greedy {
1874                    buf.push('?');
1875                }
1876                if precedence > 2 {
1877                    buf.push(')');
1878                }
1879            }
1880            Expr::Delegate {
1881                ref inner, casei, ..
1882            } => {
1883                // at the moment, delegate nodes are just atoms
1884                if casei {
1885                    buf.push_str("(?i:");
1886                }
1887                buf.push_str(inner);
1888                if casei {
1889                    buf.push(')');
1890                }
1891            }
1892            _ => panic!("attempting to format hard expr {:?}", self),
1893        }
1894    }
1895}
1896
// precondition: ix > 0
//
// Steps backwards from `ix` and returns the index of the nearest earlier byte that is not a
// UTF-8 continuation byte, i.e. a byte outside 0x80..0xc0.
fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
    let bytes = s.as_bytes();
    ix -= 1;
    // Continuation bytes have the bit pattern 0b10xx_xxxx.
    while (bytes[ix] & 0xc0) == 0x80 {
        ix -= 1;
    }
    ix
}
1909
/// Maps a byte to a UTF-8 sequence length of 1 to 4 based purely on its value.
///
/// The byte is not validated: every value in `0x80..0xe0` yields 2 (continuation bytes
/// included) and every value from `0xf0` upwards yields 4.
fn codepoint_len(b: u8) -> usize {
    match b {
        0x00..=0x7f => 1,
        0x80..=0xdf => 2,
        0xe0..=0xef => 3,
        _ => 4,
    }
}
1918
1919/// Returns the smallest possible index of the next valid UTF-8 sequence
1920/// starting after `i`.
1921/// Adapted from a function with the same name in the `regex` crate.
1922fn next_utf8(text: &str, i: usize) -> usize {
1923    let b = match text.as_bytes().get(i) {
1924        None => return i + 1,
1925        Some(&b) => b,
1926    };
1927    i + codepoint_len(b)
1928}
1929
// `detect_possible_backref` (kept below, commented out, for reference): if it returns false,
// then there is no possible backref in the re.
//
// Both potential implementations are turned off, because we currently
// always need to do a deeper analysis because of 1-character
// look-behind. If we could call a find_from_pos method of regex::Regex,
// it would make sense to bring this back.
1936/*
1937pub fn detect_possible_backref(re: &str) -> bool {
1938    let mut last = b'\x00';
1939    for b in re.as_bytes() {
1940        if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; }
1941        last = *b;
1942    }
1943    false
1944}
1945
1946pub fn detect_possible_backref(re: &str) -> bool {
1947    let mut bytes = re.as_bytes();
1948    loop {
1949        match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) {
1950            Some(i) => {
1951                bytes = &bytes[i + 1..];
1952                let c = bytes[0];
1953                if b'0' <= c && c <= b'9' { return true; }
1954            }
1955            None => return false
1956        }
1957    }
1958}
1959*/
1960
1961/// The internal module only exists so that the toy example can access internals for debugging and
1962/// experimenting.
1963#[doc(hidden)]
1964pub mod internal {
1965    pub use crate::analyze::{analyze, can_compile_as_anchored};
1966    pub use crate::compile::compile;
1967    pub use crate::optimize::optimize;
1968    pub use crate::parse_flags::{
1969        FLAG_CASEI, FLAG_DOTNL, FLAG_IGNORE_SPACE, FLAG_MULTI, FLAG_UNICODE,
1970    };
1971    pub use crate::vm::{run_default, run_trace, Insn, Prog};
1972}
1973
#[cfg(test)]
mod tests {
    use alloc::borrow::Cow;
    use alloc::boxed::Box;
    use alloc::string::String;
    use alloc::{format, vec};

    use crate::parse::make_literal;
    use crate::{Expr, Regex, RegexImpl};

    //use detect_possible_backref;

    // tests for to_str

    /// Renders `e` with precedence 0 into a fresh string.
    fn to_str(e: Expr) -> String {
        let mut rendered = String::new();
        e.to_str(&mut rendered, 0);
        rendered
    }

    #[test]
    fn to_str_concat_alt() {
        let alternation = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let e = Expr::Concat(vec![alternation, make_literal("c")]);
        assert_eq!(to_str(e), "(?:a|b)c");
    }

    #[test]
    fn to_str_rep_concat() {
        let pair = Expr::Concat(vec![make_literal("a"), make_literal("b")]);
        let e = Expr::Repeat {
            child: Box::new(pair),
            lo: 2,
            hi: 3,
            greedy: true,
        };
        assert_eq!(to_str(e), "(?:ab){2,3}");
    }

    #[test]
    fn to_str_group_alt() {
        let alternation = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let e = Expr::Group(Box::new(alternation));
        assert_eq!(to_str(e), "(a|b)");
    }

    #[test]
    fn as_str_debug() {
        let pattern = r"(a+)b\1";
        let re = Regex::new(pattern).unwrap();
        assert_eq!(pattern, re.as_str());
        assert_eq!(pattern, format!("{:?}", re));
    }

    #[test]
    fn display() {
        let pattern = r"(a+)b\1";
        let re = Regex::new(pattern).unwrap();
        assert_eq!(pattern, format!("{}", re));
    }

    #[test]
    fn from_str() {
        let pattern = r"(a+)b\1";
        let re = pattern.parse::<Regex>().unwrap();
        assert_eq!(re.as_str(), pattern);
    }

    #[test]
    fn to_str_repeat() {
        // (lo, hi, greedy, expected rendering of the literal `a` under that repetition)
        let cases = [
            (2, 2, true, "a{2}"),
            (2, 2, false, "a{2}?"),
            (2, 3, true, "a{2,3}"),
            (2, 3, false, "a{2,3}?"),
            (2, usize::MAX, true, "a{2,}"),
            (2, usize::MAX, false, "a{2,}?"),
            (0, 1, true, "a?"),
            (0, 1, false, "a??"),
            (0, usize::MAX, true, "a*"),
            (0, usize::MAX, false, "a*?"),
            (1, usize::MAX, true, "a+"),
            (1, usize::MAX, false, "a+?"),
        ];

        for &(lo, hi, greedy, expected) in cases.iter() {
            let e = Expr::Repeat {
                child: Box::new(make_literal("a")),
                lo,
                hi,
                greedy,
            };
            assert_eq!(to_str(e), expected);
        }
    }

    #[test]
    fn escape() {
        // Check that strings that need no quoting are borrowed, and that non-special punctuation
        // is not quoted.
        if let Cow::Borrowed(s) = crate::escape("@foo") {
            assert_eq!(s, "@foo");
        } else {
            panic!("Value should be borrowed.");
        }

        // Check typical usage.
        assert_eq!(crate::escape("fo*o").into_owned(), "fo\\*o");

        // Check that multibyte characters are handled correctly.
        assert_eq!(crate::escape("fø*ø").into_owned(), "fø\\*ø");
    }

    #[test]
    fn trailing_positive_lookahead_wrap_capture_group_fixup() {
        let pattern = r"a+(?=c)";
        let re = pattern.parse::<Regex>().unwrap();
        assert!(
            matches!(
                re.inner,
                RegexImpl::Wrap {
                    explicit_capture_group_0: true,
                    ..
                }
            ),
            "trailing positive lookahead for an otherwise easy pattern should avoid going through the VM"
        );
        assert_eq!(pattern, re.as_str());
        assert_eq!(pattern, format!("{:?}", re));
    }

    #[test]
    fn easy_regex() {
        let pattern = r"(a+)b";
        let re = pattern.parse::<Regex>().unwrap();
        assert!(
            matches!(
                re.inner,
                RegexImpl::Wrap {
                    explicit_capture_group_0: false,
                    ..
                }
            ),
            "easy pattern should avoid going through the VM, and capture group 0 should be implicit"
        );

        assert_eq!(pattern, re.as_str());
        assert_eq!(pattern, format!("{:?}", re));
    }

    #[test]
    fn hard_regex() {
        let pattern = r"(a+)(?>c)";
        let re = pattern.parse::<Regex>().unwrap();
        assert!(
            matches!(re.inner, RegexImpl::Fancy { .. }),
            "hard regex should be compiled into a VM"
        );
        assert_eq!(pattern, re.as_str());
        assert_eq!(pattern, format!("{:?}", re));
    }

    /*
    #[test]
    fn detect_backref() {
        assert_eq!(detect_possible_backref("a0a1a2"), false);
        assert_eq!(detect_possible_backref("a0a1\\a2"), false);
        assert_eq!(detect_possible_backref("a0a\\1a2"), true);
        assert_eq!(detect_possible_backref("a0a1a2\\"), false);
    }
    */
}
fancy_regex/lib.rs

fancy_regex/
lib.rs