fancy_regex/
lib.rs

1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*!
22An implementation of regexes, supporting a relatively rich set of features, including backreferences
23and lookaround.
24
25It builds on top of the excellent [regex] crate. If you are not
26familiar with it, make sure you read its documentation and maybe you don't even need fancy-regex.
27
28If your regex or parts of it does not use any special features, the matching is delegated to the
29regex crate. That means it has linear runtime. But if you use "fancy" features such as
30backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex
31can be slow and take exponential time to run because of what is called "catastrophic backtracking".
32This depends on the regex and the input.
33
34# Usage
35
36The API should feel very similar to the regex crate, and involves compiling a regex and then using
37it to find matches in text.
38
39## Example: Matching text
40
41An example with backreferences to check if a text consists of two identical words:
42
43```rust
44use fancy_regex::Regex;
45
46let re = Regex::new(r"^(\w+) (\1)$").unwrap();
47let result = re.is_match("foo foo");
48
49assert!(result.is_ok());
50let did_match = result.unwrap();
51assert!(did_match);
52```
53
54Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the
55entire input text.
56
57## Example: Finding the position of matches
58
59```rust
60use fancy_regex::Regex;
61
62let re = Regex::new(r"(\d)\1").unwrap();
63let result = re.find("foo 22");
64
65assert!(result.is_ok(), "execution was successful");
66let match_option = result.unwrap();
67
68assert!(match_option.is_some(), "found a match");
69let m = match_option.unwrap();
70
71assert_eq!(m.start(), 4);
72assert_eq!(m.end(), 6);
73assert_eq!(m.as_str(), "22");
74```
75
76## Example: Capturing groups
77
78```rust
79use fancy_regex::Regex;
80
81let re = Regex::new(r"(?<!AU)\$(\d+)").unwrap();
82let result = re.captures("AU$10, $20");
83
84let captures = result.expect("Error running regex").expect("No match found");
85let group = captures.get(1).expect("No group");
86assert_eq!(group.as_str(), "20");
87```
88
89## Example: Splitting text
90
91```rust
92use fancy_regex::Regex;
93
94let re = Regex::new(r"[ \t]+").unwrap();
95let target = "a b \t  c\td    e";
96let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
97assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
98
99let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
100assert_eq!(fields, vec!["a", "b", "c\td    e"]);
101```
102
103# Syntax
104
105The regex syntax is based on the [regex] crate's, with some additional supported syntax.
106
107Escapes:
108
109`\h`
110: hex digit (`[0-9A-Fa-f]`) \
111`\H`
112: not hex digit (`[^0-9A-Fa-f]`) \
113`\e`
114: escape control character (`\x1B`) \
115`\K`
116: keep text matched so far out of the overall match ([docs](https://www.regular-expressions.info/keep.html))\
117`\G`
118: anchor to where the previous match ended ([docs](https://www.regular-expressions.info/continue.html))\
119`\Z`
120: anchor to the end of the text before any trailing newlines\
121`\O`
122: any character including newline
123
124Backreferences:
125
126`\1`
127: match the exact string that the first capture group matched \
128`\2`
129: backref to the second capture group, etc
130
131Named capture groups:
132
133`(?<name>exp)`
134: match *exp*, creating capture group named *name* \
135`\k<name>`
136: match the exact string that the capture group named *name* matched \
137`(?P<name>exp)`
138: same as `(?<name>exp)` for compatibility with Python, etc. \
139`(?P=name)`
140: same as `\k<name>` for compatibility with Python, etc.
141
142Look-around assertions for matching without changing the current position:
143
144`(?=exp)`
145: look-ahead, succeeds if *exp* matches to the right of the current position \
146`(?!exp)`
147: negative look-ahead, succeeds if *exp* doesn't match to the right \
148`(?<=exp)`
149: look-behind, succeeds if *exp* matches to the left of the current position \
150`(?<!exp)`
151: negative look-behind, succeeds if *exp* doesn't match to the left
152
153Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
154
155```
156# use fancy_regex::Regex;
157let re = Regex::new(r"^a(?>bc|b)c$").unwrap();
158assert!(re.is_match("abcc").unwrap());
159// Doesn't match because `|b` is never tried because of the atomic group
160assert!(!re.is_match("abc").unwrap());
161```
162
163Conditionals - if/then/else:
164
165`(?(1))`
166: continue only if first capture group matched \
167`(?(<name>))`
168: continue only if capture group named *name* matched \
169`(?(1)true_branch|false_branch)`
170: if the first capture group matched then execute the true_branch regex expression, else execute false_branch ([docs](https://www.regular-expressions.info/conditional.html)) \
171`(?(condition)true_branch|false_branch)`
172: if the condition matches then execute the true_branch regex expression, else execute false_branch from the point just before the condition was evaluated
173
174[regex]: https://crates.io/crates/regex
175*/
176
177#![deny(missing_docs)]
178#![deny(missing_debug_implementations)]
179#![cfg_attr(not(feature = "std"), no_std)]
180
181extern crate alloc;
182
183use alloc::borrow::Cow;
184use alloc::boxed::Box;
185use alloc::string::{String, ToString};
186use alloc::sync::Arc;
187use alloc::vec;
188use alloc::vec::Vec;
189
190use core::convert::TryFrom;
191use core::fmt::{Debug, Formatter};
192use core::ops::{Index, Range};
193use core::str::FromStr;
194use core::{fmt, usize};
195use regex_automata::meta::Regex as RaRegex;
196use regex_automata::util::captures::Captures as RaCaptures;
197use regex_automata::util::syntax::Config as SyntaxConfig;
198use regex_automata::Input as RaInput;
199
200mod analyze;
201mod compile;
202mod error;
203mod expand;
204mod flags;
205mod parse;
206mod replacer;
207mod vm;
208
209use crate::analyze::analyze;
210use crate::compile::compile;
211use crate::flags::*;
212use crate::parse::{ExprTree, NamedGroups, Parser};
213use crate::vm::{Prog, OPTION_SKIPPED_EMPTY_MATCH};
214
215pub use crate::error::{CompileError, Error, ParseError, Result, RuntimeError};
216pub use crate::expand::Expander;
217pub use crate::replacer::{NoExpand, Replacer, ReplacerRef};
218
219const MAX_RECURSION: usize = 64;
220
221// the public API
222
223/// A builder for a `Regex` to allow configuring options.
224#[derive(Debug)]
225pub struct RegexBuilder(RegexOptions);
226
227/// A compiled regular expression.
228#[derive(Clone)]
229pub struct Regex {
230    inner: RegexImpl,
231    named_groups: Arc<NamedGroups>,
232}
233
234// Separate enum because we don't want to expose any of this
235#[derive(Clone)]
236enum RegexImpl {
237    // Do we want to box this? It's pretty big...
238    Wrap {
239        inner: RaRegex,
240        options: RegexOptions,
241    },
242    Fancy {
243        prog: Prog,
244        n_groups: usize,
245        options: RegexOptions,
246    },
247}
248
249/// A single match of a regex or group in an input text
250#[derive(Copy, Clone, Debug, Eq, PartialEq)]
251pub struct Match<'t> {
252    text: &'t str,
253    start: usize,
254    end: usize,
255}
256
257/// An iterator over all non-overlapping matches for a particular string.
258///
259/// The iterator yields a `Result<Match>`. The iterator stops when no more
260/// matches can be found.
261///
262/// `'r` is the lifetime of the compiled regular expression and `'t` is the
263/// lifetime of the matched string.
264#[derive(Debug)]
265pub struct Matches<'r, 't> {
266    re: &'r Regex,
267    text: &'t str,
268    last_end: usize,
269    last_match: Option<usize>,
270}
271
272impl<'r, 't> Matches<'r, 't> {
273    /// Return the text being searched.
274    pub fn text(&self) -> &'t str {
275        self.text
276    }
277
278    /// Return the underlying regex.
279    pub fn regex(&self) -> &'r Regex {
280        &self.re
281    }
282}
283
284impl<'r, 't> Iterator for Matches<'r, 't> {
285    type Item = Result<Match<'t>>;
286
287    /// Adapted from the `regex` crate. Calls `find_from_pos` repeatedly.
288    /// Ignores empty matches immediately after a match.
289    fn next(&mut self) -> Option<Self::Item> {
290        if self.last_end > self.text.len() {
291            return None;
292        }
293
294        let option_flags = if let Some(last_match) = self.last_match {
295            if self.last_end > last_match {
296                OPTION_SKIPPED_EMPTY_MATCH
297            } else {
298                0
299            }
300        } else {
301            0
302        };
303        let mat =
304            match self
305                .re
306                .find_from_pos_with_option_flags(self.text, self.last_end, option_flags)
307            {
308                Err(error) => {
309                    // Stop on first error: If an error is encountered, return it, and set the "last match position"
310                    // to the string length, so that the next next() call will return None, to prevent an infinite loop.
311                    self.last_end = self.text.len() + 1;
312                    return Some(Err(error));
313                }
314                Ok(None) => return None,
315                Ok(Some(mat)) => mat,
316            };
317
318        if mat.start == mat.end {
319            // This is an empty match. To ensure we make progress, start
320            // the next search at the smallest possible starting position
321            // of the next match following this one.
322            self.last_end = next_utf8(self.text, mat.end);
323            // Don't accept empty matches immediately following a match.
324            // Just move on to the next match.
325            if Some(mat.end) == self.last_match {
326                return self.next();
327            }
328        } else {
329            self.last_end = mat.end;
330        }
331
332        self.last_match = Some(mat.end);
333
334        Some(Ok(mat))
335    }
336}
337
338/// An iterator that yields all non-overlapping capture groups matching a
339/// particular regular expression.
340///
341/// The iterator stops when no more matches can be found.
342///
343/// `'r` is the lifetime of the compiled regular expression and `'t` is the
344/// lifetime of the matched string.
345#[derive(Debug)]
346pub struct CaptureMatches<'r, 't>(Matches<'r, 't>);
347
348impl<'r, 't> CaptureMatches<'r, 't> {
349    /// Return the text being searched.
350    pub fn text(&self) -> &'t str {
351        self.0.text
352    }
353
354    /// Return the underlying regex.
355    pub fn regex(&self) -> &'r Regex {
356        &self.0.re
357    }
358}
359
360impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
361    type Item = Result<Captures<'t>>;
362
363    /// Adapted from the `regex` crate. Calls `captures_from_pos` repeatedly.
364    /// Ignores empty matches immediately after a match.
365    fn next(&mut self) -> Option<Self::Item> {
366        if self.0.last_end > self.0.text.len() {
367            return None;
368        }
369
370        let captures = match self.0.re.captures_from_pos(self.0.text, self.0.last_end) {
371            Err(error) => {
372                // Stop on first error: If an error is encountered, return it, and set the "last match position"
373                // to the string length, so that the next next() call will return None, to prevent an infinite loop.
374                self.0.last_end = self.0.text.len() + 1;
375                return Some(Err(error));
376            }
377            Ok(None) => return None,
378            Ok(Some(captures)) => captures,
379        };
380
381        let mat = captures
382            .get(0)
383            .expect("`Captures` is expected to have entire match at 0th position");
384        if mat.start == mat.end {
385            self.0.last_end = next_utf8(self.0.text, mat.end);
386            if Some(mat.end) == self.0.last_match {
387                return self.next();
388            }
389        } else {
390            self.0.last_end = mat.end;
391        }
392
393        self.0.last_match = Some(mat.end);
394
395        Some(Ok(captures))
396    }
397}
398
399/// A set of capture groups found for a regex.
400#[derive(Debug)]
401pub struct Captures<'t> {
402    inner: CapturesImpl<'t>,
403    named_groups: Arc<NamedGroups>,
404}
405
406#[derive(Debug)]
407enum CapturesImpl<'t> {
408    Wrap {
409        text: &'t str,
410        locations: RaCaptures,
411    },
412    Fancy {
413        text: &'t str,
414        saves: Vec<usize>,
415    },
416}
417
418/// Iterator for captured groups in order in which they appear in the regex.
419#[derive(Debug)]
420pub struct SubCaptureMatches<'c, 't> {
421    caps: &'c Captures<'t>,
422    i: usize,
423}
424
425/// An iterator over all substrings delimited by a regex.
426///
427/// This iterator yields `Result<&'h str>`, where each item is a substring of the
428/// target string that is delimited by matches of the regular expression. It stops when there
429/// are no more substrings to yield.
430///
431/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
432/// lifetime of the target string being split.
433///
434/// This iterator can be created by the [`Regex::split`] method.
435#[derive(Debug)]
436pub struct Split<'r, 'h> {
437    matches: Matches<'r, 'h>,
438    next_start: usize,
439    target: &'h str,
440}
441
442impl<'r, 'h> Iterator for Split<'r, 'h> {
443    type Item = Result<&'h str>;
444
445    /// Returns the next substring that results from splitting the target string by the regex.
446    ///
447    /// If no more matches are found, returns the remaining part of the string,
448    /// or `None` if all substrings have been yielded.
449    fn next(&mut self) -> Option<Result<&'h str>> {
450        match self.matches.next() {
451            None => {
452                let len = self.target.len();
453                if self.next_start > len {
454                    // No more substrings to return
455                    None
456                } else {
457                    // Return the last part of the target string
458                    // Next call will return None
459                    let part = &self.target[self.next_start..len];
460                    self.next_start = len + 1;
461                    Some(Ok(part))
462                }
463            }
464            // Return the next substring
465            Some(Ok(m)) => {
466                let part = &self.target[self.next_start..m.start()];
467                self.next_start = m.end();
468                Some(Ok(part))
469            }
470            Some(Err(e)) => Some(Err(e)),
471        }
472    }
473}
474
475impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
476
477/// An iterator over at most `N` substrings delimited by a regex.
478///
479/// This iterator yields `Result<&'h str>`, where each item is a substring of the
480/// target that is delimited by matches of the regular expression. It stops either when
481/// there are no more substrings to yield, or after `N` substrings have been yielded.
482///
483/// The `N`th substring is the remaining part of the target.
484///
485/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
486/// lifetime of the target string being split.
487///
488/// This iterator can be created by the [`Regex::splitn`] method.
489#[derive(Debug)]
490pub struct SplitN<'r, 'h> {
491    splits: Split<'r, 'h>,
492    limit: usize,
493}
494
495impl<'r, 'h> Iterator for SplitN<'r, 'h> {
496    type Item = Result<&'h str>;
497
498    /// Returns the next substring resulting from splitting the target by the regex,
499    /// limited to `N` splits.
500    ///
501    /// Returns `None` if no more matches are found or if the limit is reached after yielding
502    /// the remaining part of the target.
503    fn next(&mut self) -> Option<Result<&'h str>> {
504        if self.limit == 0 {
505            // Limit reached. No more substrings available.
506            return None;
507        }
508
509        // Decrement the limit for each split.
510        self.limit -= 1;
511        if self.limit > 0 {
512            return self.splits.next();
513        }
514
515        // Nth split
516        let len = self.splits.target.len();
517        if self.splits.next_start > len {
518            // No more substrings available.
519            return None;
520        } else {
521            // Return the remaining part of the target
522            let start = self.splits.next_start;
523            self.splits.next_start = len + 1;
524            return Some(Ok(&self.splits.target[start..len]));
525        }
526    }
527
528    fn size_hint(&self) -> (usize, Option<usize>) {
529        (0, Some(self.limit))
530    }
531}
532
533impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
534
535#[derive(Clone, Debug)]
536struct RegexOptions {
537    pattern: String,
538    syntaxc: SyntaxConfig,
539    backtrack_limit: usize,
540    delegate_size_limit: Option<usize>,
541    delegate_dfa_size_limit: Option<usize>,
542}
543
544impl RegexOptions {
545    fn get_flag_value(flag_value: bool, enum_value: u32) -> u32 {
546        if flag_value {
547            enum_value
548        } else {
549            0
550        }
551    }
552
553    fn compute_flags(&self) -> u32 {
554        let insensitive = Self::get_flag_value(self.syntaxc.get_case_insensitive(), FLAG_CASEI);
555        let multiline = Self::get_flag_value(self.syntaxc.get_multi_line(), FLAG_MULTI);
556        let whitespace =
557            Self::get_flag_value(self.syntaxc.get_ignore_whitespace(), FLAG_IGNORE_SPACE);
558        let dotnl = Self::get_flag_value(self.syntaxc.get_dot_matches_new_line(), FLAG_DOTNL);
559        let unicode = Self::get_flag_value(self.syntaxc.get_unicode(), FLAG_UNICODE);
560
561        let all_flags = insensitive | multiline | whitespace | dotnl | unicode | unicode;
562        all_flags
563    }
564}
565
566impl Default for RegexOptions {
567    fn default() -> Self {
568        RegexOptions {
569            pattern: String::new(),
570            syntaxc: SyntaxConfig::default(),
571            backtrack_limit: 1_000_000,
572            delegate_size_limit: None,
573            delegate_dfa_size_limit: None,
574        }
575    }
576}
577
578impl RegexBuilder {
579    /// Create a new regex builder with a regex pattern.
580    ///
581    /// If the pattern is invalid, the call to `build` will fail later.
582    pub fn new(pattern: &str) -> Self {
583        let mut builder = RegexBuilder(RegexOptions::default());
584        builder.0.pattern = pattern.to_string();
585        builder
586    }
587
588    /// Build the `Regex`.
589    ///
590    /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
591    pub fn build(&self) -> Result<Regex> {
592        Regex::new_options(self.0.clone())
593    }
594
595    fn set_config(&mut self, func: impl Fn(SyntaxConfig) -> SyntaxConfig) -> &mut Self {
596        self.0.syntaxc = func(self.0.syntaxc);
597        self
598    }
599
600    /// Override default case insensitive
601    /// this is to enable/disable casing via builder instead of a flag within
602    /// the raw string provided to the regex builder
603    ///
604    /// Default is false
605    pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
606        self.set_config(|x| x.case_insensitive(yes))
607    }
608
609    /// Enable multi-line regex
610    pub fn multi_line(&mut self, yes: bool) -> &mut Self {
611        self.set_config(|x| x.multi_line(yes))
612    }
613
614    /// Allow ignore whitespace
615    pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
616        self.set_config(|x| x.ignore_whitespace(yes))
617    }
618
619    /// Enable or disable the "dot matches any character" flag.
620    /// When this is enabled, `.` will match any character. When it's disabled, then `.` will match any character
621    /// except for a new line character.
622    pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
623        self.set_config(|x| x.dot_matches_new_line(yes))
624    }
625
626    /// Enable verbose mode in the regular expression.
627    ///
628    /// The same as ignore_whitespace
629    ///
630    /// When enabled, verbose mode permits insigificant whitespace in many
631    /// places in the regular expression, as well as comments. Comments are
632    /// started using `#` and continue until the end of the line.
633    ///
634    /// By default, this is disabled. It may be selectively enabled in the
635    /// regular expression by using the `x` flag regardless of this setting.
636    pub fn verbose_mode(&mut self, yes: bool) -> &mut Self {
637        self.set_config(|x| x.ignore_whitespace(yes))
638    }
639
640    /// Enable or disable the Unicode flag (`u`) by default.
641    ///
642    /// By default this is **enabled**. It may alternatively be selectively
643    /// disabled in the regular expression itself via the `u` flag.
644    ///
645    /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
646    /// default), a regular expression will fail to parse if Unicode mode is
647    /// disabled and a sub-expression could possibly match invalid UTF-8.
648    ///
649    /// **WARNING**: Unicode mode can greatly increase the size of the compiled
650    /// DFA, which can noticeably impact both memory usage and compilation
651    /// time. This is especially noticeable if your regex contains character
652    /// classes like `\w` that are impacted by whether Unicode is enabled or
653    /// not. If Unicode is not necessary, you are encouraged to disable it.
654    pub fn unicode_mode(&mut self, yes: bool) -> &mut Self {
655        self.set_config(|x| x.unicode(yes))
656    }
657
658    /// Limit for how many times backtracking should be attempted for fancy regexes (where
659    /// backtracking is used). If this limit is exceeded, execution returns an error with
660    /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded).
661    /// This is for preventing a regex with catastrophic backtracking to run for too long.
662    ///
663    /// Default is `1_000_000` (1 million).
664    pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
665        self.0.backtrack_limit = limit;
666        self
667    }
668
669    /// Set the approximate size limit of the compiled regular expression.
670    ///
671    /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
672    /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
673    /// such the actual limit is closer to `<number of delegated regexes> * delegate_size_limit`.
674    pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
675        self.0.delegate_size_limit = Some(limit);
676        self
677    }
678
679    /// Set the approximate size of the cache used by the DFA.
680    ///
681    /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
682    /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
683    /// such the actual limit is closer to `<number of delegated regexes> *
684    /// delegate_dfa_size_limit`.
685    pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
686        self.0.delegate_dfa_size_limit = Some(limit);
687        self
688    }
689}
690
691impl fmt::Debug for Regex {
692    /// Shows the original regular expression.
693    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
694        write!(f, "{}", self.as_str())
695    }
696}
697
698impl fmt::Display for Regex {
699    /// Shows the original regular expression
700    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
701        write!(f, "{}", self.as_str())
702    }
703}
704
705impl FromStr for Regex {
706    type Err = Error;
707
708    /// Attempts to parse a string into a regular expression
709    fn from_str(s: &str) -> Result<Regex> {
710        Regex::new(s)
711    }
712}
713
714impl Regex {
715    /// Parse and compile a regex with default options, see `RegexBuilder`.
716    ///
717    /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
718    pub fn new(re: &str) -> Result<Regex> {
719        let options = RegexOptions {
720            pattern: re.to_string(),
721            ..RegexOptions::default()
722        };
723        Self::new_options(options)
724    }
725
726    fn new_options(options: RegexOptions) -> Result<Regex> {
727        let raw_tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags())?;
728
729        // wrapper to search for re at arbitrary start position,
730        // and to capture the match bounds
731        let tree = wrap_tree(raw_tree);
732
733        let info = analyze(&tree)?;
734
735        let inner_info = &info.children[1].children[0]; // references inner expr
736        if !inner_info.hard {
737            // easy case, wrap regex
738
739            // we do our own to_str because escapes are different
740            let mut re_cooked = String::new();
741            // same as raw_tree.expr above, but it was moved, so traverse to find it
742            let raw_e = match tree.expr {
743                Expr::Concat(ref v) => match v[1] {
744                    Expr::Group(ref child) => child,
745                    _ => unreachable!(),
746                },
747                _ => unreachable!(),
748            };
749            raw_e.to_str(&mut re_cooked, 0);
750            let inner = compile::compile_inner(&re_cooked, &options)?;
751            return Ok(Regex {
752                inner: RegexImpl::Wrap { inner, options },
753                named_groups: Arc::new(tree.named_groups),
754            });
755        }
756
757        let prog = compile(&info)?;
758        Ok(Regex {
759            inner: RegexImpl::Fancy {
760                prog,
761                n_groups: info.end_group,
762                options,
763            },
764            named_groups: Arc::new(tree.named_groups),
765        })
766    }
767
768    /// Returns the original string of this regex.
769    pub fn as_str(&self) -> &str {
770        match &self.inner {
771            RegexImpl::Wrap { options, .. } => &options.pattern,
772            RegexImpl::Fancy { options, .. } => &options.pattern,
773        }
774    }
775
776    /// Check if the regex matches the input text.
777    ///
778    /// # Example
779    ///
780    /// Test if some text contains the same word twice:
781    ///
782    /// ```rust
783    /// # use fancy_regex::Regex;
784    ///
785    /// let re = Regex::new(r"(\w+) \1").unwrap();
786    /// assert!(re.is_match("mirror mirror on the wall").unwrap());
787    /// ```
788    pub fn is_match(&self, text: &str) -> Result<bool> {
789        match &self.inner {
790            RegexImpl::Wrap { ref inner, .. } => Ok(inner.is_match(text)),
791            RegexImpl::Fancy {
792                ref prog, options, ..
793            } => {
794                let result = vm::run(prog, text, 0, 0, options)?;
795                Ok(result.is_some())
796            }
797        }
798    }
799
800    /// Returns an iterator for each successive non-overlapping match in `text`.
801    ///
802    /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures_iter()]
803    /// method.
804    ///
805    /// # Example
806    ///
807    /// Find all words followed by an exclamation point:
808    ///
809    /// ```rust
810    /// # use fancy_regex::Regex;
811    ///
812    /// let re = Regex::new(r"\w+(?=!)").unwrap();
813    /// let mut matches = re.find_iter("so fancy! even with! iterators!");
814    /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "fancy");
815    /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "with");
816    /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "iterators");
817    /// assert!(matches.next().is_none());
818    /// ```
819    pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
820        Matches {
821            re: &self,
822            text,
823            last_end: 0,
824            last_match: None,
825        }
826    }
827
828    /// Find the first match in the input text.
829    ///
830    /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures()]
831    /// method.
832    ///
833    /// # Example
834    ///
835    /// Find a word that is followed by an exclamation point:
836    ///
837    /// ```rust
838    /// # use fancy_regex::Regex;
839    ///
840    /// let re = Regex::new(r"\w+(?=!)").unwrap();
841    /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy");
842    /// ```
843    pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
844        self.find_from_pos(text, 0)
845    }
846
847    /// Returns the first match in `text`, starting from the specified byte position `pos`.
848    ///
849    /// # Examples
850    ///
851    /// Finding match starting at a position:
852    ///
853    /// ```
854    /// # use fancy_regex::Regex;
855    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
856    /// let text = "1 test 123\n2 foo";
857    /// let mat = re.find_from_pos(text, 7).unwrap().unwrap();
858    ///
859    /// assert_eq!(mat.start(), 11);
860    /// assert_eq!(mat.end(), 12);
861    /// ```
862    ///
863    /// Note that in some cases this is not the same as using the `find`
864    /// method and passing a slice of the string, see [Regex::captures_from_pos()] for details.
865    pub fn find_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Match<'t>>> {
866        self.find_from_pos_with_option_flags(text, pos, 0)
867    }
868
869    fn find_from_pos_with_option_flags<'t>(
870        &self,
871        text: &'t str,
872        pos: usize,
873        option_flags: u32,
874    ) -> Result<Option<Match<'t>>> {
875        match &self.inner {
876            RegexImpl::Wrap { inner, .. } => Ok(inner
877                .search(&RaInput::new(text).span(pos..text.len()))
878                .map(|m| Match::new(text, m.start(), m.end()))),
879            RegexImpl::Fancy { prog, options, .. } => {
880                let result = vm::run(prog, text, pos, option_flags, options)?;
881                Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
882            }
883        }
884    }
885
886    /// Returns an iterator over all the non-overlapping capture groups matched in `text`.
887    ///
888    /// # Examples
889    ///
890    /// Finding all matches and capturing parts of each:
891    ///
892    /// ```rust
893    /// # use fancy_regex::Regex;
894    ///
895    /// let re = Regex::new(r"(\d{4})-(\d{2})").unwrap();
896    /// let text = "It was between 2018-04 and 2020-01";
897    /// let mut all_captures = re.captures_iter(text);
898    ///
899    /// let first = all_captures.next().unwrap().unwrap();
900    /// assert_eq!(first.get(1).unwrap().as_str(), "2018");
901    /// assert_eq!(first.get(2).unwrap().as_str(), "04");
902    /// assert_eq!(first.get(0).unwrap().as_str(), "2018-04");
903    ///
904    /// let second = all_captures.next().unwrap().unwrap();
905    /// assert_eq!(second.get(1).unwrap().as_str(), "2020");
906    /// assert_eq!(second.get(2).unwrap().as_str(), "01");
907    /// assert_eq!(second.get(0).unwrap().as_str(), "2020-01");
908    ///
909    /// assert!(all_captures.next().is_none());
910    /// ```
911    pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CaptureMatches<'r, 't> {
912        CaptureMatches(self.find_iter(text))
913    }
914
915    /// Returns the capture groups for the first match in `text`.
916    ///
917    /// If no match is found, then `Ok(None)` is returned.
918    ///
919    /// # Examples
920    ///
921    /// Finding matches and capturing parts of the match:
922    ///
923    /// ```rust
924    /// # use fancy_regex::Regex;
925    ///
926    /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
927    /// let text = "The date was 2018-04-07";
928    /// let captures = re.captures(text).unwrap().unwrap();
929    ///
930    /// assert_eq!(captures.get(1).unwrap().as_str(), "2018");
931    /// assert_eq!(captures.get(2).unwrap().as_str(), "04");
932    /// assert_eq!(captures.get(3).unwrap().as_str(), "07");
933    /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07");
934    /// ```
935    pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
936        self.captures_from_pos(text, 0)
937    }
938
939    /// Returns the capture groups for the first match in `text`, starting from
940    /// the specified byte position `pos`.
941    ///
942    /// # Examples
943    ///
944    /// Finding captures starting at a position:
945    ///
946    /// ```
947    /// # use fancy_regex::Regex;
948    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
949    /// let text = "1 test 123\n2 foo";
950    /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap();
951    ///
952    /// let group = captures.get(1).unwrap();
953    /// assert_eq!(group.as_str(), "2");
954    /// assert_eq!(group.start(), 11);
955    /// assert_eq!(group.end(), 12);
956    /// ```
957    ///
958    /// Note that in some cases this is not the same as using the `captures`
959    /// method and passing a slice of the string, see the capture that we get
960    /// when we do this:
961    ///
962    /// ```
963    /// # use fancy_regex::Regex;
964    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
965    /// let text = "1 test 123\n2 foo";
966    /// let captures = re.captures(&text[7..]).unwrap().unwrap();
967    /// assert_eq!(captures.get(1).unwrap().as_str(), "123");
968    /// ```
969    ///
970    /// This matched the number "123" because it's at the beginning of the text
971    /// of the string slice.
972    ///
973    pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
974        let named_groups = self.named_groups.clone();
975        match &self.inner {
976            RegexImpl::Wrap { inner, .. } => {
977                let mut locations = inner.create_captures();
978                inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
979                Ok(locations.is_match().then(|| Captures {
980                    inner: CapturesImpl::Wrap { text, locations },
981                    named_groups,
982                }))
983            }
984            RegexImpl::Fancy {
985                prog,
986                n_groups,
987                options,
988                ..
989            } => {
990                let result = vm::run(prog, text, pos, 0, options)?;
991                Ok(result.map(|mut saves| {
992                    saves.truncate(n_groups * 2);
993                    Captures {
994                        inner: CapturesImpl::Fancy { text, saves },
995                        named_groups,
996                    }
997                }))
998            }
999        }
1000    }
1001
1002    /// Returns the number of captures, including the implicit capture of the entire expression.
1003    pub fn captures_len(&self) -> usize {
1004        match &self.inner {
1005            RegexImpl::Wrap { inner, .. } => inner.captures_len(),
1006            RegexImpl::Fancy { n_groups, .. } => *n_groups,
1007        }
1008    }
1009
1010    /// Returns an iterator over the capture names.
1011    pub fn capture_names(&self) -> CaptureNames {
1012        let mut names = Vec::new();
1013        names.resize(self.captures_len(), None);
1014        for (name, &i) in self.named_groups.iter() {
1015            names[i] = Some(name.as_str());
1016        }
1017        CaptureNames(names.into_iter())
1018    }
1019
1020    // for debugging only
1021    #[doc(hidden)]
1022    pub fn debug_print(&self, writer: &mut Formatter<'_>) -> fmt::Result {
1023        match &self.inner {
1024            RegexImpl::Wrap { options, .. } => {
1025                write!(writer, "wrapped Regex {:?}", options.pattern)
1026            }
1027            RegexImpl::Fancy { prog, .. } => prog.debug_print(writer),
1028        }
1029    }
1030
1031    /// Replaces the leftmost-first match with the replacement provided.
1032    /// The replacement can be a regular string (where `$N` and `$name` are
1033    /// expanded to match capture groups) or a function that takes the matches'
1034    /// `Captures` and returns the replaced string.
1035    ///
1036    /// If no match is found, then a copy of the string is returned unchanged.
1037    ///
1038    /// # Replacement string syntax
1039    ///
1040    /// All instances of `$name` in the replacement text is replaced with the
1041    /// corresponding capture group `name`.
1042    ///
1043    /// `name` may be an integer corresponding to the index of the
1044    /// capture group (counted by order of opening parenthesis where `0` is the
1045    /// entire match) or it can be a name (consisting of letters, digits or
1046    /// underscores) corresponding to a named capture group.
1047    ///
1048    /// If `name` isn't a valid capture group (whether the name doesn't exist
1049    /// or isn't a valid index), then it is replaced with the empty string.
1050    ///
1051    /// The longest possible name is used. e.g., `$1a` looks up the capture
1052    /// group named `1a` and not the capture group at index `1`. To exert more
1053    /// precise control over the name, use braces, e.g., `${1}a`.
1054    ///
1055    /// To write a literal `$` use `$$`.
1056    ///
1057    /// # Examples
1058    ///
1059    /// Note that this function is polymorphic with respect to the replacement.
1060    /// In typical usage, this can just be a normal string:
1061    ///
1062    /// ```rust
1063    /// # use fancy_regex::Regex;
1064    /// let re = Regex::new("[^01]+").unwrap();
1065    /// assert_eq!(re.replace("1078910", ""), "1010");
1066    /// ```
1067    ///
1068    /// But anything satisfying the `Replacer` trait will work. For example,
1069    /// a closure of type `|&Captures| -> String` provides direct access to the
1070    /// captures corresponding to a match. This allows one to access
1071    /// capturing group matches easily:
1072    ///
1073    /// ```rust
1074    /// # use fancy_regex::{Regex, Captures};
1075    /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
1076    /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
1077    ///     format!("{} {}", &caps[2], &caps[1])
1078    /// });
1079    /// assert_eq!(result, "Bruce Springsteen");
1080    /// ```
1081    ///
1082    /// But this is a bit cumbersome to use all the time. Instead, a simple
1083    /// syntax is supported that expands `$name` into the corresponding capture
1084    /// group. Here's the last example, but using this expansion technique
1085    /// with named capture groups:
1086    ///
1087    /// ```rust
1088    /// # use fancy_regex::Regex;
1089    /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
1090    /// let result = re.replace("Springsteen, Bruce", "$first $last");
1091    /// assert_eq!(result, "Bruce Springsteen");
1092    /// ```
1093    ///
1094    /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
1095    /// would produce the same result. To write a literal `$` use `$$`.
1096    ///
1097    /// Sometimes the replacement string requires use of curly braces to
1098    /// delineate a capture group replacement and surrounding literal text.
1099    /// For example, if we wanted to join two words together with an
1100    /// underscore:
1101    ///
1102    /// ```rust
1103    /// # use fancy_regex::Regex;
1104    /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
1105    /// let result = re.replace("deep fried", "${first}_$second");
1106    /// assert_eq!(result, "deep_fried");
1107    /// ```
1108    ///
1109    /// Without the curly braces, the capture group name `first_` would be
1110    /// used, and since it doesn't exist, it would be replaced with the empty
1111    /// string.
1112    ///
1113    /// Finally, sometimes you just want to replace a literal string with no
1114    /// regard for capturing group expansion. This can be done by wrapping a
1115    /// byte string with `NoExpand`:
1116    ///
1117    /// ```rust
1118    /// # use fancy_regex::Regex;
1119    /// use fancy_regex::NoExpand;
1120    ///
1121    /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
1122    /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
1123    /// assert_eq!(result, "$2 $last");
1124    /// ```
1125    pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1126        self.replacen(text, 1, rep)
1127    }
1128
1129    /// Replaces all non-overlapping matches in `text` with the replacement
1130    /// provided. This is the same as calling `replacen` with `limit` set to
1131    /// `0`.
1132    ///
1133    /// See the documentation for `replace` for details on how to access
1134    /// capturing group matches in the replacement string.
1135    pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1136        self.replacen(text, 0, rep)
1137    }
1138
1139    /// Replaces at most `limit` non-overlapping matches in `text` with the
1140    /// replacement provided. If `limit` is 0, then all non-overlapping matches
1141    /// are replaced.
1142    ///
1143    /// Will panic if any errors are encountered. Use `try_replacen`, which this
1144    /// function unwraps, if you want to handle errors.
1145    ///
1146    /// See the documentation for `replace` for details on how to access
1147    /// capturing group matches in the replacement string.
1148    ///
1149    pub fn replacen<'t, R: Replacer>(&self, text: &'t str, limit: usize, rep: R) -> Cow<'t, str> {
1150        self.try_replacen(text, limit, rep).unwrap()
1151    }
1152
1153    /// Replaces at most `limit` non-overlapping matches in `text` with the
1154    /// replacement provided. If `limit` is 0, then all non-overlapping matches
1155    /// are replaced.
1156    ///
1157    /// Propagates any errors encountered, such as `RuntimeError::BacktrackLimitExceeded`.
1158    ///
1159    /// See the documentation for `replace` for details on how to access
1160    /// capturing group matches in the replacement string.
1161    pub fn try_replacen<'t, R: Replacer>(
1162        &self,
1163        text: &'t str,
1164        limit: usize,
1165        mut rep: R,
1166    ) -> Result<Cow<'t, str>> {
1167        // If we know that the replacement doesn't have any capture expansions,
1168        // then we can fast path. The fast path can make a tremendous
1169        // difference:
1170        //
1171        //   1) We use `find_iter` instead of `captures_iter`. Not asking for
1172        //      captures generally makes the regex engines faster.
1173        //   2) We don't need to look up all of the capture groups and do
1174        //      replacements inside the replacement string. We just push it
1175        //      at each match and be done with it.
1176        if let Some(rep) = rep.no_expansion() {
1177            let mut it = self.find_iter(text).enumerate().peekable();
1178            if it.peek().is_none() {
1179                return Ok(Cow::Borrowed(text));
1180            }
1181            let mut new = String::with_capacity(text.len());
1182            let mut last_match = 0;
1183            for (i, m) in it {
1184                let m = m?;
1185
1186                if limit > 0 && i >= limit {
1187                    break;
1188                }
1189                new.push_str(&text[last_match..m.start()]);
1190                new.push_str(&rep);
1191                last_match = m.end();
1192            }
1193            new.push_str(&text[last_match..]);
1194            return Ok(Cow::Owned(new));
1195        }
1196
1197        // The slower path, which we use if the replacement needs access to
1198        // capture groups.
1199        let mut it = self.captures_iter(text).enumerate().peekable();
1200        if it.peek().is_none() {
1201            return Ok(Cow::Borrowed(text));
1202        }
1203        let mut new = String::with_capacity(text.len());
1204        let mut last_match = 0;
1205        for (i, cap) in it {
1206            let cap = cap?;
1207
1208            if limit > 0 && i >= limit {
1209                break;
1210            }
1211            // unwrap on 0 is OK because captures only reports matches
1212            let m = cap.get(0).unwrap();
1213            new.push_str(&text[last_match..m.start()]);
1214            rep.replace_append(&cap, &mut new);
1215            last_match = m.end();
1216        }
1217        new.push_str(&text[last_match..]);
1218        Ok(Cow::Owned(new))
1219    }
1220
1221    /// Splits the string by matches of the regex.
1222    ///
1223    /// Returns an iterator over the substrings of the target string
1224    ///  that *aren't* matched by the regex.
1225    ///
1226    /// # Example
1227    ///
1228    /// To split a string delimited by arbitrary amounts of spaces or tabs:
1229    ///
1230    /// ```rust
1231    /// # use fancy_regex::Regex;
1232    /// let re = Regex::new(r"[ \t]+").unwrap();
1233    /// let target = "a b \t  c\td    e";
1234    /// let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
1235    /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
1236    /// ```
1237    pub fn split<'r, 'h>(&'r self, target: &'h str) -> Split<'r, 'h> {
1238        Split {
1239            matches: self.find_iter(target),
1240            next_start: 0,
1241            target,
1242        }
1243    }
1244
1245    /// Splits the string by matches of the regex at most `limit` times.
1246    ///
1247    /// Returns an iterator over the substrings of the target string
1248    /// that *aren't* matched by the regex.
1249    ///
1250    /// The `N`th substring is the remaining part of the target.
1251    ///
1252    /// # Example
1253    ///
1254    /// To split a string delimited by arbitrary amounts of spaces or tabs
1255    /// 3 times:
1256    ///
1257    /// ```rust
1258    /// # use fancy_regex::Regex;
1259    /// let re = Regex::new(r"[ \t]+").unwrap();
1260    /// let target = "a b \t  c\td    e";
1261    /// let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
1262    /// assert_eq!(fields, vec!["a", "b", "c\td    e"]);
1263    /// ```
1264    pub fn splitn<'r, 'h>(&'r self, target: &'h str, limit: usize) -> SplitN<'r, 'h> {
1265        SplitN {
1266            splits: self.split(target),
1267            limit: limit,
1268        }
1269    }
1270}
1271
1272impl TryFrom<&str> for Regex {
1273    type Error = Error;
1274
1275    /// Attempts to parse a string into a regular expression
1276    fn try_from(s: &str) -> Result<Self> {
1277        Self::new(s)
1278    }
1279}
1280
1281impl TryFrom<String> for Regex {
1282    type Error = Error;
1283
1284    /// Attempts to parse a string into a regular expression
1285    fn try_from(s: String) -> Result<Self> {
1286        Self::new(&s)
1287    }
1288}
1289
1290impl<'t> Match<'t> {
1291    /// Returns the starting byte offset of the match in the text.
1292    #[inline]
1293    pub fn start(&self) -> usize {
1294        self.start
1295    }
1296
1297    /// Returns the ending byte offset of the match in the text.
1298    #[inline]
1299    pub fn end(&self) -> usize {
1300        self.end
1301    }
1302
1303    /// Returns the range over the starting and ending byte offsets of the match in text.
1304    #[inline]
1305    pub fn range(&self) -> Range<usize> {
1306        self.start..self.end
1307    }
1308
1309    /// Returns the matched text.
1310    #[inline]
1311    pub fn as_str(&self) -> &'t str {
1312        &self.text[self.start..self.end]
1313    }
1314
1315    /// Creates a new match from the given text and byte offsets.
1316    fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
1317        Match { text, start, end }
1318    }
1319}
1320
1321impl<'t> From<Match<'t>> for &'t str {
1322    fn from(m: Match<'t>) -> &'t str {
1323        m.as_str()
1324    }
1325}
1326
1327impl<'t> From<Match<'t>> for Range<usize> {
1328    fn from(m: Match<'t>) -> Range<usize> {
1329        m.range()
1330    }
1331}
1332
1333#[allow(clippy::len_without_is_empty)] // follow regex's API
1334impl<'t> Captures<'t> {
1335    /// Get the capture group by its index in the regex.
1336    ///
1337    /// If there is no match for that group or the index does not correspond to a group, `None` is
1338    /// returned. The index 0 returns the whole match.
1339    pub fn get(&self, i: usize) -> Option<Match<'t>> {
1340        match &self.inner {
1341            CapturesImpl::Wrap { text, locations } => locations.get_group(i).map(|span| Match {
1342                text,
1343                start: span.start,
1344                end: span.end,
1345            }),
1346            CapturesImpl::Fancy { text, ref saves } => {
1347                let slot = i * 2;
1348                if slot >= saves.len() {
1349                    return None;
1350                }
1351                let lo = saves[slot];
1352                if lo == usize::MAX {
1353                    return None;
1354                }
1355                let hi = saves[slot + 1];
1356                Some(Match {
1357                    text,
1358                    start: lo,
1359                    end: hi,
1360                })
1361            }
1362        }
1363    }
1364
1365    /// Returns the match for a named capture group.  Returns `None` the capture
1366    /// group did not match or if there is no group with the given name.
1367    pub fn name(&self, name: &str) -> Option<Match<'t>> {
1368        self.named_groups.get(name).and_then(|i| self.get(*i))
1369    }
1370
1371    /// Expands all instances of `$group` in `replacement` to the corresponding
1372    /// capture group `name`, and writes them to the `dst` buffer given.
1373    ///
1374    /// `group` may be an integer corresponding to the index of the
1375    /// capture group (counted by order of opening parenthesis where `\0` is the
1376    /// entire match) or it can be a name (consisting of letters, digits or
1377    /// underscores) corresponding to a named capture group.
1378    ///
1379    /// If `group` isn't a valid capture group (whether the name doesn't exist
1380    /// or isn't a valid index), then it is replaced with the empty string.
1381    ///
1382    /// The longest possible name is used. e.g., `$1a` looks up the capture
1383    /// group named `1a` and not the capture group at index `1`. To exert more
1384    /// precise control over the name, use braces, e.g., `${1}a`.
1385    ///
1386    /// To write a literal `$`, use `$$`.
1387    ///
1388    /// For more control over expansion, see [`Expander`].
1389    ///
1390    /// [`Expander`]: expand/struct.Expander.html
1391    pub fn expand(&self, replacement: &str, dst: &mut String) {
1392        Expander::default().append_expansion(dst, replacement, self);
1393    }
1394
1395    /// Iterate over the captured groups in order in which they appeared in the regex. The first
1396    /// capture corresponds to the whole match.
1397    pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
1398        SubCaptureMatches { caps: self, i: 0 }
1399    }
1400
1401    /// How many groups were captured. This is always at least 1 because group 0 returns the whole
1402    /// match.
1403    pub fn len(&self) -> usize {
1404        match &self.inner {
1405            CapturesImpl::Wrap { locations, .. } => locations.group_len(),
1406            CapturesImpl::Fancy { saves, .. } => saves.len() / 2,
1407        }
1408    }
1409}
1410
1411/// Get a group by index.
1412///
1413/// `'t` is the lifetime of the matched text.
1414///
1415/// The text can't outlive the `Captures` object if this method is
1416/// used, because of how `Index` is defined (normally `a[i]` is part
1417/// of `a` and can't outlive it); to do that, use `get()` instead.
1418///
1419/// # Panics
1420///
1421/// If there is no group at the given index.
1422impl<'t> Index<usize> for Captures<'t> {
1423    type Output = str;
1424
1425    fn index(&self, i: usize) -> &str {
1426        self.get(i)
1427            .map(|m| m.as_str())
1428            .unwrap_or_else(|| panic!("no group at index '{}'", i))
1429    }
1430}
1431
1432/// Get a group by name.
1433///
1434/// `'t` is the lifetime of the matched text and `'i` is the lifetime
1435/// of the group name (the index).
1436///
1437/// The text can't outlive the `Captures` object if this method is
1438/// used, because of how `Index` is defined (normally `a[i]` is part
1439/// of `a` and can't outlive it); to do that, use `name` instead.
1440///
1441/// # Panics
1442///
1443/// If there is no group named by the given value.
1444impl<'t, 'i> Index<&'i str> for Captures<'t> {
1445    type Output = str;
1446
1447    fn index<'a>(&'a self, name: &'i str) -> &'a str {
1448        self.name(name)
1449            .map(|m| m.as_str())
1450            .unwrap_or_else(|| panic!("no group named '{}'", name))
1451    }
1452}
1453
1454impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
1455    type Item = Option<Match<'t>>;
1456
1457    fn next(&mut self) -> Option<Option<Match<'t>>> {
1458        if self.i < self.caps.len() {
1459            let result = self.caps.get(self.i);
1460            self.i += 1;
1461            Some(result)
1462        } else {
1463            None
1464        }
1465    }
1466}
1467
1468// TODO: might be nice to implement ExactSizeIterator etc for SubCaptures
1469
1470/// Regular expression AST. This is public for now but may change.
1471#[derive(Debug, PartialEq, Eq)]
1472pub enum Expr {
1473    /// An empty expression, e.g. the last branch in `(a|b|)`
1474    Empty,
1475    /// Any character, regex `.`
1476    Any {
1477        /// Whether it also matches newlines or not
1478        newline: bool,
1479    },
1480    /// An assertion
1481    Assertion(Assertion),
1482    /// The string as a literal, e.g. `a`
1483    Literal {
1484        /// The string to match
1485        val: String,
1486        /// Whether match is case-insensitive or not
1487        casei: bool,
1488    },
1489    /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of
1490    /// the literal `a` and `.` for any character
1491    Concat(Vec<Expr>),
1492    /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative
1493    /// where either the literal `a` or `b` must match
1494    Alt(Vec<Expr>),
1495    /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures"
1496    /// (remembers) the match
1497    Group(Box<Expr>),
1498    /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g.
1499    /// `(?=a)` means the next character must be `a` (but the match is not consumed)
1500    LookAround(Box<Expr>, LookAround),
1501    /// Repeat of an expression, e.g. `a*` or `a+` or `a{1,3}`
1502    Repeat {
1503        /// The expression that is being repeated
1504        child: Box<Expr>,
1505        /// The minimum number of repetitions
1506        lo: usize,
1507        /// The maximum number of repetitions (or `usize::MAX`)
1508        hi: usize,
1509        /// Greedy means as much as possible is matched, e.g. `.*b` would match all of `abab`.
1510        /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`.
1511        greedy: bool,
1512    },
1513    /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have
1514    /// to represent all the expressions in the AST, e.g. character classes.
1515    Delegate {
1516        /// The regex
1517        inner: String,
1518        /// How many characters the regex matches
1519        size: usize, // TODO: move into analysis result
1520        /// Whether the matching is case-insensitive or not
1521        casei: bool,
1522    },
1523    /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group
1524    /// and the whole regex matches either `abcabc` or `defdef`.
1525    Backref {
1526        /// The capture group number being referenced
1527        group: usize,
1528        /// Whether the matching is case-insensitive or not
1529        casei: bool,
1530    },
1531    /// Back reference to a capture group at the given specified relative recursion level.
1532    BackrefWithRelativeRecursionLevel {
1533        /// The capture group number being referenced
1534        group: usize,
1535        /// Relative recursion level
1536        relative_level: isize,
1537        /// Whether the matching is case-insensitive or not
1538        casei: bool,
1539    },
1540    /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and
1541    /// never backtrack and try `a`, even if matching fails after the atomic group.
1542    AtomicGroup(Box<Expr>),
1543    /// Keep matched text so far out of overall match
1544    KeepOut,
1545    /// Anchor to match at the position where the previous match ended
1546    ContinueFromPreviousMatchEnd,
1547    /// Conditional expression based on whether the numbered capture group matched or not
1548    BackrefExistsCondition(usize),
1549    /// If/Then/Else Condition. If there is no Then/Else, these will just be empty expressions.
1550    Conditional {
1551        /// The conditional expression to evaluate
1552        condition: Box<Expr>,
1553        /// What to execute if the condition is true
1554        true_branch: Box<Expr>,
1555        /// What to execute if the condition is false
1556        false_branch: Box<Expr>,
1557    },
1558    /// Subroutine call to the specified group number
1559    SubroutineCall(usize),
1560    /// Unresolved subroutine call to the specified group name
1561    UnresolvedNamedSubroutineCall {
1562        /// The capture group name
1563        name: String,
1564        /// The position in the original regex pattern where the subroutine call is made
1565        ix: usize,
1566    },
1567}
1568
1569/// Type of look-around assertion as used for a look-around expression.
1570#[derive(Debug, PartialEq, Eq, Clone, Copy)]
1571pub enum LookAround {
1572    /// Look-ahead assertion, e.g. `(?=a)`
1573    LookAhead,
1574    /// Negative look-ahead assertion, e.g. `(?!a)`
1575    LookAheadNeg,
1576    /// Look-behind assertion, e.g. `(?<=a)`
1577    LookBehind,
1578    /// Negative look-behind assertion, e.g. `(?<!a)`
1579    LookBehindNeg,
1580}
1581
1582/// An iterator over capture names in a [Regex].  The iterator
1583/// returns the name of each group, or [None] if the group has
1584/// no name.  Because capture group 0 cannot have a name, the
1585/// first item returned is always [None].
1586pub struct CaptureNames<'r>(vec::IntoIter<Option<&'r str>>);
1587
1588impl Debug for CaptureNames<'_> {
1589    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
1590        f.write_str("<CaptureNames>")
1591    }
1592}
1593
1594impl<'r> Iterator for CaptureNames<'r> {
1595    type Item = Option<&'r str>;
1596
1597    fn next(&mut self) -> Option<Self::Item> {
1598        self.0.next()
1599    }
1600}
1601
1602// silly to write my own, but this is super-fast for the common 1-digit
1603// case.
1604fn push_usize(s: &mut String, x: usize) {
1605    if x >= 10 {
1606        push_usize(s, x / 10);
1607        s.push((b'0' + (x % 10) as u8) as char);
1608    } else {
1609        s.push((b'0' + (x as u8)) as char);
1610    }
1611}
1612
1613fn is_special(c: char) -> bool {
1614    match c {
1615        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
1616        | '#' => true,
1617        _ => false,
1618    }
1619}
1620
1621fn push_quoted(buf: &mut String, s: &str) {
1622    for c in s.chars() {
1623        if is_special(c) {
1624            buf.push('\\');
1625        }
1626        buf.push(c);
1627    }
1628}
1629
1630/// Escapes special characters in `text` with '\\'.  Returns a string which, when interpreted
1631/// as a regex, matches exactly `text`.
1632pub fn escape(text: &str) -> Cow<str> {
1633    // Using bytes() is OK because all special characters are single bytes.
1634    match text.bytes().filter(|&b| is_special(b as char)).count() {
1635        0 => Cow::Borrowed(text),
1636        n => {
1637            // The capacity calculation is exact because '\\' is a single byte.
1638            let mut buf = String::with_capacity(text.len() + n);
1639            push_quoted(&mut buf, text);
1640            Cow::Owned(buf)
1641        }
1642    }
1643}
1644
1645/// Type of assertions
1646#[derive(Debug, PartialEq, Eq, Clone, Copy)]
1647pub enum Assertion {
1648    /// Start of input text
1649    StartText,
1650    /// End of input text
1651    EndText,
1652    /// Start of a line
1653    StartLine {
1654        /// CRLF mode
1655        crlf: bool,
1656    },
1657    /// End of a line
1658    EndLine {
1659        /// CRLF mode
1660        crlf: bool,
1661    },
1662    /// Left word boundary
1663    LeftWordBoundary,
1664    /// Right word boundary
1665    RightWordBoundary,
1666    /// Both word boundaries
1667    WordBoundary,
1668    /// Not word boundary
1669    NotWordBoundary,
1670}
1671
1672impl Assertion {
1673    pub(crate) fn is_hard(&self) -> bool {
1674        use Assertion::*;
1675        matches!(
1676            self,
1677            // these will make regex-automata use PikeVM
1678            LeftWordBoundary | RightWordBoundary | WordBoundary | NotWordBoundary
1679        )
1680    }
1681}
1682
1683impl Expr {
1684    /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
1685    /// that are referenced by backrefs.
1686    pub fn parse_tree(re: &str) -> Result<ExprTree> {
1687        Parser::parse(re)
1688    }
1689
1690    /// Parse the regex and return an expression (AST)
1691    /// Flags should be bit based based on flags
1692    pub fn parse_tree_with_flags(re: &str, flags: u32) -> Result<ExprTree> {
1693        Parser::parse_with_flags(re, flags)
1694    }
1695
1696    /// Convert expression to a regex string in the regex crate's syntax.
1697    ///
1698    /// # Panics
1699    ///
1700    /// Panics for expressions that are hard, i.e. can not be handled by the regex crate.
1701    pub fn to_str(&self, buf: &mut String, precedence: u8) {
1702        match *self {
1703            Expr::Empty => (),
1704            Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
1705            Expr::Literal { ref val, casei } => {
1706                if casei {
1707                    buf.push_str("(?i:");
1708                }
1709                push_quoted(buf, val);
1710                if casei {
1711                    buf.push_str(")");
1712                }
1713            }
1714            Expr::Assertion(Assertion::StartText) => buf.push('^'),
1715            Expr::Assertion(Assertion::EndText) => buf.push('$'),
1716            Expr::Assertion(Assertion::StartLine { crlf: false }) => buf.push_str("(?m:^)"),
1717            Expr::Assertion(Assertion::EndLine { crlf: false }) => buf.push_str("(?m:$)"),
1718            Expr::Assertion(Assertion::StartLine { crlf: true }) => buf.push_str("(?Rm:^)"),
1719            Expr::Assertion(Assertion::EndLine { crlf: true }) => buf.push_str("(?Rm:$)"),
1720            Expr::Concat(ref children) => {
1721                if precedence > 1 {
1722                    buf.push_str("(?:");
1723                }
1724                for child in children {
1725                    child.to_str(buf, 2);
1726                }
1727                if precedence > 1 {
1728                    buf.push(')')
1729                }
1730            }
1731            Expr::Alt(ref children) => {
1732                if precedence > 0 {
1733                    buf.push_str("(?:");
1734                }
1735                for (i, child) in children.iter().enumerate() {
1736                    if i != 0 {
1737                        buf.push('|');
1738                    }
1739                    child.to_str(buf, 1);
1740                }
1741                if precedence > 0 {
1742                    buf.push(')');
1743                }
1744            }
1745            Expr::Group(ref child) => {
1746                buf.push('(');
1747                child.to_str(buf, 0);
1748                buf.push(')');
1749            }
1750            Expr::Repeat {
1751                ref child,
1752                lo,
1753                hi,
1754                greedy,
1755            } => {
1756                if precedence > 2 {
1757                    buf.push_str("(?:");
1758                }
1759                child.to_str(buf, 3);
1760                match (lo, hi) {
1761                    (0, 1) => buf.push('?'),
1762                    (0, usize::MAX) => buf.push('*'),
1763                    (1, usize::MAX) => buf.push('+'),
1764                    (lo, hi) => {
1765                        buf.push('{');
1766                        push_usize(buf, lo);
1767                        if lo != hi {
1768                            buf.push(',');
1769                            if hi != usize::MAX {
1770                                push_usize(buf, hi);
1771                            }
1772                        }
1773                        buf.push('}');
1774                    }
1775                }
1776                if !greedy {
1777                    buf.push('?');
1778                }
1779                if precedence > 2 {
1780                    buf.push(')');
1781                }
1782            }
1783            Expr::Delegate {
1784                ref inner, casei, ..
1785            } => {
1786                // at the moment, delegate nodes are just atoms
1787                if casei {
1788                    buf.push_str("(?i:");
1789                }
1790                buf.push_str(inner);
1791                if casei {
1792                    buf.push_str(")");
1793                }
1794            }
1795            _ => panic!("attempting to format hard expr"),
1796        }
1797    }
1798}
1799
1800// precondition: ix > 0
1801fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
1802    let bytes = s.as_bytes();
1803    loop {
1804        ix -= 1;
1805        // fancy bit magic for ranges 0..0x80 + 0xc0..
1806        if (bytes[ix] as i8) >= -0x40 {
1807            break;
1808        }
1809    }
1810    ix
1811}
1812
1813fn codepoint_len(b: u8) -> usize {
1814    match b {
1815        b if b < 0x80 => 1,
1816        b if b < 0xe0 => 2,
1817        b if b < 0xf0 => 3,
1818        _ => 4,
1819    }
1820}
1821
1822/// Returns the smallest possible index of the next valid UTF-8 sequence
1823/// starting after `i`.
1824/// Adapted from a function with the same name in the `regex` crate.
1825fn next_utf8(text: &str, i: usize) -> usize {
1826    let b = match text.as_bytes().get(i) {
1827        None => return i + 1,
1828        Some(&b) => b,
1829    };
1830    i + codepoint_len(b)
1831}
1832
1833// If this returns false, then there is no possible backref in the re
1834
1835// Both potential implementations are turned off, because we currently
1836// always need to do a deeper analysis because of 1-character
1837// look-behind. If we could call a find_from_pos method of regex::Regex,
1838// it would make sense to bring this back.
1839/*
1840pub fn detect_possible_backref(re: &str) -> bool {
1841    let mut last = b'\x00';
1842    for b in re.as_bytes() {
1843        if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; }
1844        last = *b;
1845    }
1846    false
1847}
1848
1849pub fn detect_possible_backref(re: &str) -> bool {
1850    let mut bytes = re.as_bytes();
1851    loop {
1852        match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) {
1853            Some(i) => {
1854                bytes = &bytes[i + 1..];
1855                let c = bytes[0];
1856                if b'0' <= c && c <= b'9' { return true; }
1857            }
1858            None => return false
1859        }
1860    }
1861}
1862*/
1863
1864/// wrapper to search for re at arbitrary start position,
1865/// and to capture the match bounds
1866pub fn wrap_tree(raw_tree: ExprTree) -> ExprTree {
1867    return ExprTree {
1868        expr: Expr::Concat(vec![
1869            Expr::Repeat {
1870                child: Box::new(Expr::Any { newline: true }),
1871                lo: 0,
1872                hi: usize::MAX,
1873                greedy: false,
1874            },
1875            Expr::Group(Box::new(raw_tree.expr)),
1876        ]),
1877        ..raw_tree
1878    };
1879}
1880
1881/// The internal module only exists so that the toy example can access internals for debugging and
1882/// experimenting.
1883#[doc(hidden)]
1884pub mod internal {
1885    pub use crate::analyze::analyze;
1886    pub use crate::compile::compile;
1887    pub use crate::vm::{run_default, run_trace, Insn, Prog};
1888}
1889
1890#[cfg(test)]
1891mod tests {
1892    use alloc::borrow::Cow;
1893    use alloc::boxed::Box;
1894    use alloc::string::String;
1895    use alloc::{format, vec};
1896
1897    use crate::parse::make_literal;
1898    use crate::{Expr, Regex};
1899
1900    //use detect_possible_backref;
1901
1902    // tests for to_str
1903
1904    fn to_str(e: Expr) -> String {
1905        let mut s = String::new();
1906        e.to_str(&mut s, 0);
1907        s
1908    }
1909
1910    #[test]
1911    fn to_str_concat_alt() {
1912        let e = Expr::Concat(vec![
1913            Expr::Alt(vec![make_literal("a"), make_literal("b")]),
1914            make_literal("c"),
1915        ]);
1916        assert_eq!(to_str(e), "(?:a|b)c");
1917    }
1918
1919    #[test]
1920    fn to_str_rep_concat() {
1921        let e = Expr::Repeat {
1922            child: Box::new(Expr::Concat(vec![make_literal("a"), make_literal("b")])),
1923            lo: 2,
1924            hi: 3,
1925            greedy: true,
1926        };
1927        assert_eq!(to_str(e), "(?:ab){2,3}");
1928    }
1929
1930    #[test]
1931    fn to_str_group_alt() {
1932        let e = Expr::Group(Box::new(Expr::Alt(vec![
1933            make_literal("a"),
1934            make_literal("b"),
1935        ])));
1936        assert_eq!(to_str(e), "(a|b)");
1937    }
1938
1939    #[test]
1940    fn as_str_debug() {
1941        let s = r"(a+)b\1";
1942        let regex = Regex::new(s).unwrap();
1943        assert_eq!(s, regex.as_str());
1944        assert_eq!(s, format!("{:?}", regex));
1945    }
1946
1947    #[test]
1948    fn display() {
1949        let s = r"(a+)b\1";
1950        let regex = Regex::new(s).unwrap();
1951        assert_eq!(s, format!("{}", regex));
1952    }
1953
1954    #[test]
1955    fn from_str() {
1956        let s = r"(a+)b\1";
1957        let regex = s.parse::<Regex>().unwrap();
1958        assert_eq!(regex.as_str(), s);
1959    }
1960
1961    #[test]
1962    fn to_str_repeat() {
1963        fn repeat(lo: usize, hi: usize, greedy: bool) -> Expr {
1964            Expr::Repeat {
1965                child: Box::new(make_literal("a")),
1966                lo,
1967                hi,
1968                greedy,
1969            }
1970        }
1971
1972        assert_eq!(to_str(repeat(2, 2, true)), "a{2}");
1973        assert_eq!(to_str(repeat(2, 2, false)), "a{2}?");
1974        assert_eq!(to_str(repeat(2, 3, true)), "a{2,3}");
1975        assert_eq!(to_str(repeat(2, 3, false)), "a{2,3}?");
1976        assert_eq!(to_str(repeat(2, usize::MAX, true)), "a{2,}");
1977        assert_eq!(to_str(repeat(2, usize::MAX, false)), "a{2,}?");
1978        assert_eq!(to_str(repeat(0, 1, true)), "a?");
1979        assert_eq!(to_str(repeat(0, 1, false)), "a??");
1980        assert_eq!(to_str(repeat(0, usize::MAX, true)), "a*");
1981        assert_eq!(to_str(repeat(0, usize::MAX, false)), "a*?");
1982        assert_eq!(to_str(repeat(1, usize::MAX, true)), "a+");
1983        assert_eq!(to_str(repeat(1, usize::MAX, false)), "a+?");
1984    }
1985
1986    #[test]
1987    fn escape() {
1988        // Check that strings that need no quoting are borrowed, and that non-special punctuation
1989        // is not quoted.
1990        match crate::escape("@foo") {
1991            Cow::Borrowed(s) => assert_eq!(s, "@foo"),
1992            _ => panic!("Value should be borrowed."),
1993        }
1994
1995        // Check typical usage.
1996        assert_eq!(crate::escape("fo*o").into_owned(), "fo\\*o");
1997
1998        // Check that multibyte characters are handled correctly.
1999        assert_eq!(crate::escape("fø*ø").into_owned(), "fø\\*ø");
2000    }
2001
2002    /*
2003    #[test]
2004    fn detect_backref() {
2005        assert_eq!(detect_possible_backref("a0a1a2"), false);
2006        assert_eq!(detect_possible_backref("a0a1\\a2"), false);
2007        assert_eq!(detect_possible_backref("a0a\\1a2"), true);
2008        assert_eq!(detect_possible_backref("a0a1a2\\"), false);
2009    }
2010    */
2011}
fancy_regex/lib.rs

fancy_regex/
lib.rs