fancy_regex_fork_pb/
lib.rs

1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*!
22An implementation of regexes, supporting a relatively rich set of features, including backreferences
23and lookaround.
24
25It builds on top of the excellent [regex] crate. If you are not
26familiar with it, make sure you read its documentation and maybe you don't even need fancy-regex.
27
28If your regex or parts of it does not use any special features, the matching is delegated to the
29regex crate. That means it has linear runtime. But if you use "fancy" features such as
30backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex
31can be slow and take exponential time to run because of what is called "catastrophic backtracking".
32This depends on the regex and the input.
33
34# Usage
35
36The API should feel very similar to the regex crate, and involves compiling a regex and then using
37it to find matches in text.
38
39## Example: Matching text
40
41An example with backreferences to check if a text consists of two identical words:
42
43```rust
44use fancy_regex::Regex;
45
46let re = Regex::new(r"^(\w+) (\1)$").unwrap();
47let result = re.is_match("foo foo");
48
49assert!(result.is_ok());
50let did_match = result.unwrap();
51assert!(did_match);
52```
53
54Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the
55entire input text.
56
57## Example: Finding the position of matches
58
59```rust
60use fancy_regex::Regex;
61
62let re = Regex::new(r"(\d)\1").unwrap();
63let result = re.find("foo 22");
64
65assert!(result.is_ok(), "execution was successful");
66let match_option = result.unwrap();
67
68assert!(match_option.is_some(), "found a match");
69let m = match_option.unwrap();
70
71assert_eq!(m.start(), 4);
72assert_eq!(m.end(), 6);
73assert_eq!(m.as_str(), "22");
74```
75
76## Example: Capturing groups
77
78```rust
79use fancy_regex::Regex;
80
81let re = Regex::new(r"(?<!AU)\$(\d+)").unwrap();
82let result = re.captures("AU$10, $20");
83
84let captures = result.expect("Error running regex").expect("No match found");
85let group = captures.get(1).expect("No group");
86assert_eq!(group.as_str(), "20");
87```
88
89# Syntax
90
91The regex syntax is based on the [regex] crate's, with some additional supported syntax. Escapes:
92
93```norun
94\h    hex digit ([0-9A-Fa-f])
95\H    not hex digit ([^0-9A-Fa-f])
96\e    escape control character (\x1B)
97```
98
99Backreferences:
100
101```norun
102\1    match the exact string that the first capture group matched
103\2    backref to the second capture group, etc
104```
105
106Look-around assertions for matching without changing the current position:
107
108```norun
109(?=exp)    look-ahead, succeeds if exp matches to the right of the current position
110(?!exp)    negative look-ahead, succeeds if exp doesn't match to the right
111(?<=exp)   look-behind, succeeds if exp matches to the left of the current position
112(?<!exp)   negative look-behind, succeeds if exp doesn't match to the left
113```
114
115Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
116
117```
118# use fancy_regex::Regex;
119let re = Regex::new(r"^a(?>bc|b)c$").unwrap();
120assert!(re.is_match("abcc").unwrap());
121// Doesn't match because `|b` is never tried because of the atomic group
122assert!(!re.is_match("abc").unwrap());
123```
124
125[regex]: https://crates.io/crates/regex
126*/
127
128#![doc(html_root_url = "https://docs.rs/fancy-regex/0.3.1")]
129#![deny(missing_docs)]
130#![deny(missing_debug_implementations)]
131
132use bit_set::BitSet;
133use std::fmt;
134use std::usize;
135
136mod analyze;
137mod compile;
138mod error;
139mod parse;
140mod vm;
141
142use crate::analyze::analyze;
143use crate::compile::compile;
144use crate::parse::Parser;
145use crate::vm::Prog;
146
147pub use crate::error::{Error, Result};
148
149const MAX_RECURSION: usize = 64;
150
151// the public API
152
153/// A builder for a `Regex` to allow configuring options.
154#[derive(Debug)]
155pub struct RegexBuilder(RegexOptions);
156
157/// A compiled regular expression.
158pub struct Regex(RegexImpl);
159
160// Separate enum because we don't want to expose any of this
161enum RegexImpl {
162    // Do we want to box this? It's pretty big...
163    Wrap {
164        inner: regex::Regex,
165        inner1: Option<Box<regex::Regex>>,
166        options: RegexOptions,
167    },
168    Fancy {
169        prog: Prog,
170        n_groups: usize,
171        options: RegexOptions,
172    },
173}
174
175/// A single match of a regex or group in an input text
176#[derive(Copy, Clone, Debug, Eq, PartialEq)]
177pub struct Match<'t> {
178    text: &'t str,
179    start: usize,
180    end: usize,
181}
182
183/// A set of capture groups found for a regex.
184#[derive(Debug)]
185pub struct Captures<'t>(CapturesImpl<'t>);
186
187#[derive(Debug)]
188enum CapturesImpl<'t> {
189    Wrap {
190        text: &'t str,
191        inner: regex::Captures<'t>,
192
193        // starting position, in _from_pos variants
194        offset: usize,
195
196        enclosing_groups: usize,
197    },
198    Fancy {
199        text: &'t str,
200        saves: Vec<usize>,
201    },
202}
203
204/// Iterator for captured groups in order in which they appear in the regex.
205#[derive(Debug)]
206pub struct SubCaptureMatches<'c, 't> {
207    caps: &'c Captures<'t>,
208    i: usize,
209}
210
211#[derive(Clone, Debug)]
212struct RegexOptions {
213    pattern: String,
214    backtrack_limit: usize,
215    delegate_size_limit: Option<usize>,
216    delegate_dfa_size_limit: Option<usize>,
217    case_insensitive: bool,
218    multi_line: bool,
219    dot_matches_new_line: bool,
220    unicode: bool,
221}
222
223impl Default for RegexOptions {
224    fn default() -> Self {
225        RegexOptions {
226            pattern: String::new(),
227            backtrack_limit: 1_000_000,
228            delegate_size_limit: None,
229            delegate_dfa_size_limit: None,
230            case_insensitive: false,
231            multi_line: false,
232            dot_matches_new_line: false,
233            unicode: false,
234        }
235    }
236}
237
238impl RegexBuilder {
239    /// Create a new regex builder with a regex pattern.
240    ///
241    /// If the pattern is invalid, the call to `build` will fail later.
242    pub fn new(pattern: &str) -> Self {
243        let mut builder = RegexBuilder(RegexOptions::default());
244        if builder.0.case_insensitive || builder.0.multi_line || builder.0.dot_matches_new_line || builder.0.unicode {
245            let flags = format!(
246                "{}{}{}{}",
247                if builder.0.case_insensitive { "i" } else { "" },
248                if builder.0.multi_line { "m" } else { "" },
249                if builder.0.dot_matches_new_line { "s" } else { "" },
250                if builder.0.unicode { "u" } else { "" },
251            );
252            builder.0.pattern = format!("(?{}){}", &flags, pattern);
253        } else {
254            builder.0.pattern = pattern.to_string();
255        }
256        builder
257    }
258
259    /// Build the `Regex`.
260    ///
261    /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
262    pub fn build(&self) -> Result<Regex> {
263        Regex::new_options(self.0.clone())
264    }
265
266    /// Limit for how many times backtracking should be attempted for fancy regexes (where
267    /// backtracking is used). If this limit is exceeded, execution returns an error with
268    /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded).
269    /// This is for preventing a regex with catastrophic backtracking to run for too long.
270    ///
271    /// Default is `1_000_000` (1 million).
272    pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
273        self.0.backtrack_limit = limit;
274        self
275    }
276
277    /// Set the approximate size limit of the compiled regular expression.
278    ///
279    /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
280    /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
281    /// such the actual limit is closer to `<number of delegated regexes> * delegate_size_limit`.
282    pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
283        self.0.delegate_size_limit = Some(limit);
284        self
285    }
286
287    /// Set the approximate size of the cache used by the DFA.
288    ///
289    /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
290    /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
291    /// such the actual limit is closer to `<number of delegated regexes> *
292    /// delegate_dfa_size_limit`.
293    pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
294        self.0.delegate_dfa_size_limit = Some(limit);
295        self
296    }
297
298    /// Set the case insensitivity flag.
299    pub fn case_insensitive(&mut self, value: bool) -> &mut Self {
300        self.0.case_insensitive = value;
301        self
302    }
303
304    /// Set the multi-line flag.
305    pub fn multi_line(&mut self, value: bool) -> &mut Self {
306        self.0.multi_line = value;
307        self
308    }
309
310    /// Set the dot-matches-new-line flag.
311    pub fn dot_matches_new_line(&mut self, value: bool) -> &mut Self {
312        self.0.dot_matches_new_line = value;
313        self
314    }
315
316    /// Set the unicode flag.
317    pub fn unicode(&mut self, value: bool) -> &mut Self {
318        self.0.unicode = value;
319        self
320    }
321}
322
323impl fmt::Debug for Regex {
324    /// Shows the original regular expression.
325    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
326        write!(f, "{}", self.as_str())
327    }
328}
329
330impl Regex {
331    /// Parse and compile a regex with default options, see `RegexBuilder`.
332    ///
333    /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
334    pub fn new(re: &str) -> Result<Regex> {
335        let options = RegexOptions {
336            pattern: re.to_string(),
337            ..RegexOptions::default()
338        };
339        Self::new_options(options)
340    }
341
342    fn new_options(options: RegexOptions) -> Result<Regex> {
343        let (raw_e, backrefs) = Expr::parse(&options.pattern)?;
344
345        // wrapper to search for re at arbitrary start position,
346        // and to capture the match bounds
347        let e = Expr::Concat(vec![
348            Expr::Repeat {
349                child: Box::new(Expr::Any { newline: true }),
350                lo: 0,
351                hi: usize::MAX,
352                greedy: false,
353            },
354            Expr::Group(Box::new(raw_e)),
355        ]);
356
357        let info = analyze(&e, &backrefs)?;
358
359        let inner_info = &info.children[1].children[0]; // references inner expr
360        if !inner_info.hard {
361            // easy case, wrap regex
362
363            // we do our own to_str because escapes are different
364            let mut re_cooked = String::new();
365            // same as raw_e above, but it was moved, so traverse to find it
366            let raw_e = match e {
367                Expr::Concat(ref v) => match v[1] {
368                    Expr::Group(ref child) => child,
369                    _ => unreachable!(),
370                },
371                _ => unreachable!(),
372            };
373            raw_e.to_str(&mut re_cooked, 0);
374            let inner = compile::compile_inner(&re_cooked, &options)?;
375            let inner1 = if inner_info.looks_left {
376                // create regex to handle 1-char look-behind
377                let re1 = ["^(?s:.)+?(", re_cooked.as_str(), ")"].concat();
378                let compiled = compile::compile_inner(&re1, &options)?;
379                Some(Box::new(compiled))
380            } else {
381                None
382            };
383            return Ok(Regex(RegexImpl::Wrap {
384                inner,
385                inner1,
386                options,
387            }));
388        }
389
390        let prog = compile(&info)?;
391        Ok(Regex(RegexImpl::Fancy {
392            prog,
393            n_groups: info.end_group,
394            options,
395        }))
396    }
397
398    /// Returns the original string of this regex.
399    pub fn as_str(&self) -> &str {
400        match &self.0 {
401            RegexImpl::Wrap { options, .. } => &options.pattern,
402            RegexImpl::Fancy { options, .. } => &options.pattern,
403        }
404    }
405
406    /// Check if the regex matches the input text.
407    ///
408    /// # Example
409    ///
410    /// Test if some text contains the same word twice:
411    ///
412    /// ```rust
413    /// # use fancy_regex::Regex;
414    ///
415    /// let re = Regex::new(r"(\w+) \1").unwrap();
416    /// assert!(re.is_match("mirror mirror on the wall").unwrap());
417    /// ```
418    pub fn is_match(&self, text: &str) -> Result<bool> {
419        match &self.0 {
420            RegexImpl::Wrap { ref inner, .. } => Ok(inner.is_match(text)),
421            RegexImpl::Fancy {
422                ref prog, options, ..
423            } => {
424                let result = vm::run(prog, text, 0, 0, options)?;
425                Ok(result.is_some())
426            }
427        }
428    }
429
430    /// Find the first match in the input text.
431    ///
432    /// If you have capturing groups in your regex that you want to extract, use the [captures()]
433    /// method.
434    ///
435    /// # Example
436    ///
437    /// Find a word that is followed by an exclamation point:
438    ///
439    /// ```rust
440    /// # use fancy_regex::Regex;
441    ///
442    /// let re = Regex::new(r"\w+(?=!)").unwrap();
443    /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy");
444    /// ```
445    pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
446        match &self.0 {
447            RegexImpl::Wrap { inner, .. } => Ok(inner
448                .find(text)
449                .map(|m| Match::new(text, m.start(), m.end()))),
450            RegexImpl::Fancy { prog, options, .. } => {
451                let result = vm::run(prog, text, 0, 0, options)?;
452                Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
453            }
454        }
455    }
456
457    /// Returns the capture groups for the first match in `text`.
458    ///
459    /// If no match is found, then `Ok(None)` is returned.
460    ///
461    /// # Examples
462    ///
463    /// Finding matches and capturing parts of the match:
464    ///
465    /// ```rust
466    /// # use fancy_regex::Regex;
467    ///
468    /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
469    /// let text = "The date was 2018-04-07";
470    /// let captures = re.captures(text).unwrap().unwrap();
471    ///
472    /// assert_eq!(captures.get(1).unwrap().as_str(), "2018");
473    /// assert_eq!(captures.get(2).unwrap().as_str(), "04");
474    /// assert_eq!(captures.get(3).unwrap().as_str(), "07");
475    /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07");
476    /// ```
477    pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
478        match &self.0 {
479            RegexImpl::Wrap { inner, .. } => Ok(inner.captures(text).map(|caps| {
480                Captures(CapturesImpl::Wrap {
481                    text,
482                    inner: caps,
483                    offset: 0,
484                    enclosing_groups: 0,
485                })
486            })),
487            RegexImpl::Fancy {
488                prog,
489                n_groups,
490                options,
491                ..
492            } => {
493                let result = vm::run(prog, text, 0, 0, options)?;
494                Ok(result.map(|mut saves| {
495                    saves.truncate(n_groups * 2);
496                    Captures(CapturesImpl::Fancy { text, saves })
497                }))
498            }
499        }
500    }
501
502    /// Returns the capture groups for the first match in `text`, starting from
503    /// the specified byte position `pos`.
504    ///
505    /// # Examples
506    ///
507    /// Finding captures starting at a position:
508    ///
509    /// ```
510    /// # use fancy_regex::Regex;
511    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
512    /// let text = "1 test 123\n2 foo";
513    /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap();
514    ///
515    /// let group = captures.get(1).unwrap();
516    /// assert_eq!(group.as_str(), "2");
517    /// assert_eq!(group.start(), 11);
518    /// assert_eq!(group.end(), 12);
519    /// ```
520    ///
521    /// Note that in some cases this is not the same as using the `captures`
522    /// methods and passing a slice of the string, see the capture that we get
523    /// when we do this:
524    ///
525    /// ```
526    /// # use fancy_regex::Regex;
527    /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
528    /// let text = "1 test 123\n2 foo";
529    /// let captures = re.captures(&text[7..]).unwrap().unwrap();
530    /// assert_eq!(captures.get(1).unwrap().as_str(), "123");
531    /// ```
532    ///
533    /// This matched the number "123" because it's at the beginning of the text
534    /// of the string slice.
535    ///
536    pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
537        match &self.0 {
538            RegexImpl::Wrap { inner, inner1, .. } => {
539                if inner1.is_none() || pos == 0 {
540                    let result = inner.captures(&text[pos..]);
541                    Ok(result.map(|caps| {
542                        Captures(CapturesImpl::Wrap {
543                            text,
544                            inner: caps,
545                            offset: pos,
546                            enclosing_groups: 0,
547                        })
548                    }))
549                } else {
550                    let ix = prev_codepoint_ix(text, pos);
551                    let inner1 = inner1.as_ref().unwrap();
552                    let result = inner1.captures(&text[ix..]);
553                    Ok(result.map(|caps| {
554                        Captures(CapturesImpl::Wrap {
555                            text,
556                            inner: caps,
557                            offset: ix,
558                            enclosing_groups: 1,
559                        })
560                    }))
561                }
562            }
563            RegexImpl::Fancy {
564                prog,
565                n_groups,
566                options,
567                ..
568            } => {
569                let result = vm::run(prog, text, pos, 0, options)?;
570                Ok(result.map(|mut saves| {
571                    saves.truncate(n_groups * 2);
572                    Captures(CapturesImpl::Fancy { text, saves })
573                }))
574            }
575        }
576    }
577
578    // for debugging only
579    #[doc(hidden)]
580    pub fn debug_print(&self) {
581        match &self.0 {
582            RegexImpl::Wrap { inner, .. } => println!("wrapped {:?}", inner),
583            RegexImpl::Fancy { prog, .. } => prog.debug_print(),
584        }
585    }
586}
587
588impl<'t> Match<'t> {
589    /// Returns the starting byte offset of the match in the text.
590    #[inline]
591    pub fn start(&self) -> usize {
592        self.start
593    }
594
595    /// Returns the ending byte offset of the match in the text.
596    #[inline]
597    pub fn end(&self) -> usize {
598        self.end
599    }
600
601    /// Returns the matched text.
602    #[inline]
603    pub fn as_str(&self) -> &'t str {
604        &self.text[self.start..self.end]
605    }
606
607    fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
608        Match { text, start, end }
609    }
610}
611
612impl<'t> Captures<'t> {
613    /// Get the capture group by its index in the regex.
614    ///
615    /// If there is no match for that group or the index does not correspond to a group, `None` is
616    /// returned. The index 0 returns the whole match.
617    pub fn get(&self, i: usize) -> Option<Match<'t>> {
618        match &self.0 {
619            CapturesImpl::Wrap {
620                text,
621                inner,
622                offset,
623                enclosing_groups,
624            } => inner.get(i + *enclosing_groups).map(|m| Match {
625                text,
626                start: m.start() + *offset,
627                end: m.end() + *offset,
628            }),
629            CapturesImpl::Fancy { text, ref saves } => {
630                let slot = i * 2;
631                if slot >= saves.len() {
632                    return None;
633                }
634                let lo = saves[slot];
635                if lo == std::usize::MAX {
636                    return None;
637                }
638                let hi = saves[slot + 1];
639                Some(Match {
640                    text,
641                    start: lo,
642                    end: hi,
643                })
644            }
645        }
646    }
647
648    /// Iterate over the captured groups in order in which they appeared in the regex. The first
649    /// capture corresponds to the whole match.
650    pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
651        SubCaptureMatches { caps: self, i: 0 }
652    }
653
654    /// How many groups were captured.
655    pub fn len(&self) -> usize {
656        match self.0 {
657            CapturesImpl::Wrap {
658                ref inner,
659                enclosing_groups,
660                ..
661            } => inner.len() - enclosing_groups,
662            CapturesImpl::Fancy { ref saves, .. } => saves.len() / 2,
663        }
664    }
665}
666
667impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
668    type Item = Option<Match<'t>>;
669
670    fn next(&mut self) -> Option<Option<Match<'t>>> {
671        if self.i < self.caps.len() {
672            let result = self.caps.get(self.i);
673            self.i += 1;
674            Some(result)
675        } else {
676            None
677        }
678    }
679}
680
681// TODO: might be nice to implement ExactSizeIterator etc for SubCaptures
682
683/// Regular expression AST. This is public for now but may change.
684#[derive(Debug, PartialEq, Eq)]
685pub enum Expr {
686    /// An empty expression, e.g. the last branch in `(a|b|)`
687    Empty,
688    /// Any character, regex `.`
689    Any {
690        /// Whether it also matches newlines or not
691        newline: bool,
692    },
693    /// Start of input text
694    StartText,
695    /// End of input text
696    EndText,
697    /// Start of a line
698    StartLine,
699    /// End of a line
700    EndLine,
701    /// The string as a literal, e.g. `a`
702    Literal {
703        /// The string to match
704        val: String,
705        /// Whether match is case-insensitive or not
706        casei: bool,
707    },
708    /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of
709    /// the literal `a` and `.` for any character
710    Concat(Vec<Expr>),
711    /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative
712    /// where either the literal `a` or `b` must match
713    Alt(Vec<Expr>),
714    /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures"
715    /// (remembers) the match
716    Group(Box<Expr>),
717    /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g.
718    /// `(?=a)` means the next character must be `a` (but the match is not consumed)
719    LookAround(Box<Expr>, LookAround),
720    /// Repeat of an expression, e.g. `a*` or `a+` or `a{1,3}`
721    Repeat {
722        /// The expression that is being repeated
723        child: Box<Expr>,
724        /// The minimum number of repetitions
725        lo: usize,
726        /// The maximum number of repetitions (or `usize::MAX`)
727        hi: usize,
728        /// Greedy means as much as possible is matched, e.g. `.*b` would match all of `abab`.
729        /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`.
730        greedy: bool,
731    },
732    /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have
733    /// to represent all the expressions in the AST, e.g. character classes.
734    Delegate {
735        /// The regex
736        inner: String,
737        /// How many characters the regex matches
738        size: usize, // TODO: move into analysis result
739        /// Whether the matching is case-insensitive or not
740        casei: bool,
741    },
742    /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group
743    /// and the whole regex matches either `abcabc` or `defdef`.
744    Backref(usize),
745    /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and
746    /// never backtrack and try `a`, even if matching fails after the atomic group.
747    AtomicGroup(Box<Expr>),
748}
749
750/// Type of look-around assertion as used for a look-around expression.
751#[derive(Debug, PartialEq, Eq, Clone, Copy)]
752pub enum LookAround {
753    /// Look-ahead assertion, e.g. `(?=a)`
754    LookAhead,
755    /// Negative look-ahead assertion, e.g. `(?!a)`
756    LookAheadNeg,
757    /// Look-behind assertion, e.g. `(?<=a)`
758    LookBehind,
759    /// Negative look-behind assertion, e.g. `(?<!a)`
760    LookBehindNeg,
761}
762
763// silly to write my own, but this is super-fast for the common 1-digit
764// case.
765fn push_usize(s: &mut String, x: usize) {
766    if x >= 10 {
767        push_usize(s, x / 10);
768        s.push((b'0' + (x % 10) as u8) as char);
769    } else {
770        s.push((b'0' + (x as u8)) as char);
771    }
772}
773
774fn push_quoted(buf: &mut String, s: &str) {
775    for c in s.chars() {
776        match c {
777            '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
778            | '#' => buf.push('\\'),
779            _ => (),
780        }
781        buf.push(c);
782    }
783}
784
785impl Expr {
786    /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
787    /// that are referenced by backrefs.
788    pub fn parse(re: &str) -> Result<(Expr, BitSet)> {
789        Parser::parse(re)
790    }
791
792    /// Convert expression to a regex string in the regex crate's syntax.
793    ///
794    /// # Panics
795    ///
796    /// Panics for expressions that are hard, i.e. can not be handled by the regex crate.
797    pub fn to_str(&self, buf: &mut String, precedence: u8) {
798        match *self {
799            Expr::Empty => (),
800            Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
801            Expr::Literal { ref val, casei } => {
802                if casei {
803                    buf.push_str("(?i:");
804                }
805                push_quoted(buf, val);
806                if casei {
807                    buf.push_str(")");
808                }
809            }
810            Expr::StartText => buf.push('^'),
811            Expr::EndText => buf.push('$'),
812            Expr::StartLine => buf.push_str("(?m:^)"),
813            Expr::EndLine => buf.push_str("(?m:$)"),
814            Expr::Concat(ref children) => {
815                if precedence > 1 {
816                    buf.push_str("(?:");
817                }
818                for child in children {
819                    child.to_str(buf, 2);
820                }
821                if precedence > 1 {
822                    buf.push(')')
823                }
824            }
825            Expr::Alt(ref children) => {
826                if precedence > 0 {
827                    buf.push_str("(?:");
828                }
829
830                let is_empty = |e: &Expr| match e {
831                    Expr::Empty => true,
832                    _ => false,
833                };
834                let contains_empty = children.iter().any(&is_empty);
835                if contains_empty {
836                    buf.push_str("(?:");
837                }
838                for (i, child) in children.iter().filter(|&c| !is_empty(c)).enumerate() {
839                    if i != 0 {
840                        buf.push('|');
841                    }
842                    child.to_str(buf, 1);
843                }
844                if contains_empty {
845                    // regex fails with `(a|b|)`, so transform to `((?:a|b)?)`
846                    buf.push_str(")?");
847                }
848
849                if precedence > 0 {
850                    buf.push(')');
851                }
852            }
853            Expr::Group(ref child) => {
854                buf.push('(');
855                child.to_str(buf, 0);
856                buf.push(')');
857            }
858            Expr::Repeat {
859                ref child,
860                lo,
861                hi,
862                greedy,
863            } => {
864                if precedence > 2 {
865                    buf.push_str("(?:");
866                }
867                child.to_str(buf, 3);
868                match (lo, hi) {
869                    (0, 1) => buf.push('?'),
870                    (0, usize::MAX) => buf.push('*'),
871                    (1, usize::MAX) => buf.push('+'),
872                    (lo, hi) => {
873                        buf.push('{');
874                        push_usize(buf, lo);
875                        if lo != hi {
876                            buf.push(',');
877                            if hi != usize::MAX {
878                                push_usize(buf, hi);
879                            }
880                        }
881                        buf.push('}');
882                    }
883                }
884                if !greedy {
885                    buf.push('?');
886                }
887                if precedence > 2 {
888                    buf.push(')');
889                }
890            }
891            Expr::Delegate {
892                ref inner, casei, ..
893            } => {
894                // at the moment, delegate nodes are just atoms
895                if casei {
896                    buf.push_str("(?i:");
897                }
898                buf.push_str(inner);
899                if casei {
900                    buf.push_str(")");
901                }
902            }
903            _ => panic!("attempting to format hard expr"),
904        }
905    }
906}
907
908// precondition: ix > 0
909fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
910    let bytes = s.as_bytes();
911    loop {
912        ix -= 1;
913        // fancy bit magic for ranges 0..0x80 + 0xc0..
914        if (bytes[ix] as i8) >= -0x40 {
915            break;
916        }
917    }
918    ix
919}
920
921fn codepoint_len(b: u8) -> usize {
922    match b {
923        b if b < 0x80 => 1,
924        b if b < 0xe0 => 2,
925        b if b < 0xf0 => 3,
926        _ => 4,
927    }
928}
929
930// If this returns false, then there is no possible backref in the re
931
932// Both potential implementations are turned off, because we currently
933// always need to do a deeper analysis because of 1-character
934// look-behind. If we could call a find_from_pos method of regex::Regex,
935// it would make sense to bring this back.
936/*
937pub fn detect_possible_backref(re: &str) -> bool {
938    let mut last = b'\x00';
939    for b in re.as_bytes() {
940        if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; }
941        last = *b;
942    }
943    false
944}
945
946pub fn detect_possible_backref(re: &str) -> bool {
947    let mut bytes = re.as_bytes();
948    loop {
949        match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) {
950            Some(i) => {
951                bytes = &bytes[i + 1..];
952                let c = bytes[0];
953                if b'0' <= c && c <= b'9' { return true; }
954            }
955            None => return false
956        }
957    }
958}
959*/
960
961/// The internal module only exists so that the toy example can access internals for debugging and
962/// experimenting.
963#[doc(hidden)]
964pub mod internal {
965    pub use crate::analyze::analyze;
966    pub use crate::compile::compile;
967    pub use crate::vm::{run_default, run_trace, Insn, Prog};
968}
969
970#[cfg(test)]
971mod tests {
972    use crate::parse::make_literal;
973    use crate::Expr;
974    use crate::Regex;
975    use std::usize;
976    //use detect_possible_backref;
977
978    // tests for to_str
979
980    fn to_str(e: Expr) -> String {
981        let mut s = String::new();
982        e.to_str(&mut s, 0);
983        s
984    }
985
986    #[test]
987    fn to_str_concat_alt() {
988        let e = Expr::Concat(vec![
989            Expr::Alt(vec![make_literal("a"), make_literal("b")]),
990            make_literal("c"),
991        ]);
992        assert_eq!(to_str(e), "(?:a|b)c");
993    }
994
995    #[test]
996    fn to_str_rep_concat() {
997        let e = Expr::Repeat {
998            child: Box::new(Expr::Concat(vec![make_literal("a"), make_literal("b")])),
999            lo: 2,
1000            hi: 3,
1001            greedy: true,
1002        };
1003        assert_eq!(to_str(e), "(?:ab){2,3}");
1004    }
1005
1006    #[test]
1007    fn to_str_group_alt() {
1008        let e = Expr::Group(Box::new(Expr::Alt(vec![
1009            make_literal("a"),
1010            make_literal("b"),
1011        ])));
1012        assert_eq!(to_str(e), "(a|b)");
1013    }
1014
1015    #[test]
1016    fn as_str_debug() {
1017        let s = r"(a+)b\1";
1018        let regex = Regex::new(s).unwrap();
1019        assert_eq!(s, regex.as_str());
1020        assert_eq!(s, format!("{:?}", regex));
1021    }
1022
1023    #[test]
1024    fn to_str_repeat() {
1025        fn repeat(lo: usize, hi: usize, greedy: bool) -> Expr {
1026            Expr::Repeat {
1027                child: Box::new(make_literal("a")),
1028                lo,
1029                hi,
1030                greedy,
1031            }
1032        }
1033
1034        assert_eq!(to_str(repeat(2, 2, true)), "a{2}");
1035        assert_eq!(to_str(repeat(2, 2, false)), "a{2}?");
1036        assert_eq!(to_str(repeat(2, 3, true)), "a{2,3}");
1037        assert_eq!(to_str(repeat(2, 3, false)), "a{2,3}?");
1038        assert_eq!(to_str(repeat(2, usize::MAX, true)), "a{2,}");
1039        assert_eq!(to_str(repeat(2, usize::MAX, false)), "a{2,}?");
1040        assert_eq!(to_str(repeat(0, 1, true)), "a?");
1041        assert_eq!(to_str(repeat(0, 1, false)), "a??");
1042        assert_eq!(to_str(repeat(0, usize::MAX, true)), "a*");
1043        assert_eq!(to_str(repeat(0, usize::MAX, false)), "a*?");
1044        assert_eq!(to_str(repeat(1, usize::MAX, true)), "a+");
1045        assert_eq!(to_str(repeat(1, usize::MAX, false)), "a+?");
1046    }
1047
1048    /*
1049    #[test]
1050    fn detect_backref() {
1051        assert_eq!(detect_possible_backref("a0a1a2"), false);
1052        assert_eq!(detect_possible_backref("a0a1\\a2"), false);
1053        assert_eq!(detect_possible_backref("a0a\\1a2"), true);
1054        assert_eq!(detect_possible_backref("a0a1a2\\"), false);
1055    }
1056    */
1057}