fancy_regex/lib.rs
1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*!
22An implementation of regexes, supporting a relatively rich set of features, including backreferences
23and lookaround.
24
25It builds on top of the excellent [regex] crate. If you are not
26familiar with it, make sure you read its documentation and maybe you don't even need fancy-regex.
27
28If your regex or parts of it does not use any special features, the matching is delegated to the
29regex crate. That means it has linear runtime. But if you use "fancy" features such as
30backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex
31can be slow and take exponential time to run because of what is called "catastrophic backtracking".
32This depends on the regex and the input.
33
34# Usage
35
36The API should feel very similar to the regex crate, and involves compiling a regex and then using
37it to find matches in text.
38
39## Example: Matching text
40
41An example with backreferences to check if a text consists of two identical words:
42
43```rust
44use fancy_regex::Regex;
45
46let re = Regex::new(r"^(\w+) (\1)$").unwrap();
47let result = re.is_match("foo foo");
48
49assert!(result.is_ok());
50let did_match = result.unwrap();
51assert!(did_match);
52```
53
54Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the
55entire input text.
56
57## Example: Finding the position of matches
58
59```rust
60use fancy_regex::Regex;
61
62let re = Regex::new(r"(\d)\1").unwrap();
63let result = re.find("foo 22");
64
65assert!(result.is_ok(), "execution was successful");
66let match_option = result.unwrap();
67
68assert!(match_option.is_some(), "found a match");
69let m = match_option.unwrap();
70
71assert_eq!(m.start(), 4);
72assert_eq!(m.end(), 6);
73assert_eq!(m.as_str(), "22");
74```
75
76## Example: Capturing groups
77
78```rust
79use fancy_regex::Regex;
80
81let re = Regex::new(r"(?<!AU)\$(\d+)").unwrap();
82let result = re.captures("AU$10, $20");
83
84let captures = result.expect("Error running regex").expect("No match found");
85let group = captures.get(1).expect("No group");
86assert_eq!(group.as_str(), "20");
87```
88
89## Example: Splitting text
90
91```rust
92use fancy_regex::Regex;
93
94let re = Regex::new(r"[ \t]+").unwrap();
95let target = "a b \t c\td e";
96let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
97assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
98
99let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
100assert_eq!(fields, vec!["a", "b", "c\td e"]);
101```
102
103# Syntax
104
105The regex syntax is based on the [regex] crate's, with some additional supported syntax.
106
107Escapes:
108
109`\h`
110: hex digit (`[0-9A-Fa-f]`) \
111`\H`
112: not hex digit (`[^0-9A-Fa-f]`) \
113`\e`
114: escape control character (`\x1B`) \
115`\K`
116: keep text matched so far out of the overall match ([docs](https://www.regular-expressions.info/keep.html))\
117`\G`
118: anchor to where the previous match ended ([docs](https://www.regular-expressions.info/continue.html))\
119`\Z`
120: anchor to the end of the text before any trailing newlines\
121`\O`
122: any character including newline
123
124Backreferences:
125
126`\1`
127: match the exact string that the first capture group matched \
128`\2`
129: backref to the second capture group, etc
130
131Named capture groups:
132
133`(?<name>exp)`
134: match *exp*, creating capture group named *name* \
135`\k<name>`
136: match the exact string that the capture group named *name* matched \
137`(?P<name>exp)`
138: same as `(?<name>exp)` for compatibility with Python, etc. \
139`(?P=name)`
140: same as `\k<name>` for compatibility with Python, etc.
141
142Look-around assertions for matching without changing the current position:
143
144`(?=exp)`
145: look-ahead, succeeds if *exp* matches to the right of the current position \
146`(?!exp)`
147: negative look-ahead, succeeds if *exp* doesn't match to the right \
148`(?<=exp)`
149: look-behind, succeeds if *exp* matches to the left of the current position \
150`(?<!exp)`
151: negative look-behind, succeeds if *exp* doesn't match to the left
152
153Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
154
155```
156# use fancy_regex::Regex;
157let re = Regex::new(r"^a(?>bc|b)c$").unwrap();
158assert!(re.is_match("abcc").unwrap());
159// Doesn't match because `|b` is never tried because of the atomic group
160assert!(!re.is_match("abc").unwrap());
161```
162
163Conditionals - if/then/else:
164
165`(?(1))`
166: continue only if first capture group matched \
167`(?(<name>))`
168: continue only if capture group named *name* matched \
169`(?(1)true_branch|false_branch)`
170: if the first capture group matched then execute the true_branch regex expression, else execute false_branch ([docs](https://www.regular-expressions.info/conditional.html)) \
171`(?(condition)true_branch|false_branch)`
172: if the condition matches then execute the true_branch regex expression, else execute false_branch from the point just before the condition was evaluated
173
174[regex]: https://crates.io/crates/regex
175*/
176
177#![deny(missing_docs)]
178#![deny(missing_debug_implementations)]
179#![cfg_attr(not(feature = "std"), no_std)]
180
181extern crate alloc;
182
183use alloc::borrow::Cow;
184use alloc::boxed::Box;
185use alloc::string::{String, ToString};
186use alloc::sync::Arc;
187use alloc::vec;
188use alloc::vec::Vec;
189
190use core::convert::TryFrom;
191use core::fmt::{Debug, Formatter};
192use core::ops::{Index, Range};
193use core::str::FromStr;
194use core::{fmt, usize};
195use regex_automata::meta::Regex as RaRegex;
196use regex_automata::util::captures::Captures as RaCaptures;
197use regex_automata::util::syntax::Config as SyntaxConfig;
198use regex_automata::Input as RaInput;
199
200mod analyze;
201mod compile;
202mod error;
203mod expand;
204mod flags;
205mod parse;
206mod replacer;
207mod vm;
208
209use crate::analyze::analyze;
210use crate::compile::compile;
211use crate::flags::*;
212use crate::parse::{ExprTree, NamedGroups, Parser};
213use crate::vm::{Prog, OPTION_SKIPPED_EMPTY_MATCH};
214
215pub use crate::error::{CompileError, Error, ParseError, Result, RuntimeError};
216pub use crate::expand::Expander;
217pub use crate::replacer::{NoExpand, Replacer, ReplacerRef};
218
219const MAX_RECURSION: usize = 64;
220
221// the public API
222
223/// A builder for a `Regex` to allow configuring options.
224#[derive(Debug)]
225pub struct RegexBuilder(RegexOptions);
226
227/// A compiled regular expression.
228#[derive(Clone)]
229pub struct Regex {
230 inner: RegexImpl,
231 named_groups: Arc<NamedGroups>,
232}
233
234// Separate enum because we don't want to expose any of this
235#[derive(Clone)]
236enum RegexImpl {
237 // Do we want to box this? It's pretty big...
238 Wrap {
239 inner: RaRegex,
240 options: RegexOptions,
241 },
242 Fancy {
243 prog: Prog,
244 n_groups: usize,
245 options: RegexOptions,
246 },
247}
248
249/// A single match of a regex or group in an input text
250#[derive(Copy, Clone, Debug, Eq, PartialEq)]
251pub struct Match<'t> {
252 text: &'t str,
253 start: usize,
254 end: usize,
255}
256
257/// An iterator over all non-overlapping matches for a particular string.
258///
259/// The iterator yields a `Result<Match>`. The iterator stops when no more
260/// matches can be found.
261///
262/// `'r` is the lifetime of the compiled regular expression and `'t` is the
263/// lifetime of the matched string.
264#[derive(Debug)]
265pub struct Matches<'r, 't> {
266 re: &'r Regex,
267 text: &'t str,
268 last_end: usize,
269 last_match: Option<usize>,
270}
271
272impl<'r, 't> Matches<'r, 't> {
273 /// Return the text being searched.
274 pub fn text(&self) -> &'t str {
275 self.text
276 }
277
278 /// Return the underlying regex.
279 pub fn regex(&self) -> &'r Regex {
280 &self.re
281 }
282}
283
284impl<'r, 't> Iterator for Matches<'r, 't> {
285 type Item = Result<Match<'t>>;
286
287 /// Adapted from the `regex` crate. Calls `find_from_pos` repeatedly.
288 /// Ignores empty matches immediately after a match.
289 fn next(&mut self) -> Option<Self::Item> {
290 if self.last_end > self.text.len() {
291 return None;
292 }
293
294 let option_flags = if let Some(last_match) = self.last_match {
295 if self.last_end > last_match {
296 OPTION_SKIPPED_EMPTY_MATCH
297 } else {
298 0
299 }
300 } else {
301 0
302 };
303 let mat =
304 match self
305 .re
306 .find_from_pos_with_option_flags(self.text, self.last_end, option_flags)
307 {
308 Err(error) => {
309 // Stop on first error: If an error is encountered, return it, and set the "last match position"
310 // to the string length, so that the next next() call will return None, to prevent an infinite loop.
311 self.last_end = self.text.len() + 1;
312 return Some(Err(error));
313 }
314 Ok(None) => return None,
315 Ok(Some(mat)) => mat,
316 };
317
318 if mat.start == mat.end {
319 // This is an empty match. To ensure we make progress, start
320 // the next search at the smallest possible starting position
321 // of the next match following this one.
322 self.last_end = next_utf8(self.text, mat.end);
323 // Don't accept empty matches immediately following a match.
324 // Just move on to the next match.
325 if Some(mat.end) == self.last_match {
326 return self.next();
327 }
328 } else {
329 self.last_end = mat.end;
330 }
331
332 self.last_match = Some(mat.end);
333
334 Some(Ok(mat))
335 }
336}
337
338/// An iterator that yields all non-overlapping capture groups matching a
339/// particular regular expression.
340///
341/// The iterator stops when no more matches can be found.
342///
343/// `'r` is the lifetime of the compiled regular expression and `'t` is the
344/// lifetime of the matched string.
345#[derive(Debug)]
346pub struct CaptureMatches<'r, 't>(Matches<'r, 't>);
347
348impl<'r, 't> CaptureMatches<'r, 't> {
349 /// Return the text being searched.
350 pub fn text(&self) -> &'t str {
351 self.0.text
352 }
353
354 /// Return the underlying regex.
355 pub fn regex(&self) -> &'r Regex {
356 &self.0.re
357 }
358}
359
360impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
361 type Item = Result<Captures<'t>>;
362
363 /// Adapted from the `regex` crate. Calls `captures_from_pos` repeatedly.
364 /// Ignores empty matches immediately after a match.
365 fn next(&mut self) -> Option<Self::Item> {
366 if self.0.last_end > self.0.text.len() {
367 return None;
368 }
369
370 let captures = match self.0.re.captures_from_pos(self.0.text, self.0.last_end) {
371 Err(error) => {
372 // Stop on first error: If an error is encountered, return it, and set the "last match position"
373 // to the string length, so that the next next() call will return None, to prevent an infinite loop.
374 self.0.last_end = self.0.text.len() + 1;
375 return Some(Err(error));
376 }
377 Ok(None) => return None,
378 Ok(Some(captures)) => captures,
379 };
380
381 let mat = captures
382 .get(0)
383 .expect("`Captures` is expected to have entire match at 0th position");
384 if mat.start == mat.end {
385 self.0.last_end = next_utf8(self.0.text, mat.end);
386 if Some(mat.end) == self.0.last_match {
387 return self.next();
388 }
389 } else {
390 self.0.last_end = mat.end;
391 }
392
393 self.0.last_match = Some(mat.end);
394
395 Some(Ok(captures))
396 }
397}
398
399/// A set of capture groups found for a regex.
400#[derive(Debug)]
401pub struct Captures<'t> {
402 inner: CapturesImpl<'t>,
403 named_groups: Arc<NamedGroups>,
404}
405
406#[derive(Debug)]
407enum CapturesImpl<'t> {
408 Wrap {
409 text: &'t str,
410 locations: RaCaptures,
411 },
412 Fancy {
413 text: &'t str,
414 saves: Vec<usize>,
415 },
416}
417
418/// Iterator for captured groups in order in which they appear in the regex.
419#[derive(Debug)]
420pub struct SubCaptureMatches<'c, 't> {
421 caps: &'c Captures<'t>,
422 i: usize,
423}
424
425/// An iterator over all substrings delimited by a regex.
426///
427/// This iterator yields `Result<&'h str>`, where each item is a substring of the
428/// target string that is delimited by matches of the regular expression. It stops when there
429/// are no more substrings to yield.
430///
431/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
432/// lifetime of the target string being split.
433///
434/// This iterator can be created by the [`Regex::split`] method.
435#[derive(Debug)]
436pub struct Split<'r, 'h> {
437 matches: Matches<'r, 'h>,
438 next_start: usize,
439 target: &'h str,
440}
441
442impl<'r, 'h> Iterator for Split<'r, 'h> {
443 type Item = Result<&'h str>;
444
445 /// Returns the next substring that results from splitting the target string by the regex.
446 ///
447 /// If no more matches are found, returns the remaining part of the string,
448 /// or `None` if all substrings have been yielded.
449 fn next(&mut self) -> Option<Result<&'h str>> {
450 match self.matches.next() {
451 None => {
452 let len = self.target.len();
453 if self.next_start > len {
454 // No more substrings to return
455 None
456 } else {
457 // Return the last part of the target string
458 // Next call will return None
459 let part = &self.target[self.next_start..len];
460 self.next_start = len + 1;
461 Some(Ok(part))
462 }
463 }
464 // Return the next substring
465 Some(Ok(m)) => {
466 let part = &self.target[self.next_start..m.start()];
467 self.next_start = m.end();
468 Some(Ok(part))
469 }
470 Some(Err(e)) => Some(Err(e)),
471 }
472 }
473}
474
475impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
476
477/// An iterator over at most `N` substrings delimited by a regex.
478///
479/// This iterator yields `Result<&'h str>`, where each item is a substring of the
480/// target that is delimited by matches of the regular expression. It stops either when
481/// there are no more substrings to yield, or after `N` substrings have been yielded.
482///
483/// The `N`th substring is the remaining part of the target.
484///
485/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
486/// lifetime of the target string being split.
487///
488/// This iterator can be created by the [`Regex::splitn`] method.
489#[derive(Debug)]
490pub struct SplitN<'r, 'h> {
491 splits: Split<'r, 'h>,
492 limit: usize,
493}
494
495impl<'r, 'h> Iterator for SplitN<'r, 'h> {
496 type Item = Result<&'h str>;
497
498 /// Returns the next substring resulting from splitting the target by the regex,
499 /// limited to `N` splits.
500 ///
501 /// Returns `None` if no more matches are found or if the limit is reached after yielding
502 /// the remaining part of the target.
503 fn next(&mut self) -> Option<Result<&'h str>> {
504 if self.limit == 0 {
505 // Limit reached. No more substrings available.
506 return None;
507 }
508
509 // Decrement the limit for each split.
510 self.limit -= 1;
511 if self.limit > 0 {
512 return self.splits.next();
513 }
514
515 // Nth split
516 let len = self.splits.target.len();
517 if self.splits.next_start > len {
518 // No more substrings available.
519 return None;
520 } else {
521 // Return the remaining part of the target
522 let start = self.splits.next_start;
523 self.splits.next_start = len + 1;
524 return Some(Ok(&self.splits.target[start..len]));
525 }
526 }
527
528 fn size_hint(&self) -> (usize, Option<usize>) {
529 (0, Some(self.limit))
530 }
531}
532
533impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
534
535#[derive(Clone, Debug)]
536struct RegexOptions {
537 pattern: String,
538 syntaxc: SyntaxConfig,
539 backtrack_limit: usize,
540 delegate_size_limit: Option<usize>,
541 delegate_dfa_size_limit: Option<usize>,
542}
543
544impl RegexOptions {
545 fn get_flag_value(flag_value: bool, enum_value: u32) -> u32 {
546 if flag_value {
547 enum_value
548 } else {
549 0
550 }
551 }
552
553 fn compute_flags(&self) -> u32 {
554 let insensitive = Self::get_flag_value(self.syntaxc.get_case_insensitive(), FLAG_CASEI);
555 let multiline = Self::get_flag_value(self.syntaxc.get_multi_line(), FLAG_MULTI);
556 let whitespace =
557 Self::get_flag_value(self.syntaxc.get_ignore_whitespace(), FLAG_IGNORE_SPACE);
558 let dotnl = Self::get_flag_value(self.syntaxc.get_dot_matches_new_line(), FLAG_DOTNL);
559 let unicode = Self::get_flag_value(self.syntaxc.get_unicode(), FLAG_UNICODE);
560
561 let all_flags = insensitive | multiline | whitespace | dotnl | unicode | unicode;
562 all_flags
563 }
564}
565
566impl Default for RegexOptions {
567 fn default() -> Self {
568 RegexOptions {
569 pattern: String::new(),
570 syntaxc: SyntaxConfig::default(),
571 backtrack_limit: 1_000_000,
572 delegate_size_limit: None,
573 delegate_dfa_size_limit: None,
574 }
575 }
576}
577
578impl RegexBuilder {
579 /// Create a new regex builder with a regex pattern.
580 ///
581 /// If the pattern is invalid, the call to `build` will fail later.
582 pub fn new(pattern: &str) -> Self {
583 let mut builder = RegexBuilder(RegexOptions::default());
584 builder.0.pattern = pattern.to_string();
585 builder
586 }
587
588 /// Build the `Regex`.
589 ///
590 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
591 pub fn build(&self) -> Result<Regex> {
592 Regex::new_options(self.0.clone())
593 }
594
595 fn set_config(&mut self, func: impl Fn(SyntaxConfig) -> SyntaxConfig) -> &mut Self {
596 self.0.syntaxc = func(self.0.syntaxc);
597 self
598 }
599
600 /// Override default case insensitive
601 /// this is to enable/disable casing via builder instead of a flag within
602 /// the raw string provided to the regex builder
603 ///
604 /// Default is false
605 pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
606 self.set_config(|x| x.case_insensitive(yes))
607 }
608
609 /// Enable multi-line regex
610 pub fn multi_line(&mut self, yes: bool) -> &mut Self {
611 self.set_config(|x| x.multi_line(yes))
612 }
613
614 /// Allow ignore whitespace
615 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
616 self.set_config(|x| x.ignore_whitespace(yes))
617 }
618
619 /// Enable or disable the "dot matches any character" flag.
620 /// When this is enabled, `.` will match any character. When it's disabled, then `.` will match any character
621 /// except for a new line character.
622 pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
623 self.set_config(|x| x.dot_matches_new_line(yes))
624 }
625
626 /// Enable verbose mode in the regular expression.
627 ///
628 /// The same as ignore_whitespace
629 ///
630 /// When enabled, verbose mode permits insigificant whitespace in many
631 /// places in the regular expression, as well as comments. Comments are
632 /// started using `#` and continue until the end of the line.
633 ///
634 /// By default, this is disabled. It may be selectively enabled in the
635 /// regular expression by using the `x` flag regardless of this setting.
636 pub fn verbose_mode(&mut self, yes: bool) -> &mut Self {
637 self.set_config(|x| x.ignore_whitespace(yes))
638 }
639
640 /// Enable or disable the Unicode flag (`u`) by default.
641 ///
642 /// By default this is **enabled**. It may alternatively be selectively
643 /// disabled in the regular expression itself via the `u` flag.
644 ///
645 /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
646 /// default), a regular expression will fail to parse if Unicode mode is
647 /// disabled and a sub-expression could possibly match invalid UTF-8.
648 ///
649 /// **WARNING**: Unicode mode can greatly increase the size of the compiled
650 /// DFA, which can noticeably impact both memory usage and compilation
651 /// time. This is especially noticeable if your regex contains character
652 /// classes like `\w` that are impacted by whether Unicode is enabled or
653 /// not. If Unicode is not necessary, you are encouraged to disable it.
654 pub fn unicode_mode(&mut self, yes: bool) -> &mut Self {
655 self.set_config(|x| x.unicode(yes))
656 }
657
658 /// Limit for how many times backtracking should be attempted for fancy regexes (where
659 /// backtracking is used). If this limit is exceeded, execution returns an error with
660 /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded).
661 /// This is for preventing a regex with catastrophic backtracking to run for too long.
662 ///
663 /// Default is `1_000_000` (1 million).
664 pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
665 self.0.backtrack_limit = limit;
666 self
667 }
668
669 /// Set the approximate size limit of the compiled regular expression.
670 ///
671 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
672 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
673 /// such the actual limit is closer to `<number of delegated regexes> * delegate_size_limit`.
674 pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
675 self.0.delegate_size_limit = Some(limit);
676 self
677 }
678
679 /// Set the approximate size of the cache used by the DFA.
680 ///
681 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
682 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
683 /// such the actual limit is closer to `<number of delegated regexes> *
684 /// delegate_dfa_size_limit`.
685 pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
686 self.0.delegate_dfa_size_limit = Some(limit);
687 self
688 }
689}
690
691impl fmt::Debug for Regex {
692 /// Shows the original regular expression.
693 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
694 write!(f, "{}", self.as_str())
695 }
696}
697
698impl fmt::Display for Regex {
699 /// Shows the original regular expression
700 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
701 write!(f, "{}", self.as_str())
702 }
703}
704
705impl FromStr for Regex {
706 type Err = Error;
707
708 /// Attempts to parse a string into a regular expression
709 fn from_str(s: &str) -> Result<Regex> {
710 Regex::new(s)
711 }
712}
713
714impl Regex {
715 /// Parse and compile a regex with default options, see `RegexBuilder`.
716 ///
717 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
718 pub fn new(re: &str) -> Result<Regex> {
719 let options = RegexOptions {
720 pattern: re.to_string(),
721 ..RegexOptions::default()
722 };
723 Self::new_options(options)
724 }
725
726 fn new_options(options: RegexOptions) -> Result<Regex> {
727 let raw_tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags())?;
728
729 // wrapper to search for re at arbitrary start position,
730 // and to capture the match bounds
731 let tree = wrap_tree(raw_tree);
732
733 let info = analyze(&tree)?;
734
735 let inner_info = &info.children[1].children[0]; // references inner expr
736 if !inner_info.hard {
737 // easy case, wrap regex
738
739 // we do our own to_str because escapes are different
740 let mut re_cooked = String::new();
741 // same as raw_tree.expr above, but it was moved, so traverse to find it
742 let raw_e = match tree.expr {
743 Expr::Concat(ref v) => match v[1] {
744 Expr::Group(ref child) => child,
745 _ => unreachable!(),
746 },
747 _ => unreachable!(),
748 };
749 raw_e.to_str(&mut re_cooked, 0);
750 let inner = compile::compile_inner(&re_cooked, &options)?;
751 return Ok(Regex {
752 inner: RegexImpl::Wrap { inner, options },
753 named_groups: Arc::new(tree.named_groups),
754 });
755 }
756
757 let prog = compile(&info)?;
758 Ok(Regex {
759 inner: RegexImpl::Fancy {
760 prog,
761 n_groups: info.end_group,
762 options,
763 },
764 named_groups: Arc::new(tree.named_groups),
765 })
766 }
767
768 /// Returns the original string of this regex.
769 pub fn as_str(&self) -> &str {
770 match &self.inner {
771 RegexImpl::Wrap { options, .. } => &options.pattern,
772 RegexImpl::Fancy { options, .. } => &options.pattern,
773 }
774 }
775
776 /// Check if the regex matches the input text.
777 ///
778 /// # Example
779 ///
780 /// Test if some text contains the same word twice:
781 ///
782 /// ```rust
783 /// # use fancy_regex::Regex;
784 ///
785 /// let re = Regex::new(r"(\w+) \1").unwrap();
786 /// assert!(re.is_match("mirror mirror on the wall").unwrap());
787 /// ```
788 pub fn is_match(&self, text: &str) -> Result<bool> {
789 match &self.inner {
790 RegexImpl::Wrap { ref inner, .. } => Ok(inner.is_match(text)),
791 RegexImpl::Fancy {
792 ref prog, options, ..
793 } => {
794 let result = vm::run(prog, text, 0, 0, options)?;
795 Ok(result.is_some())
796 }
797 }
798 }
799
800 /// Returns an iterator for each successive non-overlapping match in `text`.
801 ///
802 /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures_iter()]
803 /// method.
804 ///
805 /// # Example
806 ///
807 /// Find all words followed by an exclamation point:
808 ///
809 /// ```rust
810 /// # use fancy_regex::Regex;
811 ///
812 /// let re = Regex::new(r"\w+(?=!)").unwrap();
813 /// let mut matches = re.find_iter("so fancy! even with! iterators!");
814 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "fancy");
815 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "with");
816 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "iterators");
817 /// assert!(matches.next().is_none());
818 /// ```
819 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
820 Matches {
821 re: &self,
822 text,
823 last_end: 0,
824 last_match: None,
825 }
826 }
827
828 /// Find the first match in the input text.
829 ///
830 /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures()]
831 /// method.
832 ///
833 /// # Example
834 ///
835 /// Find a word that is followed by an exclamation point:
836 ///
837 /// ```rust
838 /// # use fancy_regex::Regex;
839 ///
840 /// let re = Regex::new(r"\w+(?=!)").unwrap();
841 /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy");
842 /// ```
843 pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
844 self.find_from_pos(text, 0)
845 }
846
847 /// Returns the first match in `text`, starting from the specified byte position `pos`.
848 ///
849 /// # Examples
850 ///
851 /// Finding match starting at a position:
852 ///
853 /// ```
854 /// # use fancy_regex::Regex;
855 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
856 /// let text = "1 test 123\n2 foo";
857 /// let mat = re.find_from_pos(text, 7).unwrap().unwrap();
858 ///
859 /// assert_eq!(mat.start(), 11);
860 /// assert_eq!(mat.end(), 12);
861 /// ```
862 ///
863 /// Note that in some cases this is not the same as using the `find`
864 /// method and passing a slice of the string, see [Regex::captures_from_pos()] for details.
865 pub fn find_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Match<'t>>> {
866 self.find_from_pos_with_option_flags(text, pos, 0)
867 }
868
869 fn find_from_pos_with_option_flags<'t>(
870 &self,
871 text: &'t str,
872 pos: usize,
873 option_flags: u32,
874 ) -> Result<Option<Match<'t>>> {
875 match &self.inner {
876 RegexImpl::Wrap { inner, .. } => Ok(inner
877 .search(&RaInput::new(text).span(pos..text.len()))
878 .map(|m| Match::new(text, m.start(), m.end()))),
879 RegexImpl::Fancy { prog, options, .. } => {
880 let result = vm::run(prog, text, pos, option_flags, options)?;
881 Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
882 }
883 }
884 }
885
886 /// Returns an iterator over all the non-overlapping capture groups matched in `text`.
887 ///
888 /// # Examples
889 ///
890 /// Finding all matches and capturing parts of each:
891 ///
892 /// ```rust
893 /// # use fancy_regex::Regex;
894 ///
895 /// let re = Regex::new(r"(\d{4})-(\d{2})").unwrap();
896 /// let text = "It was between 2018-04 and 2020-01";
897 /// let mut all_captures = re.captures_iter(text);
898 ///
899 /// let first = all_captures.next().unwrap().unwrap();
900 /// assert_eq!(first.get(1).unwrap().as_str(), "2018");
901 /// assert_eq!(first.get(2).unwrap().as_str(), "04");
902 /// assert_eq!(first.get(0).unwrap().as_str(), "2018-04");
903 ///
904 /// let second = all_captures.next().unwrap().unwrap();
905 /// assert_eq!(second.get(1).unwrap().as_str(), "2020");
906 /// assert_eq!(second.get(2).unwrap().as_str(), "01");
907 /// assert_eq!(second.get(0).unwrap().as_str(), "2020-01");
908 ///
909 /// assert!(all_captures.next().is_none());
910 /// ```
911 pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CaptureMatches<'r, 't> {
912 CaptureMatches(self.find_iter(text))
913 }
914
915 /// Returns the capture groups for the first match in `text`.
916 ///
917 /// If no match is found, then `Ok(None)` is returned.
918 ///
919 /// # Examples
920 ///
921 /// Finding matches and capturing parts of the match:
922 ///
923 /// ```rust
924 /// # use fancy_regex::Regex;
925 ///
926 /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
927 /// let text = "The date was 2018-04-07";
928 /// let captures = re.captures(text).unwrap().unwrap();
929 ///
930 /// assert_eq!(captures.get(1).unwrap().as_str(), "2018");
931 /// assert_eq!(captures.get(2).unwrap().as_str(), "04");
932 /// assert_eq!(captures.get(3).unwrap().as_str(), "07");
933 /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07");
934 /// ```
935 pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
936 self.captures_from_pos(text, 0)
937 }
938
939 /// Returns the capture groups for the first match in `text`, starting from
940 /// the specified byte position `pos`.
941 ///
942 /// # Examples
943 ///
944 /// Finding captures starting at a position:
945 ///
946 /// ```
947 /// # use fancy_regex::Regex;
948 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
949 /// let text = "1 test 123\n2 foo";
950 /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap();
951 ///
952 /// let group = captures.get(1).unwrap();
953 /// assert_eq!(group.as_str(), "2");
954 /// assert_eq!(group.start(), 11);
955 /// assert_eq!(group.end(), 12);
956 /// ```
957 ///
958 /// Note that in some cases this is not the same as using the `captures`
959 /// method and passing a slice of the string, see the capture that we get
960 /// when we do this:
961 ///
962 /// ```
963 /// # use fancy_regex::Regex;
964 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
965 /// let text = "1 test 123\n2 foo";
966 /// let captures = re.captures(&text[7..]).unwrap().unwrap();
967 /// assert_eq!(captures.get(1).unwrap().as_str(), "123");
968 /// ```
969 ///
970 /// This matched the number "123" because it's at the beginning of the text
971 /// of the string slice.
972 ///
973 pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
974 let named_groups = self.named_groups.clone();
975 match &self.inner {
976 RegexImpl::Wrap { inner, .. } => {
977 let mut locations = inner.create_captures();
978 inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
979 Ok(locations.is_match().then(|| Captures {
980 inner: CapturesImpl::Wrap { text, locations },
981 named_groups,
982 }))
983 }
984 RegexImpl::Fancy {
985 prog,
986 n_groups,
987 options,
988 ..
989 } => {
990 let result = vm::run(prog, text, pos, 0, options)?;
991 Ok(result.map(|mut saves| {
992 saves.truncate(n_groups * 2);
993 Captures {
994 inner: CapturesImpl::Fancy { text, saves },
995 named_groups,
996 }
997 }))
998 }
999 }
1000 }
1001
1002 /// Returns the number of captures, including the implicit capture of the entire expression.
1003 pub fn captures_len(&self) -> usize {
1004 match &self.inner {
1005 RegexImpl::Wrap { inner, .. } => inner.captures_len(),
1006 RegexImpl::Fancy { n_groups, .. } => *n_groups,
1007 }
1008 }
1009
1010 /// Returns an iterator over the capture names.
1011 pub fn capture_names(&self) -> CaptureNames {
1012 let mut names = Vec::new();
1013 names.resize(self.captures_len(), None);
1014 for (name, &i) in self.named_groups.iter() {
1015 names[i] = Some(name.as_str());
1016 }
1017 CaptureNames(names.into_iter())
1018 }
1019
1020 // for debugging only
1021 #[doc(hidden)]
1022 pub fn debug_print(&self, writer: &mut Formatter<'_>) -> fmt::Result {
1023 match &self.inner {
1024 RegexImpl::Wrap { options, .. } => {
1025 write!(writer, "wrapped Regex {:?}", options.pattern)
1026 }
1027 RegexImpl::Fancy { prog, .. } => prog.debug_print(writer),
1028 }
1029 }
1030
1031 /// Replaces the leftmost-first match with the replacement provided.
1032 /// The replacement can be a regular string (where `$N` and `$name` are
1033 /// expanded to match capture groups) or a function that takes the matches'
1034 /// `Captures` and returns the replaced string.
1035 ///
1036 /// If no match is found, then a copy of the string is returned unchanged.
1037 ///
1038 /// # Replacement string syntax
1039 ///
1040 /// All instances of `$name` in the replacement text is replaced with the
1041 /// corresponding capture group `name`.
1042 ///
1043 /// `name` may be an integer corresponding to the index of the
1044 /// capture group (counted by order of opening parenthesis where `0` is the
1045 /// entire match) or it can be a name (consisting of letters, digits or
1046 /// underscores) corresponding to a named capture group.
1047 ///
1048 /// If `name` isn't a valid capture group (whether the name doesn't exist
1049 /// or isn't a valid index), then it is replaced with the empty string.
1050 ///
1051 /// The longest possible name is used. e.g., `$1a` looks up the capture
1052 /// group named `1a` and not the capture group at index `1`. To exert more
1053 /// precise control over the name, use braces, e.g., `${1}a`.
1054 ///
1055 /// To write a literal `$` use `$$`.
1056 ///
1057 /// # Examples
1058 ///
1059 /// Note that this function is polymorphic with respect to the replacement.
1060 /// In typical usage, this can just be a normal string:
1061 ///
1062 /// ```rust
1063 /// # use fancy_regex::Regex;
1064 /// let re = Regex::new("[^01]+").unwrap();
1065 /// assert_eq!(re.replace("1078910", ""), "1010");
1066 /// ```
1067 ///
1068 /// But anything satisfying the `Replacer` trait will work. For example,
1069 /// a closure of type `|&Captures| -> String` provides direct access to the
1070 /// captures corresponding to a match. This allows one to access
1071 /// capturing group matches easily:
1072 ///
1073 /// ```rust
1074 /// # use fancy_regex::{Regex, Captures};
1075 /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
1076 /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
1077 /// format!("{} {}", &caps[2], &caps[1])
1078 /// });
1079 /// assert_eq!(result, "Bruce Springsteen");
1080 /// ```
1081 ///
1082 /// But this is a bit cumbersome to use all the time. Instead, a simple
1083 /// syntax is supported that expands `$name` into the corresponding capture
1084 /// group. Here's the last example, but using this expansion technique
1085 /// with named capture groups:
1086 ///
1087 /// ```rust
1088 /// # use fancy_regex::Regex;
1089 /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
1090 /// let result = re.replace("Springsteen, Bruce", "$first $last");
1091 /// assert_eq!(result, "Bruce Springsteen");
1092 /// ```
1093 ///
1094 /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
1095 /// would produce the same result. To write a literal `$` use `$$`.
1096 ///
1097 /// Sometimes the replacement string requires use of curly braces to
1098 /// delineate a capture group replacement and surrounding literal text.
1099 /// For example, if we wanted to join two words together with an
1100 /// underscore:
1101 ///
1102 /// ```rust
1103 /// # use fancy_regex::Regex;
1104 /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
1105 /// let result = re.replace("deep fried", "${first}_$second");
1106 /// assert_eq!(result, "deep_fried");
1107 /// ```
1108 ///
1109 /// Without the curly braces, the capture group name `first_` would be
1110 /// used, and since it doesn't exist, it would be replaced with the empty
1111 /// string.
1112 ///
1113 /// Finally, sometimes you just want to replace a literal string with no
1114 /// regard for capturing group expansion. This can be done by wrapping a
1115 /// byte string with `NoExpand`:
1116 ///
1117 /// ```rust
1118 /// # use fancy_regex::Regex;
1119 /// use fancy_regex::NoExpand;
1120 ///
1121 /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
1122 /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
1123 /// assert_eq!(result, "$2 $last");
1124 /// ```
1125 pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1126 self.replacen(text, 1, rep)
1127 }
1128
1129 /// Replaces all non-overlapping matches in `text` with the replacement
1130 /// provided. This is the same as calling `replacen` with `limit` set to
1131 /// `0`.
1132 ///
1133 /// See the documentation for `replace` for details on how to access
1134 /// capturing group matches in the replacement string.
1135 pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1136 self.replacen(text, 0, rep)
1137 }
1138
1139 /// Replaces at most `limit` non-overlapping matches in `text` with the
1140 /// replacement provided. If `limit` is 0, then all non-overlapping matches
1141 /// are replaced.
1142 ///
1143 /// Will panic if any errors are encountered. Use `try_replacen`, which this
1144 /// function unwraps, if you want to handle errors.
1145 ///
1146 /// See the documentation for `replace` for details on how to access
1147 /// capturing group matches in the replacement string.
1148 ///
1149 pub fn replacen<'t, R: Replacer>(&self, text: &'t str, limit: usize, rep: R) -> Cow<'t, str> {
1150 self.try_replacen(text, limit, rep).unwrap()
1151 }
1152
1153 /// Replaces at most `limit` non-overlapping matches in `text` with the
1154 /// replacement provided. If `limit` is 0, then all non-overlapping matches
1155 /// are replaced.
1156 ///
1157 /// Propagates any errors encountered, such as `RuntimeError::BacktrackLimitExceeded`.
1158 ///
1159 /// See the documentation for `replace` for details on how to access
1160 /// capturing group matches in the replacement string.
1161 pub fn try_replacen<'t, R: Replacer>(
1162 &self,
1163 text: &'t str,
1164 limit: usize,
1165 mut rep: R,
1166 ) -> Result<Cow<'t, str>> {
1167 // If we know that the replacement doesn't have any capture expansions,
1168 // then we can fast path. The fast path can make a tremendous
1169 // difference:
1170 //
1171 // 1) We use `find_iter` instead of `captures_iter`. Not asking for
1172 // captures generally makes the regex engines faster.
1173 // 2) We don't need to look up all of the capture groups and do
1174 // replacements inside the replacement string. We just push it
1175 // at each match and be done with it.
1176 if let Some(rep) = rep.no_expansion() {
1177 let mut it = self.find_iter(text).enumerate().peekable();
1178 if it.peek().is_none() {
1179 return Ok(Cow::Borrowed(text));
1180 }
1181 let mut new = String::with_capacity(text.len());
1182 let mut last_match = 0;
1183 for (i, m) in it {
1184 let m = m?;
1185
1186 if limit > 0 && i >= limit {
1187 break;
1188 }
1189 new.push_str(&text[last_match..m.start()]);
1190 new.push_str(&rep);
1191 last_match = m.end();
1192 }
1193 new.push_str(&text[last_match..]);
1194 return Ok(Cow::Owned(new));
1195 }
1196
1197 // The slower path, which we use if the replacement needs access to
1198 // capture groups.
1199 let mut it = self.captures_iter(text).enumerate().peekable();
1200 if it.peek().is_none() {
1201 return Ok(Cow::Borrowed(text));
1202 }
1203 let mut new = String::with_capacity(text.len());
1204 let mut last_match = 0;
1205 for (i, cap) in it {
1206 let cap = cap?;
1207
1208 if limit > 0 && i >= limit {
1209 break;
1210 }
1211 // unwrap on 0 is OK because captures only reports matches
1212 let m = cap.get(0).unwrap();
1213 new.push_str(&text[last_match..m.start()]);
1214 rep.replace_append(&cap, &mut new);
1215 last_match = m.end();
1216 }
1217 new.push_str(&text[last_match..]);
1218 Ok(Cow::Owned(new))
1219 }
1220
1221 /// Splits the string by matches of the regex.
1222 ///
1223 /// Returns an iterator over the substrings of the target string
1224 /// that *aren't* matched by the regex.
1225 ///
1226 /// # Example
1227 ///
1228 /// To split a string delimited by arbitrary amounts of spaces or tabs:
1229 ///
1230 /// ```rust
1231 /// # use fancy_regex::Regex;
1232 /// let re = Regex::new(r"[ \t]+").unwrap();
1233 /// let target = "a b \t c\td e";
1234 /// let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
1235 /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
1236 /// ```
1237 pub fn split<'r, 'h>(&'r self, target: &'h str) -> Split<'r, 'h> {
1238 Split {
1239 matches: self.find_iter(target),
1240 next_start: 0,
1241 target,
1242 }
1243 }
1244
1245 /// Splits the string by matches of the regex at most `limit` times.
1246 ///
1247 /// Returns an iterator over the substrings of the target string
1248 /// that *aren't* matched by the regex.
1249 ///
1250 /// The `N`th substring is the remaining part of the target.
1251 ///
1252 /// # Example
1253 ///
1254 /// To split a string delimited by arbitrary amounts of spaces or tabs
1255 /// 3 times:
1256 ///
1257 /// ```rust
1258 /// # use fancy_regex::Regex;
1259 /// let re = Regex::new(r"[ \t]+").unwrap();
1260 /// let target = "a b \t c\td e";
1261 /// let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
1262 /// assert_eq!(fields, vec!["a", "b", "c\td e"]);
1263 /// ```
1264 pub fn splitn<'r, 'h>(&'r self, target: &'h str, limit: usize) -> SplitN<'r, 'h> {
1265 SplitN {
1266 splits: self.split(target),
1267 limit: limit,
1268 }
1269 }
1270}
1271
1272impl TryFrom<&str> for Regex {
1273 type Error = Error;
1274
1275 /// Attempts to parse a string into a regular expression
1276 fn try_from(s: &str) -> Result<Self> {
1277 Self::new(s)
1278 }
1279}
1280
1281impl TryFrom<String> for Regex {
1282 type Error = Error;
1283
1284 /// Attempts to parse a string into a regular expression
1285 fn try_from(s: String) -> Result<Self> {
1286 Self::new(&s)
1287 }
1288}
1289
1290impl<'t> Match<'t> {
1291 /// Returns the starting byte offset of the match in the text.
1292 #[inline]
1293 pub fn start(&self) -> usize {
1294 self.start
1295 }
1296
1297 /// Returns the ending byte offset of the match in the text.
1298 #[inline]
1299 pub fn end(&self) -> usize {
1300 self.end
1301 }
1302
1303 /// Returns the range over the starting and ending byte offsets of the match in text.
1304 #[inline]
1305 pub fn range(&self) -> Range<usize> {
1306 self.start..self.end
1307 }
1308
1309 /// Returns the matched text.
1310 #[inline]
1311 pub fn as_str(&self) -> &'t str {
1312 &self.text[self.start..self.end]
1313 }
1314
1315 /// Creates a new match from the given text and byte offsets.
1316 fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
1317 Match { text, start, end }
1318 }
1319}
1320
1321impl<'t> From<Match<'t>> for &'t str {
1322 fn from(m: Match<'t>) -> &'t str {
1323 m.as_str()
1324 }
1325}
1326
1327impl<'t> From<Match<'t>> for Range<usize> {
1328 fn from(m: Match<'t>) -> Range<usize> {
1329 m.range()
1330 }
1331}
1332
1333#[allow(clippy::len_without_is_empty)] // follow regex's API
1334impl<'t> Captures<'t> {
1335 /// Get the capture group by its index in the regex.
1336 ///
1337 /// If there is no match for that group or the index does not correspond to a group, `None` is
1338 /// returned. The index 0 returns the whole match.
1339 pub fn get(&self, i: usize) -> Option<Match<'t>> {
1340 match &self.inner {
1341 CapturesImpl::Wrap { text, locations } => locations.get_group(i).map(|span| Match {
1342 text,
1343 start: span.start,
1344 end: span.end,
1345 }),
1346 CapturesImpl::Fancy { text, ref saves } => {
1347 let slot = i * 2;
1348 if slot >= saves.len() {
1349 return None;
1350 }
1351 let lo = saves[slot];
1352 if lo == usize::MAX {
1353 return None;
1354 }
1355 let hi = saves[slot + 1];
1356 Some(Match {
1357 text,
1358 start: lo,
1359 end: hi,
1360 })
1361 }
1362 }
1363 }
1364
1365 /// Returns the match for a named capture group. Returns `None` the capture
1366 /// group did not match or if there is no group with the given name.
1367 pub fn name(&self, name: &str) -> Option<Match<'t>> {
1368 self.named_groups.get(name).and_then(|i| self.get(*i))
1369 }
1370
1371 /// Expands all instances of `$group` in `replacement` to the corresponding
1372 /// capture group `name`, and writes them to the `dst` buffer given.
1373 ///
1374 /// `group` may be an integer corresponding to the index of the
1375 /// capture group (counted by order of opening parenthesis where `\0` is the
1376 /// entire match) or it can be a name (consisting of letters, digits or
1377 /// underscores) corresponding to a named capture group.
1378 ///
1379 /// If `group` isn't a valid capture group (whether the name doesn't exist
1380 /// or isn't a valid index), then it is replaced with the empty string.
1381 ///
1382 /// The longest possible name is used. e.g., `$1a` looks up the capture
1383 /// group named `1a` and not the capture group at index `1`. To exert more
1384 /// precise control over the name, use braces, e.g., `${1}a`.
1385 ///
1386 /// To write a literal `$`, use `$$`.
1387 ///
1388 /// For more control over expansion, see [`Expander`].
1389 ///
1390 /// [`Expander`]: expand/struct.Expander.html
1391 pub fn expand(&self, replacement: &str, dst: &mut String) {
1392 Expander::default().append_expansion(dst, replacement, self);
1393 }
1394
1395 /// Iterate over the captured groups in order in which they appeared in the regex. The first
1396 /// capture corresponds to the whole match.
1397 pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
1398 SubCaptureMatches { caps: self, i: 0 }
1399 }
1400
1401 /// How many groups were captured. This is always at least 1 because group 0 returns the whole
1402 /// match.
1403 pub fn len(&self) -> usize {
1404 match &self.inner {
1405 CapturesImpl::Wrap { locations, .. } => locations.group_len(),
1406 CapturesImpl::Fancy { saves, .. } => saves.len() / 2,
1407 }
1408 }
1409}
1410
1411/// Get a group by index.
1412///
1413/// `'t` is the lifetime of the matched text.
1414///
1415/// The text can't outlive the `Captures` object if this method is
1416/// used, because of how `Index` is defined (normally `a[i]` is part
1417/// of `a` and can't outlive it); to do that, use `get()` instead.
1418///
1419/// # Panics
1420///
1421/// If there is no group at the given index.
1422impl<'t> Index<usize> for Captures<'t> {
1423 type Output = str;
1424
1425 fn index(&self, i: usize) -> &str {
1426 self.get(i)
1427 .map(|m| m.as_str())
1428 .unwrap_or_else(|| panic!("no group at index '{}'", i))
1429 }
1430}
1431
1432/// Get a group by name.
1433///
1434/// `'t` is the lifetime of the matched text and `'i` is the lifetime
1435/// of the group name (the index).
1436///
1437/// The text can't outlive the `Captures` object if this method is
1438/// used, because of how `Index` is defined (normally `a[i]` is part
1439/// of `a` and can't outlive it); to do that, use `name` instead.
1440///
1441/// # Panics
1442///
1443/// If there is no group named by the given value.
1444impl<'t, 'i> Index<&'i str> for Captures<'t> {
1445 type Output = str;
1446
1447 fn index<'a>(&'a self, name: &'i str) -> &'a str {
1448 self.name(name)
1449 .map(|m| m.as_str())
1450 .unwrap_or_else(|| panic!("no group named '{}'", name))
1451 }
1452}
1453
1454impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
1455 type Item = Option<Match<'t>>;
1456
1457 fn next(&mut self) -> Option<Option<Match<'t>>> {
1458 if self.i < self.caps.len() {
1459 let result = self.caps.get(self.i);
1460 self.i += 1;
1461 Some(result)
1462 } else {
1463 None
1464 }
1465 }
1466}
1467
1468// TODO: might be nice to implement ExactSizeIterator etc for SubCaptures
1469
1470/// Regular expression AST. This is public for now but may change.
1471#[derive(Debug, PartialEq, Eq)]
1472pub enum Expr {
1473 /// An empty expression, e.g. the last branch in `(a|b|)`
1474 Empty,
1475 /// Any character, regex `.`
1476 Any {
1477 /// Whether it also matches newlines or not
1478 newline: bool,
1479 },
1480 /// An assertion
1481 Assertion(Assertion),
1482 /// The string as a literal, e.g. `a`
1483 Literal {
1484 /// The string to match
1485 val: String,
1486 /// Whether match is case-insensitive or not
1487 casei: bool,
1488 },
1489 /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of
1490 /// the literal `a` and `.` for any character
1491 Concat(Vec<Expr>),
1492 /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative
1493 /// where either the literal `a` or `b` must match
1494 Alt(Vec<Expr>),
1495 /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures"
1496 /// (remembers) the match
1497 Group(Box<Expr>),
1498 /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g.
1499 /// `(?=a)` means the next character must be `a` (but the match is not consumed)
1500 LookAround(Box<Expr>, LookAround),
1501 /// Repeat of an expression, e.g. `a*` or `a+` or `a{1,3}`
1502 Repeat {
1503 /// The expression that is being repeated
1504 child: Box<Expr>,
1505 /// The minimum number of repetitions
1506 lo: usize,
1507 /// The maximum number of repetitions (or `usize::MAX`)
1508 hi: usize,
1509 /// Greedy means as much as possible is matched, e.g. `.*b` would match all of `abab`.
1510 /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`.
1511 greedy: bool,
1512 },
1513 /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have
1514 /// to represent all the expressions in the AST, e.g. character classes.
1515 Delegate {
1516 /// The regex
1517 inner: String,
1518 /// How many characters the regex matches
1519 size: usize, // TODO: move into analysis result
1520 /// Whether the matching is case-insensitive or not
1521 casei: bool,
1522 },
1523 /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group
1524 /// and the whole regex matches either `abcabc` or `defdef`.
1525 Backref {
1526 /// The capture group number being referenced
1527 group: usize,
1528 /// Whether the matching is case-insensitive or not
1529 casei: bool,
1530 },
1531 /// Back reference to a capture group at the given specified relative recursion level.
1532 BackrefWithRelativeRecursionLevel {
1533 /// The capture group number being referenced
1534 group: usize,
1535 /// Relative recursion level
1536 relative_level: isize,
1537 /// Whether the matching is case-insensitive or not
1538 casei: bool,
1539 },
1540 /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and
1541 /// never backtrack and try `a`, even if matching fails after the atomic group.
1542 AtomicGroup(Box<Expr>),
1543 /// Keep matched text so far out of overall match
1544 KeepOut,
1545 /// Anchor to match at the position where the previous match ended
1546 ContinueFromPreviousMatchEnd,
1547 /// Conditional expression based on whether the numbered capture group matched or not
1548 BackrefExistsCondition(usize),
1549 /// If/Then/Else Condition. If there is no Then/Else, these will just be empty expressions.
1550 Conditional {
1551 /// The conditional expression to evaluate
1552 condition: Box<Expr>,
1553 /// What to execute if the condition is true
1554 true_branch: Box<Expr>,
1555 /// What to execute if the condition is false
1556 false_branch: Box<Expr>,
1557 },
1558 /// Subroutine call to the specified group number
1559 SubroutineCall(usize),
1560 /// Unresolved subroutine call to the specified group name
1561 UnresolvedNamedSubroutineCall {
1562 /// The capture group name
1563 name: String,
1564 /// The position in the original regex pattern where the subroutine call is made
1565 ix: usize,
1566 },
1567}
1568
1569/// Type of look-around assertion as used for a look-around expression.
1570#[derive(Debug, PartialEq, Eq, Clone, Copy)]
1571pub enum LookAround {
1572 /// Look-ahead assertion, e.g. `(?=a)`
1573 LookAhead,
1574 /// Negative look-ahead assertion, e.g. `(?!a)`
1575 LookAheadNeg,
1576 /// Look-behind assertion, e.g. `(?<=a)`
1577 LookBehind,
1578 /// Negative look-behind assertion, e.g. `(?<!a)`
1579 LookBehindNeg,
1580}
1581
1582/// An iterator over capture names in a [Regex]. The iterator
1583/// returns the name of each group, or [None] if the group has
1584/// no name. Because capture group 0 cannot have a name, the
1585/// first item returned is always [None].
1586pub struct CaptureNames<'r>(vec::IntoIter<Option<&'r str>>);
1587
1588impl Debug for CaptureNames<'_> {
1589 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
1590 f.write_str("<CaptureNames>")
1591 }
1592}
1593
1594impl<'r> Iterator for CaptureNames<'r> {
1595 type Item = Option<&'r str>;
1596
1597 fn next(&mut self) -> Option<Self::Item> {
1598 self.0.next()
1599 }
1600}
1601
1602// silly to write my own, but this is super-fast for the common 1-digit
1603// case.
1604fn push_usize(s: &mut String, x: usize) {
1605 if x >= 10 {
1606 push_usize(s, x / 10);
1607 s.push((b'0' + (x % 10) as u8) as char);
1608 } else {
1609 s.push((b'0' + (x as u8)) as char);
1610 }
1611}
1612
1613fn is_special(c: char) -> bool {
1614 match c {
1615 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
1616 | '#' => true,
1617 _ => false,
1618 }
1619}
1620
1621fn push_quoted(buf: &mut String, s: &str) {
1622 for c in s.chars() {
1623 if is_special(c) {
1624 buf.push('\\');
1625 }
1626 buf.push(c);
1627 }
1628}
1629
1630/// Escapes special characters in `text` with '\\'. Returns a string which, when interpreted
1631/// as a regex, matches exactly `text`.
1632pub fn escape(text: &str) -> Cow<str> {
1633 // Using bytes() is OK because all special characters are single bytes.
1634 match text.bytes().filter(|&b| is_special(b as char)).count() {
1635 0 => Cow::Borrowed(text),
1636 n => {
1637 // The capacity calculation is exact because '\\' is a single byte.
1638 let mut buf = String::with_capacity(text.len() + n);
1639 push_quoted(&mut buf, text);
1640 Cow::Owned(buf)
1641 }
1642 }
1643}
1644
1645/// Type of assertions
1646#[derive(Debug, PartialEq, Eq, Clone, Copy)]
1647pub enum Assertion {
1648 /// Start of input text
1649 StartText,
1650 /// End of input text
1651 EndText,
1652 /// Start of a line
1653 StartLine {
1654 /// CRLF mode
1655 crlf: bool,
1656 },
1657 /// End of a line
1658 EndLine {
1659 /// CRLF mode
1660 crlf: bool,
1661 },
1662 /// Left word boundary
1663 LeftWordBoundary,
1664 /// Right word boundary
1665 RightWordBoundary,
1666 /// Both word boundaries
1667 WordBoundary,
1668 /// Not word boundary
1669 NotWordBoundary,
1670}
1671
1672impl Assertion {
1673 pub(crate) fn is_hard(&self) -> bool {
1674 use Assertion::*;
1675 matches!(
1676 self,
1677 // these will make regex-automata use PikeVM
1678 LeftWordBoundary | RightWordBoundary | WordBoundary | NotWordBoundary
1679 )
1680 }
1681}
1682
1683impl Expr {
1684 /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
1685 /// that are referenced by backrefs.
1686 pub fn parse_tree(re: &str) -> Result<ExprTree> {
1687 Parser::parse(re)
1688 }
1689
1690 /// Parse the regex and return an expression (AST)
1691 /// Flags should be bit based based on flags
1692 pub fn parse_tree_with_flags(re: &str, flags: u32) -> Result<ExprTree> {
1693 Parser::parse_with_flags(re, flags)
1694 }
1695
1696 /// Convert expression to a regex string in the regex crate's syntax.
1697 ///
1698 /// # Panics
1699 ///
1700 /// Panics for expressions that are hard, i.e. can not be handled by the regex crate.
1701 pub fn to_str(&self, buf: &mut String, precedence: u8) {
1702 match *self {
1703 Expr::Empty => (),
1704 Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
1705 Expr::Literal { ref val, casei } => {
1706 if casei {
1707 buf.push_str("(?i:");
1708 }
1709 push_quoted(buf, val);
1710 if casei {
1711 buf.push_str(")");
1712 }
1713 }
1714 Expr::Assertion(Assertion::StartText) => buf.push('^'),
1715 Expr::Assertion(Assertion::EndText) => buf.push('$'),
1716 Expr::Assertion(Assertion::StartLine { crlf: false }) => buf.push_str("(?m:^)"),
1717 Expr::Assertion(Assertion::EndLine { crlf: false }) => buf.push_str("(?m:$)"),
1718 Expr::Assertion(Assertion::StartLine { crlf: true }) => buf.push_str("(?Rm:^)"),
1719 Expr::Assertion(Assertion::EndLine { crlf: true }) => buf.push_str("(?Rm:$)"),
1720 Expr::Concat(ref children) => {
1721 if precedence > 1 {
1722 buf.push_str("(?:");
1723 }
1724 for child in children {
1725 child.to_str(buf, 2);
1726 }
1727 if precedence > 1 {
1728 buf.push(')')
1729 }
1730 }
1731 Expr::Alt(ref children) => {
1732 if precedence > 0 {
1733 buf.push_str("(?:");
1734 }
1735 for (i, child) in children.iter().enumerate() {
1736 if i != 0 {
1737 buf.push('|');
1738 }
1739 child.to_str(buf, 1);
1740 }
1741 if precedence > 0 {
1742 buf.push(')');
1743 }
1744 }
1745 Expr::Group(ref child) => {
1746 buf.push('(');
1747 child.to_str(buf, 0);
1748 buf.push(')');
1749 }
1750 Expr::Repeat {
1751 ref child,
1752 lo,
1753 hi,
1754 greedy,
1755 } => {
1756 if precedence > 2 {
1757 buf.push_str("(?:");
1758 }
1759 child.to_str(buf, 3);
1760 match (lo, hi) {
1761 (0, 1) => buf.push('?'),
1762 (0, usize::MAX) => buf.push('*'),
1763 (1, usize::MAX) => buf.push('+'),
1764 (lo, hi) => {
1765 buf.push('{');
1766 push_usize(buf, lo);
1767 if lo != hi {
1768 buf.push(',');
1769 if hi != usize::MAX {
1770 push_usize(buf, hi);
1771 }
1772 }
1773 buf.push('}');
1774 }
1775 }
1776 if !greedy {
1777 buf.push('?');
1778 }
1779 if precedence > 2 {
1780 buf.push(')');
1781 }
1782 }
1783 Expr::Delegate {
1784 ref inner, casei, ..
1785 } => {
1786 // at the moment, delegate nodes are just atoms
1787 if casei {
1788 buf.push_str("(?i:");
1789 }
1790 buf.push_str(inner);
1791 if casei {
1792 buf.push_str(")");
1793 }
1794 }
1795 _ => panic!("attempting to format hard expr"),
1796 }
1797 }
1798}
1799
1800// precondition: ix > 0
1801fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
1802 let bytes = s.as_bytes();
1803 loop {
1804 ix -= 1;
1805 // fancy bit magic for ranges 0..0x80 + 0xc0..
1806 if (bytes[ix] as i8) >= -0x40 {
1807 break;
1808 }
1809 }
1810 ix
1811}
1812
1813fn codepoint_len(b: u8) -> usize {
1814 match b {
1815 b if b < 0x80 => 1,
1816 b if b < 0xe0 => 2,
1817 b if b < 0xf0 => 3,
1818 _ => 4,
1819 }
1820}
1821
1822/// Returns the smallest possible index of the next valid UTF-8 sequence
1823/// starting after `i`.
1824/// Adapted from a function with the same name in the `regex` crate.
1825fn next_utf8(text: &str, i: usize) -> usize {
1826 let b = match text.as_bytes().get(i) {
1827 None => return i + 1,
1828 Some(&b) => b,
1829 };
1830 i + codepoint_len(b)
1831}
1832
1833// If this returns false, then there is no possible backref in the re
1834
1835// Both potential implementations are turned off, because we currently
1836// always need to do a deeper analysis because of 1-character
1837// look-behind. If we could call a find_from_pos method of regex::Regex,
1838// it would make sense to bring this back.
1839/*
1840pub fn detect_possible_backref(re: &str) -> bool {
1841 let mut last = b'\x00';
1842 for b in re.as_bytes() {
1843 if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; }
1844 last = *b;
1845 }
1846 false
1847}
1848
1849pub fn detect_possible_backref(re: &str) -> bool {
1850 let mut bytes = re.as_bytes();
1851 loop {
1852 match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) {
1853 Some(i) => {
1854 bytes = &bytes[i + 1..];
1855 let c = bytes[0];
1856 if b'0' <= c && c <= b'9' { return true; }
1857 }
1858 None => return false
1859 }
1860 }
1861}
1862*/
1863
1864/// wrapper to search for re at arbitrary start position,
1865/// and to capture the match bounds
1866pub fn wrap_tree(raw_tree: ExprTree) -> ExprTree {
1867 return ExprTree {
1868 expr: Expr::Concat(vec![
1869 Expr::Repeat {
1870 child: Box::new(Expr::Any { newline: true }),
1871 lo: 0,
1872 hi: usize::MAX,
1873 greedy: false,
1874 },
1875 Expr::Group(Box::new(raw_tree.expr)),
1876 ]),
1877 ..raw_tree
1878 };
1879}
1880
1881/// The internal module only exists so that the toy example can access internals for debugging and
1882/// experimenting.
1883#[doc(hidden)]
1884pub mod internal {
1885 pub use crate::analyze::analyze;
1886 pub use crate::compile::compile;
1887 pub use crate::vm::{run_default, run_trace, Insn, Prog};
1888}
1889
1890#[cfg(test)]
1891mod tests {
1892 use alloc::borrow::Cow;
1893 use alloc::boxed::Box;
1894 use alloc::string::String;
1895 use alloc::{format, vec};
1896
1897 use crate::parse::make_literal;
1898 use crate::{Expr, Regex};
1899
1900 //use detect_possible_backref;
1901
1902 // tests for to_str
1903
1904 fn to_str(e: Expr) -> String {
1905 let mut s = String::new();
1906 e.to_str(&mut s, 0);
1907 s
1908 }
1909
1910 #[test]
1911 fn to_str_concat_alt() {
1912 let e = Expr::Concat(vec![
1913 Expr::Alt(vec![make_literal("a"), make_literal("b")]),
1914 make_literal("c"),
1915 ]);
1916 assert_eq!(to_str(e), "(?:a|b)c");
1917 }
1918
1919 #[test]
1920 fn to_str_rep_concat() {
1921 let e = Expr::Repeat {
1922 child: Box::new(Expr::Concat(vec![make_literal("a"), make_literal("b")])),
1923 lo: 2,
1924 hi: 3,
1925 greedy: true,
1926 };
1927 assert_eq!(to_str(e), "(?:ab){2,3}");
1928 }
1929
1930 #[test]
1931 fn to_str_group_alt() {
1932 let e = Expr::Group(Box::new(Expr::Alt(vec![
1933 make_literal("a"),
1934 make_literal("b"),
1935 ])));
1936 assert_eq!(to_str(e), "(a|b)");
1937 }
1938
1939 #[test]
1940 fn as_str_debug() {
1941 let s = r"(a+)b\1";
1942 let regex = Regex::new(s).unwrap();
1943 assert_eq!(s, regex.as_str());
1944 assert_eq!(s, format!("{:?}", regex));
1945 }
1946
1947 #[test]
1948 fn display() {
1949 let s = r"(a+)b\1";
1950 let regex = Regex::new(s).unwrap();
1951 assert_eq!(s, format!("{}", regex));
1952 }
1953
1954 #[test]
1955 fn from_str() {
1956 let s = r"(a+)b\1";
1957 let regex = s.parse::<Regex>().unwrap();
1958 assert_eq!(regex.as_str(), s);
1959 }
1960
1961 #[test]
1962 fn to_str_repeat() {
1963 fn repeat(lo: usize, hi: usize, greedy: bool) -> Expr {
1964 Expr::Repeat {
1965 child: Box::new(make_literal("a")),
1966 lo,
1967 hi,
1968 greedy,
1969 }
1970 }
1971
1972 assert_eq!(to_str(repeat(2, 2, true)), "a{2}");
1973 assert_eq!(to_str(repeat(2, 2, false)), "a{2}?");
1974 assert_eq!(to_str(repeat(2, 3, true)), "a{2,3}");
1975 assert_eq!(to_str(repeat(2, 3, false)), "a{2,3}?");
1976 assert_eq!(to_str(repeat(2, usize::MAX, true)), "a{2,}");
1977 assert_eq!(to_str(repeat(2, usize::MAX, false)), "a{2,}?");
1978 assert_eq!(to_str(repeat(0, 1, true)), "a?");
1979 assert_eq!(to_str(repeat(0, 1, false)), "a??");
1980 assert_eq!(to_str(repeat(0, usize::MAX, true)), "a*");
1981 assert_eq!(to_str(repeat(0, usize::MAX, false)), "a*?");
1982 assert_eq!(to_str(repeat(1, usize::MAX, true)), "a+");
1983 assert_eq!(to_str(repeat(1, usize::MAX, false)), "a+?");
1984 }
1985
1986 #[test]
1987 fn escape() {
1988 // Check that strings that need no quoting are borrowed, and that non-special punctuation
1989 // is not quoted.
1990 match crate::escape("@foo") {
1991 Cow::Borrowed(s) => assert_eq!(s, "@foo"),
1992 _ => panic!("Value should be borrowed."),
1993 }
1994
1995 // Check typical usage.
1996 assert_eq!(crate::escape("fo*o").into_owned(), "fo\\*o");
1997
1998 // Check that multibyte characters are handled correctly.
1999 assert_eq!(crate::escape("fø*ø").into_owned(), "fø\\*ø");
2000 }
2001
2002 /*
2003 #[test]
2004 fn detect_backref() {
2005 assert_eq!(detect_possible_backref("a0a1a2"), false);
2006 assert_eq!(detect_possible_backref("a0a1\\a2"), false);
2007 assert_eq!(detect_possible_backref("a0a\\1a2"), true);
2008 assert_eq!(detect_possible_backref("a0a1a2\\"), false);
2009 }
2010 */
2011}