fancy_regex/lib.rs
1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*!
22An implementation of regexes, supporting a relatively rich set of features, including backreferences
23and lookaround.
24
It builds on top of the excellent [regex] crate. If you are not familiar with it, read its
documentation first; you may find that you don't need fancy-regex at all.
27
If your regex, or parts of it, do not use any special features, the matching is delegated to the
regex crate. That means it has linear runtime. But if you use "fancy" features such as
30backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex
31can be slow and take exponential time to run because of what is called "catastrophic backtracking".
32This depends on the regex and the input.
33
34# Usage
35
36The API should feel very similar to the regex crate, and involves compiling a regex and then using
37it to find matches in text.
38
39## Example: Matching text
40
41An example with backreferences to check if a text consists of two identical words:
42
43```rust
44use fancy_regex::Regex;
45
46let re = Regex::new(r"^(\w+) (\1)$").unwrap();
47let result = re.is_match("foo foo");
48
49assert!(result.is_ok());
50let did_match = result.unwrap();
51assert!(did_match);
52```
53
54Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the
55entire input text.
56
57## Example: Finding the position of matches
58
59```rust
60use fancy_regex::Regex;
61
62let re = Regex::new(r"(\d)\1").unwrap();
63let result = re.find("foo 22");
64
65assert!(result.is_ok(), "execution was successful");
66let match_option = result.unwrap();
67
68assert!(match_option.is_some(), "found a match");
69let m = match_option.unwrap();
70
71assert_eq!(m.start(), 4);
72assert_eq!(m.end(), 6);
73assert_eq!(m.as_str(), "22");
74```
75
76## Example: Capturing groups
77
78```rust
79use fancy_regex::Regex;
80
81let re = Regex::new(r"(?<!AU)\$(\d+)").unwrap();
82let result = re.captures("AU$10, $20");
83
84let captures = result.expect("Error running regex").expect("No match found");
85let group = captures.get(1).expect("No group");
86assert_eq!(group.as_str(), "20");
87```
88
89## Example: Splitting text
90
91```rust
92use fancy_regex::Regex;
93
94let re = Regex::new(r"[ \t]+").unwrap();
95let target = "a b \t c\td e";
96let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
97assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
98
99let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
100assert_eq!(fields, vec!["a", "b", "c\td e"]);
101```
102
103# Syntax
104
105The regex syntax is based on the [regex] crate's, with some additional supported syntax.
106
107Escapes:
108
109`\h`
110: hex digit (`[0-9A-Fa-f]`) \
111`\H`
112: not hex digit (`[^0-9A-Fa-f]`) \
113`\e`
114: escape control character (`\x1B`) \
115`\K`
116: keep text matched so far out of the overall match ([docs](https://www.regular-expressions.info/keep.html))\
117`\G`
118: anchor to where the previous match ended ([docs](https://www.regular-expressions.info/continue.html))\
119`\Z`
120: anchor to the end of the text before any trailing newlines\
121`\O`
122: any character including newline
123
124Backreferences:
125
126`\1`
127: match the exact string that the first capture group matched \
128`\2`
129: backref to the second capture group, etc
130
131Named capture groups:
132
133`(?<name>exp)`
134: match *exp*, creating capture group named *name* \
135`\k<name>`
136: match the exact string that the capture group named *name* matched \
137`(?P<name>exp)`
138: same as `(?<name>exp)` for compatibility with Python, etc. \
139`(?P=name)`
140: same as `\k<name>` for compatibility with Python, etc.
141
142Look-around assertions for matching without changing the current position:
143
144`(?=exp)`
145: look-ahead, succeeds if *exp* matches to the right of the current position \
146`(?!exp)`
147: negative look-ahead, succeeds if *exp* doesn't match to the right \
148`(?<=exp)`
149: look-behind, succeeds if *exp* matches to the left of the current position \
150`(?<!exp)`
151: negative look-behind, succeeds if *exp* doesn't match to the left
152
153Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
154
155```
156# use fancy_regex::Regex;
157let re = Regex::new(r"^a(?>bc|b)c$").unwrap();
158assert!(re.is_match("abcc").unwrap());
159// Doesn't match because `|b` is never tried because of the atomic group
160assert!(!re.is_match("abc").unwrap());
161```
162
163Conditionals - if/then/else:
164
165`(?(1))`
166: continue only if first capture group matched \
167`(?(<name>))`
168: continue only if capture group named *name* matched \
169`(?(1)true_branch|false_branch)`
170: if the first capture group matched then execute the true_branch regex expression, else execute false_branch ([docs](https://www.regular-expressions.info/conditional.html)) \
171`(?(condition)true_branch|false_branch)`
172: if the condition matches then execute the true_branch regex expression, else execute false_branch from the point just before the condition was evaluated
173
174[regex]: https://crates.io/crates/regex
175*/
176
177#![deny(missing_docs)]
178#![deny(missing_debug_implementations)]
179#![cfg_attr(not(feature = "std"), no_std)]
180
181extern crate alloc;
182
183use alloc::borrow::Cow;
184use alloc::boxed::Box;
185use alloc::string::{String, ToString};
186use alloc::sync::Arc;
187use alloc::vec;
188use alloc::vec::Vec;
189
190use core::convert::TryFrom;
191use core::fmt;
192use core::fmt::{Debug, Formatter};
193use core::ops::{Index, Range};
194use core::str::FromStr;
195use regex_automata::meta::Regex as RaRegex;
196use regex_automata::util::captures::Captures as RaCaptures;
197use regex_automata::util::syntax::Config as SyntaxConfig;
198use regex_automata::Input as RaInput;
199
200mod analyze;
201mod compile;
202mod error;
203mod expand;
204mod optimize;
205mod parse;
206mod parse_flags;
207mod replacer;
208mod vm;
209
210use crate::analyze::analyze;
211use crate::analyze::can_compile_as_anchored;
212use crate::compile::compile;
213use crate::optimize::optimize;
214use crate::parse::{ExprTree, NamedGroups, Parser};
215use crate::parse_flags::*;
216use crate::vm::{Prog, OPTION_SKIPPED_EMPTY_MATCH};
217
218pub use crate::error::{CompileError, Error, ParseError, Result, RuntimeError};
219pub use crate::expand::Expander;
220pub use crate::replacer::{NoExpand, Replacer, ReplacerRef};
221
// NOTE(review): not referenced in the visible part of this file; presumably an
// upper bound on recursion/nesting depth used by code elsewhere — confirm at the use sites.
const MAX_RECURSION: usize = 64;
223
224// the public API
225
/// A builder for a `Regex` to allow configuring options.
///
/// Start with [`RegexBuilder::new`], adjust settings through the chainable
/// setter methods, then call [`RegexBuilder::build`] to compile the regex.
/// The builder is a thin wrapper around the internal options struct.
#[derive(Debug)]
pub struct RegexBuilder(RegexOptions);
229
/// A compiled regular expression.
///
/// Internally this is either a pattern delegated to `regex-automata` or a
/// program for the backtracking VM; see `RegexImpl`.
#[derive(Clone)]
pub struct Regex {
    // The compiled form of the pattern (delegated or backtracking).
    inner: RegexImpl,
    // Named-group table taken from the parsed expression tree, stored behind an `Arc`.
    named_groups: Arc<NamedGroups>,
}
236
// Separate enum because we don't want to expose any of this
#[derive(Clone)]
enum RegexImpl {
    // Do we want to box this? It's pretty big...
    // Built by `Regex::new_options` when analysis reports the pattern as not
    // "hard": matching is delegated to the `regex-automata` meta regex.
    Wrap {
        // The delegate regex, compiled from the re-serialized pattern.
        inner: RaRegex,
        // Options the regex was built with; holds the original pattern text.
        options: RegexOptions,
        /// Some optimizations avoid the VM, but need to use an extra capture group to represent the match boundaries
        explicit_capture_group_0: bool,
        // The re-serialized pattern string that was handed to the delegate compiler.
        debug_pattern: String,
    },
    // Built by `Regex::new_options` when analysis reports the pattern as
    // "hard": matching runs the compiled program through `vm::run`.
    Fancy {
        // The compiled program.
        prog: Prog,
        // Taken from the analysis result's `end_group`.
        n_groups: usize,
        // Options the regex was built with; holds the original pattern text.
        options: RegexOptions,
    },
}
254
/// A single match of a regex or group in an input text
///
/// `'t` is the lifetime of the matched text.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct Match<'t> {
    // The full text that was searched (not just the matched part).
    text: &'t str,
    // Byte offset in `text` where the match begins.
    start: usize,
    // Byte offset in `text` where the match ends.
    end: usize,
}
262
/// An iterator over all non-overlapping matches for a particular string.
///
/// The iterator yields a `Result<Match>`. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
/// lifetime of the matched string.
#[derive(Debug)]
pub struct Matches<'r, 't> {
    // The regex being run.
    re: &'r Regex,
    // The text being searched.
    text: &'t str,
    // Byte offset at which the next search starts. Set to `text.len() + 1`
    // after an error so that every later call to `next` returns `None`.
    last_end: usize,
    // End offset of the most recently yielded match, if any; used to skip an
    // empty match that directly follows a previous match.
    last_match: Option<usize>,
}
277
278impl<'r, 't> Matches<'r, 't> {
279 /// Return the text being searched.
280 pub fn text(&self) -> &'t str {
281 self.text
282 }
283
284 /// Return the underlying regex.
285 pub fn regex(&self) -> &'r Regex {
286 self.re
287 }
288}
289
290impl<'r, 't> Iterator for Matches<'r, 't> {
291 type Item = Result<Match<'t>>;
292
293 /// Adapted from the `regex` crate. Calls `find_from_pos` repeatedly.
294 /// Ignores empty matches immediately after a match.
295 fn next(&mut self) -> Option<Self::Item> {
296 if self.last_end > self.text.len() {
297 return None;
298 }
299
300 let option_flags = if let Some(last_match) = self.last_match {
301 if self.last_end > last_match {
302 OPTION_SKIPPED_EMPTY_MATCH
303 } else {
304 0
305 }
306 } else {
307 0
308 };
309 let mat =
310 match self
311 .re
312 .find_from_pos_with_option_flags(self.text, self.last_end, option_flags)
313 {
314 Err(error) => {
315 // Stop on first error: If an error is encountered, return it, and set the "last match position"
316 // to the string length, so that the next next() call will return None, to prevent an infinite loop.
317 self.last_end = self.text.len() + 1;
318 return Some(Err(error));
319 }
320 Ok(None) => return None,
321 Ok(Some(mat)) => mat,
322 };
323
324 if mat.start == mat.end {
325 // This is an empty match. To ensure we make progress, start
326 // the next search at the smallest possible starting position
327 // of the next match following this one.
328 self.last_end = next_utf8(self.text, mat.end);
329 // Don't accept empty matches immediately following a match.
330 // Just move on to the next match.
331 if Some(mat.end) == self.last_match {
332 return self.next();
333 }
334 } else {
335 self.last_end = mat.end;
336 }
337
338 self.last_match = Some(mat.end);
339
340 Some(Ok(mat))
341 }
342}
343
/// An iterator that yields all non-overlapping capture groups matching a
/// particular regular expression.
///
/// The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'t` is the
/// lifetime of the matched string.
///
/// This wraps a [`Matches`] value and reuses its position-tracking state.
#[derive(Debug)]
pub struct CaptureMatches<'r, 't>(Matches<'r, 't>);
353
354impl<'r, 't> CaptureMatches<'r, 't> {
355 /// Return the text being searched.
356 pub fn text(&self) -> &'t str {
357 self.0.text
358 }
359
360 /// Return the underlying regex.
361 pub fn regex(&self) -> &'r Regex {
362 self.0.re
363 }
364}
365
366impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
367 type Item = Result<Captures<'t>>;
368
369 /// Adapted from the `regex` crate. Calls `captures_from_pos` repeatedly.
370 /// Ignores empty matches immediately after a match.
371 fn next(&mut self) -> Option<Self::Item> {
372 if self.0.last_end > self.0.text.len() {
373 return None;
374 }
375
376 let captures = match self.0.re.captures_from_pos(self.0.text, self.0.last_end) {
377 Err(error) => {
378 // Stop on first error: If an error is encountered, return it, and set the "last match position"
379 // to the string length, so that the next next() call will return None, to prevent an infinite loop.
380 self.0.last_end = self.0.text.len() + 1;
381 return Some(Err(error));
382 }
383 Ok(None) => return None,
384 Ok(Some(captures)) => captures,
385 };
386
387 let mat = captures
388 .get(0)
389 .expect("`Captures` is expected to have entire match at 0th position");
390 if mat.start == mat.end {
391 self.0.last_end = next_utf8(self.0.text, mat.end);
392 if Some(mat.end) == self.0.last_match {
393 return self.next();
394 }
395 } else {
396 self.0.last_end = mat.end;
397 }
398
399 self.0.last_match = Some(mat.end);
400
401 Some(Ok(captures))
402 }
403}
404
/// A set of capture groups found for a regex.
///
/// `'t` is the lifetime of the matched text.
#[derive(Debug)]
pub struct Captures<'t> {
    // Group positions, in the representation of whichever engine produced them.
    inner: CapturesImpl<'t>,
    // Named-group table, stored behind an `Arc`.
    // NOTE(review): presumably the same table as the originating `Regex` — confirm where `Captures` is built.
    named_groups: Arc<NamedGroups>,
}
411
// Engine-specific storage behind `Captures`; the two variant names mirror `RegexImpl`.
#[derive(Debug)]
enum CapturesImpl<'t> {
    // Capture locations as reported by `regex-automata`.
    Wrap {
        // The text that was searched.
        text: &'t str,
        // Group spans recorded by the delegate regex.
        locations: RaCaptures,
        /// Some optimizations avoid the VM but need an extra capture group to represent the match boundaries.
        /// Therefore what is actually capture group 1 should be treated as capture group 0, and all other
        /// capture groups should have their index reduced by one as well to line up with what the pattern specifies.
        explicit_capture_group_0: bool,
    },
    // Capture positions held as a flat vector of offsets.
    Fancy {
        // The text that was searched.
        text: &'t str,
        // NOTE(review): presumably the save slots returned by `vm::run`, where the
        // first two entries are the start and end of the overall match — confirm.
        saves: Vec<usize>,
    },
}
427
/// Iterator for captured groups in order in which they appear in the regex.
///
/// `'c` is the lifetime of the borrowed `Captures` and `'t` is the lifetime
/// of the matched text.
#[derive(Debug)]
pub struct SubCaptureMatches<'c, 't> {
    // The captures being iterated over.
    caps: &'c Captures<'t>,
    // NOTE(review): presumably the index of the next group to yield — the
    // iterator implementation is outside this part of the file; confirm.
    i: usize,
}
434
/// An iterator over all substrings delimited by a regex.
///
/// This iterator yields `Result<&'h str>`, where each item is a substring of the
/// target string that is delimited by matches of the regular expression. It stops when there
/// are no more substrings to yield.
///
/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
/// lifetime of the target string being split.
///
/// This iterator can be created by the [`Regex::split`] method.
#[derive(Debug)]
pub struct Split<'r, 'h> {
    // Iterator over the delimiter matches.
    matches: Matches<'r, 'h>,
    // Byte offset where the next substring begins. Set to `target.len() + 1`
    // once the final substring has been yielded.
    next_start: usize,
    // The string being split.
    target: &'h str,
}
451
452impl<'r, 'h> Iterator for Split<'r, 'h> {
453 type Item = Result<&'h str>;
454
455 /// Returns the next substring that results from splitting the target string by the regex.
456 ///
457 /// If no more matches are found, returns the remaining part of the string,
458 /// or `None` if all substrings have been yielded.
459 fn next(&mut self) -> Option<Result<&'h str>> {
460 match self.matches.next() {
461 None => {
462 let len = self.target.len();
463 if self.next_start > len {
464 // No more substrings to return
465 None
466 } else {
467 // Return the last part of the target string
468 // Next call will return None
469 let part = &self.target[self.next_start..len];
470 self.next_start = len + 1;
471 Some(Ok(part))
472 }
473 }
474 // Return the next substring
475 Some(Ok(m)) => {
476 let part = &self.target[self.next_start..m.start()];
477 self.next_start = m.end();
478 Some(Ok(part))
479 }
480 Some(Err(e)) => Some(Err(e)),
481 }
482 }
483}
484
485impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
486
/// An iterator over at most `N` substrings delimited by a regex.
///
/// This iterator yields `Result<&'h str>`, where each item is a substring of the
/// target that is delimited by matches of the regular expression. It stops either when
/// there are no more substrings to yield, or after `N` substrings have been yielded.
///
/// The `N`th substring is the remaining part of the target.
///
/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
/// lifetime of the target string being split.
///
/// This iterator can be created by the [`Regex::splitn`] method.
#[derive(Debug)]
pub struct SplitN<'r, 'h> {
    // The underlying unlimited splitter.
    splits: Split<'r, 'h>,
    // How many more substrings may still be yielded; decremented on each call to `next`.
    limit: usize,
}
504
505impl<'r, 'h> Iterator for SplitN<'r, 'h> {
506 type Item = Result<&'h str>;
507
508 /// Returns the next substring resulting from splitting the target by the regex,
509 /// limited to `N` splits.
510 ///
511 /// Returns `None` if no more matches are found or if the limit is reached after yielding
512 /// the remaining part of the target.
513 fn next(&mut self) -> Option<Result<&'h str>> {
514 if self.limit == 0 {
515 // Limit reached. No more substrings available.
516 return None;
517 }
518
519 // Decrement the limit for each split.
520 self.limit -= 1;
521 if self.limit > 0 {
522 return self.splits.next();
523 }
524
525 // Nth split
526 let len = self.splits.target.len();
527 if self.splits.next_start > len {
528 // No more substrings available.
529 None
530 } else {
531 // Return the remaining part of the target
532 let start = self.splits.next_start;
533 self.splits.next_start = len + 1;
534 Some(Ok(&self.splits.target[start..len]))
535 }
536 }
537
538 fn size_hint(&self) -> (usize, Option<usize>) {
539 (0, Some(self.limit))
540 }
541}
542
543impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
544
// Settings collected by `RegexBuilder` and consumed by `Regex::new_options`.
#[derive(Clone, Debug)]
struct RegexOptions {
    // The pattern text as supplied by the caller; this is what `Regex::as_str` returns.
    pattern: String,
    // Syntax flags (case-insensitive, multi-line, ignore-whitespace,
    // dot-matches-newline, unicode) kept in a `regex-automata` syntax config.
    syntaxc: SyntaxConfig,
    // Set through `RegexBuilder::backtrack_limit`; defaults to 1_000_000.
    backtrack_limit: usize,
    // Set through `RegexBuilder::delegate_size_limit`; `None` by default.
    delegate_size_limit: Option<usize>,
    // Set through `RegexBuilder::delegate_dfa_size_limit`; `None` by default.
    delegate_dfa_size_limit: Option<usize>,
    // Set through `RegexBuilder::oniguruma_mode`; `false` by default.
    oniguruma_mode: bool,
}
554
555impl RegexOptions {
556 fn get_flag_value(flag_value: bool, enum_value: u32) -> u32 {
557 if flag_value {
558 enum_value
559 } else {
560 0
561 }
562 }
563
564 fn compute_flags(&self) -> u32 {
565 let insensitive = Self::get_flag_value(self.syntaxc.get_case_insensitive(), FLAG_CASEI);
566 let multiline = Self::get_flag_value(self.syntaxc.get_multi_line(), FLAG_MULTI);
567 let whitespace =
568 Self::get_flag_value(self.syntaxc.get_ignore_whitespace(), FLAG_IGNORE_SPACE);
569 let dotnl = Self::get_flag_value(self.syntaxc.get_dot_matches_new_line(), FLAG_DOTNL);
570 let unicode = Self::get_flag_value(self.syntaxc.get_unicode(), FLAG_UNICODE);
571 let oniguruma_mode = Self::get_flag_value(self.oniguruma_mode, FLAG_ONIGURUMA_MODE);
572
573 insensitive | multiline | whitespace | dotnl | unicode | unicode | oniguruma_mode
574 }
575}
576
577impl Default for RegexOptions {
578 fn default() -> Self {
579 RegexOptions {
580 pattern: String::new(),
581 syntaxc: SyntaxConfig::default(),
582 backtrack_limit: 1_000_000,
583 delegate_size_limit: None,
584 delegate_dfa_size_limit: None,
585 oniguruma_mode: false,
586 }
587 }
588}
589
590impl RegexBuilder {
591 /// Create a new regex builder with a regex pattern.
592 ///
593 /// If the pattern is invalid, the call to `build` will fail later.
594 pub fn new(pattern: &str) -> Self {
595 let mut builder = RegexBuilder(RegexOptions::default());
596 builder.0.pattern = pattern.to_string();
597 builder
598 }
599
600 /// Build the `Regex`.
601 ///
602 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
603 pub fn build(&self) -> Result<Regex> {
604 Regex::new_options(self.0.clone())
605 }
606
607 fn set_config(&mut self, func: impl Fn(SyntaxConfig) -> SyntaxConfig) -> &mut Self {
608 self.0.syntaxc = func(self.0.syntaxc);
609 self
610 }
611
612 /// Override default case insensitive
613 /// this is to enable/disable casing via builder instead of a flag within
614 /// the raw string provided to the regex builder
615 ///
616 /// Default is false
617 pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
618 self.set_config(|x| x.case_insensitive(yes))
619 }
620
621 /// Enable multi-line regex
622 pub fn multi_line(&mut self, yes: bool) -> &mut Self {
623 self.set_config(|x| x.multi_line(yes))
624 }
625
626 /// Allow ignore whitespace
627 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
628 self.set_config(|x| x.ignore_whitespace(yes))
629 }
630
631 /// Enable or disable the "dot matches any character" flag.
632 /// When this is enabled, `.` will match any character. When it's disabled, then `.` will match any character
633 /// except for a new line character.
634 pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
635 self.set_config(|x| x.dot_matches_new_line(yes))
636 }
637
638 /// Enable verbose mode in the regular expression.
639 ///
640 /// The same as ignore_whitespace
641 ///
642 /// When enabled, verbose mode permits insigificant whitespace in many
643 /// places in the regular expression, as well as comments. Comments are
644 /// started using `#` and continue until the end of the line.
645 ///
646 /// By default, this is disabled. It may be selectively enabled in the
647 /// regular expression by using the `x` flag regardless of this setting.
648 pub fn verbose_mode(&mut self, yes: bool) -> &mut Self {
649 self.set_config(|x| x.ignore_whitespace(yes))
650 }
651
652 /// Enable or disable the Unicode flag (`u`) by default.
653 ///
654 /// By default this is **enabled**. It may alternatively be selectively
655 /// disabled in the regular expression itself via the `u` flag.
656 ///
657 /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
658 /// default), a regular expression will fail to parse if Unicode mode is
659 /// disabled and a sub-expression could possibly match invalid UTF-8.
660 ///
661 /// **WARNING**: Unicode mode can greatly increase the size of the compiled
662 /// DFA, which can noticeably impact both memory usage and compilation
663 /// time. This is especially noticeable if your regex contains character
664 /// classes like `\w` that are impacted by whether Unicode is enabled or
665 /// not. If Unicode is not necessary, you are encouraged to disable it.
666 pub fn unicode_mode(&mut self, yes: bool) -> &mut Self {
667 self.set_config(|x| x.unicode(yes))
668 }
669
670 /// Limit for how many times backtracking should be attempted for fancy regexes (where
671 /// backtracking is used). If this limit is exceeded, execution returns an error with
672 /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded).
673 /// This is for preventing a regex with catastrophic backtracking to run for too long.
674 ///
675 /// Default is `1_000_000` (1 million).
676 pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
677 self.0.backtrack_limit = limit;
678 self
679 }
680
681 /// Set the approximate size limit of the compiled regular expression.
682 ///
683 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
684 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
685 /// such the actual limit is closer to `<number of delegated regexes> * delegate_size_limit`.
686 pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
687 self.0.delegate_size_limit = Some(limit);
688 self
689 }
690
691 /// Set the approximate size of the cache used by the DFA.
692 ///
693 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
694 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
695 /// such the actual limit is closer to `<number of delegated regexes> *
696 /// delegate_dfa_size_limit`.
697 pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
698 self.0.delegate_dfa_size_limit = Some(limit);
699 self
700 }
701
702 /// Attempts to better match [Oniguruma](https://github.com/kkos/oniguruma)'s default behavior
703 ///
704 /// Currently this amounts to changing behavior with:
705 ///
706 /// # Left and right word bounds
707 ///
708 /// `fancy-regex` follows the default of other regex engines such as the `regex` crate itself
709 /// where `\<` and `\>` correspond to a _left_ and _right_ word-bound respectively. This
710 /// differs from Oniguruma's defaults which treat them as matching the literals `<` and `>`.
711 /// When this option is set using `\<` and `\>` in the pattern will match the literals
712 /// `<` and `>` instead of word bounds.
713 ///
714 /// ## Example
715 ///
716 /// ```
717 /// use fancy_regex::{Regex, RegexBuilder};
718 ///
719 /// let haystack = "turbo::<Fish>";
720 /// let regex = r"\<\w*\>";
721 ///
722 /// // By default `\<` and `\>` will match the start and end of a word boundary
723 /// let word_bounds_regex = Regex::new(regex).unwrap();
724 /// let word_bounds = word_bounds_regex.find(haystack).unwrap().unwrap();
725 /// assert_eq!(word_bounds.as_str(), "turbo");
726 ///
727 /// // With the option set they instead match the literal `<` and `>` characters
728 /// let literals_regex = RegexBuilder::new(regex).oniguruma_mode(true).build().unwrap();
729 /// let literals = literals_regex.find(haystack).unwrap().unwrap();
730 /// assert_eq!(literals.as_str(), "<Fish>");
731 /// ```
732 pub fn oniguruma_mode(&mut self, yes: bool) -> &mut Self {
733 self.0.oniguruma_mode = yes;
734 self
735 }
736}
737
738impl fmt::Debug for Regex {
739 /// Shows the original regular expression.
740 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
741 write!(f, "{}", self.as_str())
742 }
743}
744
745impl fmt::Display for Regex {
746 /// Shows the original regular expression
747 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
748 write!(f, "{}", self.as_str())
749 }
750}
751
752impl FromStr for Regex {
753 type Err = Error;
754
755 /// Attempts to parse a string into a regular expression
756 fn from_str(s: &str) -> Result<Regex> {
757 Regex::new(s)
758 }
759}
760
761impl Regex {
762 /// Parse and compile a regex with default options, see `RegexBuilder`.
763 ///
764 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
765 pub fn new(re: &str) -> Result<Regex> {
766 let options = RegexOptions {
767 pattern: re.to_string(),
768 ..RegexOptions::default()
769 };
770 Self::new_options(options)
771 }
772
773 fn new_options(options: RegexOptions) -> Result<Regex> {
774 let mut tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags())?;
775
776 // try to optimize the expression tree
777 let requires_capture_group_fixup = optimize(&mut tree);
778 let info = analyze(&tree, requires_capture_group_fixup)?;
779
780 if !info.hard {
781 // easy case, wrap regex
782
783 // we do our own to_str because escapes are different
784 // NOTE: there is a good opportunity here to use Hir to avoid regex-automata re-parsing it
785 let mut re_cooked = String::new();
786 tree.expr.to_str(&mut re_cooked, 0);
787 let inner = compile::compile_inner(&re_cooked, &options)?;
788 return Ok(Regex {
789 inner: RegexImpl::Wrap {
790 inner,
791 options: RegexOptions {
792 pattern: options.pattern,
793 ..options
794 },
795 explicit_capture_group_0: requires_capture_group_fixup,
796 debug_pattern: re_cooked,
797 },
798 named_groups: Arc::new(tree.named_groups),
799 });
800 }
801
802 let prog = compile(&info, can_compile_as_anchored(&tree.expr))?;
803 Ok(Regex {
804 inner: RegexImpl::Fancy {
805 prog,
806 n_groups: info.end_group,
807 options,
808 },
809 named_groups: Arc::new(tree.named_groups),
810 })
811 }
812
813 /// Returns the original string of this regex.
814 pub fn as_str(&self) -> &str {
815 match &self.inner {
816 RegexImpl::Wrap { options, .. } => &options.pattern,
817 RegexImpl::Fancy { options, .. } => &options.pattern,
818 }
819 }
820
821 /// Check if the regex matches the input text.
822 ///
823 /// # Example
824 ///
825 /// Test if some text contains the same word twice:
826 ///
827 /// ```rust
828 /// # use fancy_regex::Regex;
829 ///
830 /// let re = Regex::new(r"(\w+) \1").unwrap();
831 /// assert!(re.is_match("mirror mirror on the wall").unwrap());
832 /// ```
833 pub fn is_match(&self, text: &str) -> Result<bool> {
834 match &self.inner {
835 RegexImpl::Wrap { ref inner, .. } => Ok(inner.is_match(text)),
836 RegexImpl::Fancy {
837 ref prog, options, ..
838 } => {
839 let result = vm::run(prog, text, 0, 0, options)?;
840 Ok(result.is_some())
841 }
842 }
843 }
844
845 /// Returns an iterator for each successive non-overlapping match in `text`.
846 ///
847 /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures_iter()]
848 /// method.
849 ///
850 /// # Example
851 ///
852 /// Find all words followed by an exclamation point:
853 ///
854 /// ```rust
855 /// # use fancy_regex::Regex;
856 ///
857 /// let re = Regex::new(r"\w+(?=!)").unwrap();
858 /// let mut matches = re.find_iter("so fancy! even with! iterators!");
859 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "fancy");
860 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "with");
861 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "iterators");
862 /// assert!(matches.next().is_none());
863 /// ```
864 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
865 Matches {
866 re: self,
867 text,
868 last_end: 0,
869 last_match: None,
870 }
871 }
872
873 /// Find the first match in the input text.
874 ///
875 /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures()]
876 /// method.
877 ///
878 /// # Example
879 ///
880 /// Find a word that is followed by an exclamation point:
881 ///
882 /// ```rust
883 /// # use fancy_regex::Regex;
884 ///
885 /// let re = Regex::new(r"\w+(?=!)").unwrap();
886 /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy");
887 /// ```
888 pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
889 self.find_from_pos(text, 0)
890 }
891
892 /// Returns the first match in `text`, starting from the specified byte position `pos`.
893 ///
894 /// # Examples
895 ///
896 /// Finding match starting at a position:
897 ///
898 /// ```
899 /// # use fancy_regex::Regex;
900 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
901 /// let text = "1 test 123\n2 foo";
902 /// let mat = re.find_from_pos(text, 7).unwrap().unwrap();
903 ///
904 /// assert_eq!(mat.start(), 11);
905 /// assert_eq!(mat.end(), 12);
906 /// ```
907 ///
908 /// Note that in some cases this is not the same as using the `find`
909 /// method and passing a slice of the string, see [Regex::captures_from_pos()] for details.
910 pub fn find_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Match<'t>>> {
911 self.find_from_pos_with_option_flags(text, pos, 0)
912 }
913
914 fn find_from_pos_with_option_flags<'t>(
915 &self,
916 text: &'t str,
917 pos: usize,
918 option_flags: u32,
919 ) -> Result<Option<Match<'t>>> {
920 match &self.inner {
921 RegexImpl::Wrap {
922 inner,
923 explicit_capture_group_0,
924 ..
925 } => {
926 if !*explicit_capture_group_0 {
927 Ok(inner
928 .search(&RaInput::new(text).span(pos..text.len()))
929 .map(|m| Match::new(text, m.start(), m.end())))
930 } else {
931 let mut locations = inner.create_captures();
932 inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
933 Ok(locations.is_match().then(|| {
934 Match::new(
935 text,
936 locations.get_group(1).unwrap().start,
937 locations.get_group(1).unwrap().end,
938 )
939 }))
940 }
941 }
942 RegexImpl::Fancy { prog, options, .. } => {
943 let result = vm::run(prog, text, pos, option_flags, options)?;
944 Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
945 }
946 }
947 }
948
949 /// Returns an iterator over all the non-overlapping capture groups matched in `text`.
950 ///
951 /// # Examples
952 ///
953 /// Finding all matches and capturing parts of each:
954 ///
955 /// ```rust
956 /// # use fancy_regex::Regex;
957 ///
958 /// let re = Regex::new(r"(\d{4})-(\d{2})").unwrap();
959 /// let text = "It was between 2018-04 and 2020-01";
960 /// let mut all_captures = re.captures_iter(text);
961 ///
962 /// let first = all_captures.next().unwrap().unwrap();
963 /// assert_eq!(first.get(1).unwrap().as_str(), "2018");
964 /// assert_eq!(first.get(2).unwrap().as_str(), "04");
965 /// assert_eq!(first.get(0).unwrap().as_str(), "2018-04");
966 ///
967 /// let second = all_captures.next().unwrap().unwrap();
968 /// assert_eq!(second.get(1).unwrap().as_str(), "2020");
969 /// assert_eq!(second.get(2).unwrap().as_str(), "01");
970 /// assert_eq!(second.get(0).unwrap().as_str(), "2020-01");
971 ///
972 /// assert!(all_captures.next().is_none());
973 /// ```
974 pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CaptureMatches<'r, 't> {
975 CaptureMatches(self.find_iter(text))
976 }
977
978 /// Returns the capture groups for the first match in `text`.
979 ///
980 /// If no match is found, then `Ok(None)` is returned.
981 ///
982 /// # Examples
983 ///
984 /// Finding matches and capturing parts of the match:
985 ///
986 /// ```rust
987 /// # use fancy_regex::Regex;
988 ///
989 /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
990 /// let text = "The date was 2018-04-07";
991 /// let captures = re.captures(text).unwrap().unwrap();
992 ///
993 /// assert_eq!(captures.get(1).unwrap().as_str(), "2018");
994 /// assert_eq!(captures.get(2).unwrap().as_str(), "04");
995 /// assert_eq!(captures.get(3).unwrap().as_str(), "07");
996 /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07");
997 /// ```
998 pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
999 self.captures_from_pos(text, 0)
1000 }
1001
1002 /// Returns the capture groups for the first match in `text`, starting from
1003 /// the specified byte position `pos`.
1004 ///
1005 /// # Examples
1006 ///
1007 /// Finding captures starting at a position:
1008 ///
1009 /// ```
1010 /// # use fancy_regex::Regex;
1011 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
1012 /// let text = "1 test 123\n2 foo";
1013 /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap();
1014 ///
1015 /// let group = captures.get(1).unwrap();
1016 /// assert_eq!(group.as_str(), "2");
1017 /// assert_eq!(group.start(), 11);
1018 /// assert_eq!(group.end(), 12);
1019 /// ```
1020 ///
1021 /// Note that in some cases this is not the same as using the `captures`
1022 /// method and passing a slice of the string, see the capture that we get
1023 /// when we do this:
1024 ///
1025 /// ```
1026 /// # use fancy_regex::Regex;
1027 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
1028 /// let text = "1 test 123\n2 foo";
1029 /// let captures = re.captures(&text[7..]).unwrap().unwrap();
1030 /// assert_eq!(captures.get(1).unwrap().as_str(), "123");
1031 /// ```
1032 ///
1033 /// This matched the number "123" because it's at the beginning of the text
1034 /// of the string slice.
1035 ///
1036 pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
1037 let named_groups = self.named_groups.clone();
1038 match &self.inner {
1039 RegexImpl::Wrap {
1040 inner,
1041 explicit_capture_group_0,
1042 ..
1043 } => {
1044 let mut locations = inner.create_captures();
1045 inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
1046 if locations.is_match() {
1047 Ok(Some(Captures {
1048 inner: CapturesImpl::Wrap {
1049 text,
1050 locations,
1051 explicit_capture_group_0: *explicit_capture_group_0,
1052 },
1053 named_groups,
1054 }))
1055 } else {
1056 Ok(None)
1057 }
1058 }
1059 RegexImpl::Fancy {
1060 prog,
1061 n_groups,
1062 options,
1063 ..
1064 } => {
1065 let result = vm::run(prog, text, pos, 0, options)?;
1066 Ok(result.map(|mut saves| {
1067 saves.truncate(n_groups * 2);
1068 Captures {
1069 inner: CapturesImpl::Fancy { text, saves },
1070 named_groups,
1071 }
1072 }))
1073 }
1074 }
1075 }
1076
1077 /// Returns the number of captures, including the implicit capture of the entire expression.
1078 pub fn captures_len(&self) -> usize {
1079 match &self.inner {
1080 RegexImpl::Wrap {
1081 inner,
1082 explicit_capture_group_0,
1083 ..
1084 } => inner.captures_len() - if *explicit_capture_group_0 { 1 } else { 0 },
1085 RegexImpl::Fancy { n_groups, .. } => *n_groups,
1086 }
1087 }
1088
1089 /// Returns an iterator over the capture names.
1090 pub fn capture_names(&self) -> CaptureNames<'_> {
1091 let mut names = Vec::new();
1092 names.resize(self.captures_len(), None);
1093 for (name, &i) in self.named_groups.iter() {
1094 names[i] = Some(name.as_str());
1095 }
1096 CaptureNames(names.into_iter())
1097 }
1098
1099 // for debugging only
1100 #[doc(hidden)]
1101 pub fn debug_print(&self, writer: &mut Formatter<'_>) -> fmt::Result {
1102 match &self.inner {
1103 RegexImpl::Wrap {
1104 debug_pattern,
1105 explicit_capture_group_0,
1106 ..
1107 } => {
1108 write!(
1109 writer,
1110 "wrapped Regex {:?}, explicit_capture_group_0: {:}",
1111 debug_pattern, *explicit_capture_group_0
1112 )
1113 }
1114 RegexImpl::Fancy { prog, .. } => prog.debug_print(writer),
1115 }
1116 }
1117
1118 /// Replaces the leftmost-first match with the replacement provided.
1119 /// The replacement can be a regular string (where `$N` and `$name` are
1120 /// expanded to match capture groups) or a function that takes the matches'
1121 /// `Captures` and returns the replaced string.
1122 ///
1123 /// If no match is found, then a copy of the string is returned unchanged.
1124 ///
1125 /// # Replacement string syntax
1126 ///
1127 /// All instances of `$name` in the replacement text is replaced with the
1128 /// corresponding capture group `name`.
1129 ///
1130 /// `name` may be an integer corresponding to the index of the
1131 /// capture group (counted by order of opening parenthesis where `0` is the
1132 /// entire match) or it can be a name (consisting of letters, digits or
1133 /// underscores) corresponding to a named capture group.
1134 ///
1135 /// If `name` isn't a valid capture group (whether the name doesn't exist
1136 /// or isn't a valid index), then it is replaced with the empty string.
1137 ///
1138 /// The longest possible name is used. e.g., `$1a` looks up the capture
1139 /// group named `1a` and not the capture group at index `1`. To exert more
1140 /// precise control over the name, use braces, e.g., `${1}a`.
1141 ///
1142 /// To write a literal `$` use `$$`.
1143 ///
1144 /// # Examples
1145 ///
1146 /// Note that this function is polymorphic with respect to the replacement.
1147 /// In typical usage, this can just be a normal string:
1148 ///
1149 /// ```rust
1150 /// # use fancy_regex::Regex;
1151 /// let re = Regex::new("[^01]+").unwrap();
1152 /// assert_eq!(re.replace("1078910", ""), "1010");
1153 /// ```
1154 ///
1155 /// But anything satisfying the `Replacer` trait will work. For example,
1156 /// a closure of type `|&Captures| -> String` provides direct access to the
1157 /// captures corresponding to a match. This allows one to access
1158 /// capturing group matches easily:
1159 ///
1160 /// ```rust
1161 /// # use fancy_regex::{Regex, Captures};
1162 /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
1163 /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
1164 /// format!("{} {}", &caps[2], &caps[1])
1165 /// });
1166 /// assert_eq!(result, "Bruce Springsteen");
1167 /// ```
1168 ///
1169 /// But this is a bit cumbersome to use all the time. Instead, a simple
1170 /// syntax is supported that expands `$name` into the corresponding capture
1171 /// group. Here's the last example, but using this expansion technique
1172 /// with named capture groups:
1173 ///
1174 /// ```rust
1175 /// # use fancy_regex::Regex;
1176 /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
1177 /// let result = re.replace("Springsteen, Bruce", "$first $last");
1178 /// assert_eq!(result, "Bruce Springsteen");
1179 /// ```
1180 ///
1181 /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
1182 /// would produce the same result. To write a literal `$` use `$$`.
1183 ///
1184 /// Sometimes the replacement string requires use of curly braces to
1185 /// delineate a capture group replacement and surrounding literal text.
1186 /// For example, if we wanted to join two words together with an
1187 /// underscore:
1188 ///
1189 /// ```rust
1190 /// # use fancy_regex::Regex;
1191 /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
1192 /// let result = re.replace("deep fried", "${first}_$second");
1193 /// assert_eq!(result, "deep_fried");
1194 /// ```
1195 ///
1196 /// Without the curly braces, the capture group name `first_` would be
1197 /// used, and since it doesn't exist, it would be replaced with the empty
1198 /// string.
1199 ///
1200 /// Finally, sometimes you just want to replace a literal string with no
1201 /// regard for capturing group expansion. This can be done by wrapping a
1202 /// byte string with `NoExpand`:
1203 ///
1204 /// ```rust
1205 /// # use fancy_regex::Regex;
1206 /// use fancy_regex::NoExpand;
1207 ///
1208 /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
1209 /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
1210 /// assert_eq!(result, "$2 $last");
1211 /// ```
1212 pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1213 self.replacen(text, 1, rep)
1214 }
1215
1216 /// Replaces all non-overlapping matches in `text` with the replacement
1217 /// provided. This is the same as calling `replacen` with `limit` set to
1218 /// `0`.
1219 ///
1220 /// See the documentation for `replace` for details on how to access
1221 /// capturing group matches in the replacement string.
1222 pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1223 self.replacen(text, 0, rep)
1224 }
1225
1226 /// Replaces at most `limit` non-overlapping matches in `text` with the
1227 /// replacement provided. If `limit` is 0, then all non-overlapping matches
1228 /// are replaced.
1229 ///
1230 /// Will panic if any errors are encountered. Use `try_replacen`, which this
1231 /// function unwraps, if you want to handle errors.
1232 ///
1233 /// See the documentation for `replace` for details on how to access
1234 /// capturing group matches in the replacement string.
1235 ///
1236 pub fn replacen<'t, R: Replacer>(&self, text: &'t str, limit: usize, rep: R) -> Cow<'t, str> {
1237 self.try_replacen(text, limit, rep).unwrap()
1238 }
1239
1240 /// Replaces at most `limit` non-overlapping matches in `text` with the
1241 /// replacement provided. If `limit` is 0, then all non-overlapping matches
1242 /// are replaced.
1243 ///
1244 /// Propagates any errors encountered, such as `RuntimeError::BacktrackLimitExceeded`.
1245 ///
1246 /// See the documentation for `replace` for details on how to access
1247 /// capturing group matches in the replacement string.
1248 pub fn try_replacen<'t, R: Replacer>(
1249 &self,
1250 text: &'t str,
1251 limit: usize,
1252 mut rep: R,
1253 ) -> Result<Cow<'t, str>> {
1254 // If we know that the replacement doesn't have any capture expansions,
1255 // then we can fast path. The fast path can make a tremendous
1256 // difference:
1257 //
1258 // 1) We use `find_iter` instead of `captures_iter`. Not asking for
1259 // captures generally makes the regex engines faster.
1260 // 2) We don't need to look up all of the capture groups and do
1261 // replacements inside the replacement string. We just push it
1262 // at each match and be done with it.
1263 if let Some(rep) = rep.no_expansion() {
1264 let mut it = self.find_iter(text).enumerate().peekable();
1265 if it.peek().is_none() {
1266 return Ok(Cow::Borrowed(text));
1267 }
1268 let mut new = String::with_capacity(text.len());
1269 let mut last_match = 0;
1270 for (i, m) in it {
1271 let m = m?;
1272
1273 if limit > 0 && i >= limit {
1274 break;
1275 }
1276 new.push_str(&text[last_match..m.start()]);
1277 new.push_str(&rep);
1278 last_match = m.end();
1279 }
1280 new.push_str(&text[last_match..]);
1281 return Ok(Cow::Owned(new));
1282 }
1283
1284 // The slower path, which we use if the replacement needs access to
1285 // capture groups.
1286 let mut it = self.captures_iter(text).enumerate().peekable();
1287 if it.peek().is_none() {
1288 return Ok(Cow::Borrowed(text));
1289 }
1290 let mut new = String::with_capacity(text.len());
1291 let mut last_match = 0;
1292 for (i, cap) in it {
1293 let cap = cap?;
1294
1295 if limit > 0 && i >= limit {
1296 break;
1297 }
1298 // unwrap on 0 is OK because captures only reports matches
1299 let m = cap.get(0).unwrap();
1300 new.push_str(&text[last_match..m.start()]);
1301 rep.replace_append(&cap, &mut new);
1302 last_match = m.end();
1303 }
1304 new.push_str(&text[last_match..]);
1305 Ok(Cow::Owned(new))
1306 }
1307
1308 /// Splits the string by matches of the regex.
1309 ///
1310 /// Returns an iterator over the substrings of the target string
1311 /// that *aren't* matched by the regex.
1312 ///
1313 /// # Example
1314 ///
1315 /// To split a string delimited by arbitrary amounts of spaces or tabs:
1316 ///
1317 /// ```rust
1318 /// # use fancy_regex::Regex;
1319 /// let re = Regex::new(r"[ \t]+").unwrap();
1320 /// let target = "a b \t c\td e";
1321 /// let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
1322 /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
1323 /// ```
1324 pub fn split<'r, 'h>(&'r self, target: &'h str) -> Split<'r, 'h> {
1325 Split {
1326 matches: self.find_iter(target),
1327 next_start: 0,
1328 target,
1329 }
1330 }
1331
1332 /// Splits the string by matches of the regex at most `limit` times.
1333 ///
1334 /// Returns an iterator over the substrings of the target string
1335 /// that *aren't* matched by the regex.
1336 ///
1337 /// The `N`th substring is the remaining part of the target.
1338 ///
1339 /// # Example
1340 ///
1341 /// To split a string delimited by arbitrary amounts of spaces or tabs
1342 /// 3 times:
1343 ///
1344 /// ```rust
1345 /// # use fancy_regex::Regex;
1346 /// let re = Regex::new(r"[ \t]+").unwrap();
1347 /// let target = "a b \t c\td e";
1348 /// let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
1349 /// assert_eq!(fields, vec!["a", "b", "c\td e"]);
1350 /// ```
1351 pub fn splitn<'r, 'h>(&'r self, target: &'h str, limit: usize) -> SplitN<'r, 'h> {
1352 SplitN {
1353 splits: self.split(target),
1354 limit,
1355 }
1356 }
1357}
1358
1359impl TryFrom<&str> for Regex {
1360 type Error = Error;
1361
1362 /// Attempts to parse a string into a regular expression
1363 fn try_from(s: &str) -> Result<Self> {
1364 Self::new(s)
1365 }
1366}
1367
1368impl TryFrom<String> for Regex {
1369 type Error = Error;
1370
1371 /// Attempts to parse a string into a regular expression
1372 fn try_from(s: String) -> Result<Self> {
1373 Self::new(&s)
1374 }
1375}
1376
1377impl<'t> Match<'t> {
1378 /// Returns the starting byte offset of the match in the text.
1379 #[inline]
1380 pub fn start(&self) -> usize {
1381 self.start
1382 }
1383
1384 /// Returns the ending byte offset of the match in the text.
1385 #[inline]
1386 pub fn end(&self) -> usize {
1387 self.end
1388 }
1389
1390 /// Returns the range over the starting and ending byte offsets of the match in text.
1391 #[inline]
1392 pub fn range(&self) -> Range<usize> {
1393 self.start..self.end
1394 }
1395
1396 /// Returns the matched text.
1397 #[inline]
1398 pub fn as_str(&self) -> &'t str {
1399 &self.text[self.start..self.end]
1400 }
1401
1402 /// Creates a new match from the given text and byte offsets.
1403 fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
1404 Match { text, start, end }
1405 }
1406}
1407
1408impl<'t> From<Match<'t>> for &'t str {
1409 fn from(m: Match<'t>) -> &'t str {
1410 m.as_str()
1411 }
1412}
1413
1414impl<'t> From<Match<'t>> for Range<usize> {
1415 fn from(m: Match<'t>) -> Range<usize> {
1416 m.range()
1417 }
1418}
1419
1420#[allow(clippy::len_without_is_empty)] // follow regex's API
1421impl<'t> Captures<'t> {
1422 /// Get the capture group by its index in the regex.
1423 ///
1424 /// If there is no match for that group or the index does not correspond to a group, `None` is
1425 /// returned. The index 0 returns the whole match.
1426 pub fn get(&self, i: usize) -> Option<Match<'t>> {
1427 match &self.inner {
1428 CapturesImpl::Wrap {
1429 text,
1430 locations,
1431 explicit_capture_group_0,
1432 } => locations
1433 .get_group(i + if *explicit_capture_group_0 { 1 } else { 0 })
1434 .map(|span| Match {
1435 text,
1436 start: span.start,
1437 end: span.end,
1438 }),
1439 CapturesImpl::Fancy { text, ref saves } => {
1440 let slot = i * 2;
1441 if slot >= saves.len() {
1442 return None;
1443 }
1444 let lo = saves[slot];
1445 if lo == usize::MAX {
1446 return None;
1447 }
1448 let hi = saves[slot + 1];
1449 Some(Match {
1450 text,
1451 start: lo,
1452 end: hi,
1453 })
1454 }
1455 }
1456 }
1457
1458 /// Returns the match for a named capture group. Returns `None` the capture
1459 /// group did not match or if there is no group with the given name.
1460 pub fn name(&self, name: &str) -> Option<Match<'t>> {
1461 self.named_groups.get(name).and_then(|i| self.get(*i))
1462 }
1463
1464 /// Expands all instances of `$group` in `replacement` to the corresponding
1465 /// capture group `name`, and writes them to the `dst` buffer given.
1466 ///
1467 /// `group` may be an integer corresponding to the index of the
1468 /// capture group (counted by order of opening parenthesis where `\0` is the
1469 /// entire match) or it can be a name (consisting of letters, digits or
1470 /// underscores) corresponding to a named capture group.
1471 ///
1472 /// If `group` isn't a valid capture group (whether the name doesn't exist
1473 /// or isn't a valid index), then it is replaced with the empty string.
1474 ///
1475 /// The longest possible name is used. e.g., `$1a` looks up the capture
1476 /// group named `1a` and not the capture group at index `1`. To exert more
1477 /// precise control over the name, use braces, e.g., `${1}a`.
1478 ///
1479 /// To write a literal `$`, use `$$`.
1480 ///
1481 /// For more control over expansion, see [`Expander`].
1482 ///
1483 /// [`Expander`]: expand/struct.Expander.html
1484 pub fn expand(&self, replacement: &str, dst: &mut String) {
1485 Expander::default().append_expansion(dst, replacement, self);
1486 }
1487
1488 /// Iterate over the captured groups in order in which they appeared in the regex. The first
1489 /// capture corresponds to the whole match.
1490 pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
1491 SubCaptureMatches { caps: self, i: 0 }
1492 }
1493
1494 /// How many groups were captured. This is always at least 1 because group 0 returns the whole
1495 /// match.
1496 pub fn len(&self) -> usize {
1497 match &self.inner {
1498 CapturesImpl::Wrap {
1499 locations,
1500 explicit_capture_group_0,
1501 ..
1502 } => locations.group_len() - if *explicit_capture_group_0 { 1 } else { 0 },
1503 CapturesImpl::Fancy { saves, .. } => saves.len() / 2,
1504 }
1505 }
1506}
1507
1508/// Get a group by index.
1509///
1510/// `'t` is the lifetime of the matched text.
1511///
1512/// The text can't outlive the `Captures` object if this method is
1513/// used, because of how `Index` is defined (normally `a[i]` is part
1514/// of `a` and can't outlive it); to do that, use `get()` instead.
1515///
1516/// # Panics
1517///
1518/// If there is no group at the given index.
1519impl<'t> Index<usize> for Captures<'t> {
1520 type Output = str;
1521
1522 fn index(&self, i: usize) -> &str {
1523 self.get(i)
1524 .map(|m| m.as_str())
1525 .unwrap_or_else(|| panic!("no group at index '{}'", i))
1526 }
1527}
1528
1529/// Get a group by name.
1530///
1531/// `'t` is the lifetime of the matched text and `'i` is the lifetime
1532/// of the group name (the index).
1533///
1534/// The text can't outlive the `Captures` object if this method is
1535/// used, because of how `Index` is defined (normally `a[i]` is part
1536/// of `a` and can't outlive it); to do that, use `name` instead.
1537///
1538/// # Panics
1539///
1540/// If there is no group named by the given value.
1541impl<'t, 'i> Index<&'i str> for Captures<'t> {
1542 type Output = str;
1543
1544 fn index<'a>(&'a self, name: &'i str) -> &'a str {
1545 self.name(name)
1546 .map(|m| m.as_str())
1547 .unwrap_or_else(|| panic!("no group named '{}'", name))
1548 }
1549}
1550
1551impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
1552 type Item = Option<Match<'t>>;
1553
1554 fn next(&mut self) -> Option<Option<Match<'t>>> {
1555 if self.i < self.caps.len() {
1556 let result = self.caps.get(self.i);
1557 self.i += 1;
1558 Some(result)
1559 } else {
1560 None
1561 }
1562 }
1563}
1564
1565// TODO: might be nice to implement ExactSizeIterator etc for SubCaptures
1566
1567/// Regular expression AST. This is public for now but may change.
1568#[derive(Debug, PartialEq, Eq, Clone)]
1569pub enum Expr {
1570 /// An empty expression, e.g. the last branch in `(a|b|)`
1571 Empty,
1572 /// Any character, regex `.`
1573 Any {
1574 /// Whether it also matches newlines or not
1575 newline: bool,
1576 },
1577 /// An assertion
1578 Assertion(Assertion),
1579 /// The string as a literal, e.g. `a`
1580 Literal {
1581 /// The string to match
1582 val: String,
1583 /// Whether match is case-insensitive or not
1584 casei: bool,
1585 },
1586 /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of
1587 /// the literal `a` and `.` for any character
1588 Concat(Vec<Expr>),
1589 /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative
1590 /// where either the literal `a` or `b` must match
1591 Alt(Vec<Expr>),
1592 /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures"
1593 /// (remembers) the match
1594 Group(Box<Expr>),
1595 /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g.
1596 /// `(?=a)` means the next character must be `a` (but the match is not consumed)
1597 LookAround(Box<Expr>, LookAround),
1598 /// Repeat of an expression, e.g. `a*` or `a+` or `a{1,3}`
1599 Repeat {
1600 /// The expression that is being repeated
1601 child: Box<Expr>,
1602 /// The minimum number of repetitions
1603 lo: usize,
1604 /// The maximum number of repetitions (or `usize::MAX`)
1605 hi: usize,
1606 /// Greedy means as much as possible is matched, e.g. `.*b` would match all of `abab`.
1607 /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`.
1608 greedy: bool,
1609 },
1610 /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have
1611 /// to represent all the expressions in the AST, e.g. character classes.
1612 Delegate {
1613 /// The regex
1614 inner: String,
1615 /// How many characters the regex matches
1616 size: usize, // TODO: move into analysis result
1617 /// Whether the matching is case-insensitive or not
1618 casei: bool,
1619 },
1620 /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group
1621 /// and the whole regex matches either `abcabc` or `defdef`.
1622 Backref {
1623 /// The capture group number being referenced
1624 group: usize,
1625 /// Whether the matching is case-insensitive or not
1626 casei: bool,
1627 },
1628 /// Back reference to a capture group at the given specified relative recursion level.
1629 BackrefWithRelativeRecursionLevel {
1630 /// The capture group number being referenced
1631 group: usize,
1632 /// Relative recursion level
1633 relative_level: isize,
1634 /// Whether the matching is case-insensitive or not
1635 casei: bool,
1636 },
1637 /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and
1638 /// never backtrack and try `a`, even if matching fails after the atomic group.
1639 AtomicGroup(Box<Expr>),
1640 /// Keep matched text so far out of overall match
1641 KeepOut,
1642 /// Anchor to match at the position where the previous match ended
1643 ContinueFromPreviousMatchEnd,
1644 /// Conditional expression based on whether the numbered capture group matched or not
1645 BackrefExistsCondition(usize),
1646 /// If/Then/Else Condition. If there is no Then/Else, these will just be empty expressions.
1647 Conditional {
1648 /// The conditional expression to evaluate
1649 condition: Box<Expr>,
1650 /// What to execute if the condition is true
1651 true_branch: Box<Expr>,
1652 /// What to execute if the condition is false
1653 false_branch: Box<Expr>,
1654 },
1655 /// Subroutine call to the specified group number
1656 SubroutineCall(usize),
1657 /// Unresolved subroutine call to the specified group name
1658 UnresolvedNamedSubroutineCall {
1659 /// The capture group name
1660 name: String,
1661 /// The position in the original regex pattern where the subroutine call is made
1662 ix: usize,
1663 },
1664}
1665
/// The kind of look-around assertion carried by a look-around expression.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum LookAround {
    /// Positive look-ahead, e.g. `(?=a)`
    LookAhead,
    /// Negative look-ahead, e.g. `(?!a)`
    LookAheadNeg,
    /// Positive look-behind, e.g. `(?<=a)`
    LookBehind,
    /// Negative look-behind, e.g. `(?<!a)`
    LookBehindNeg,
}
1678
/// An iterator over the capture names of a [Regex].
///
/// Each item is the name of a group, or [None] for a group without a
/// name. The first item is always [None], because capture group 0
/// cannot have a name.
pub struct CaptureNames<'r>(vec::IntoIter<Option<&'r str>>);
1684
1685impl Debug for CaptureNames<'_> {
1686 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
1687 f.write_str("<CaptureNames>")
1688 }
1689}
1690
1691impl<'r> Iterator for CaptureNames<'r> {
1692 type Item = Option<&'r str>;
1693
1694 fn next(&mut self) -> Option<Self::Item> {
1695 self.0.next()
1696 }
1697}
1698
// Appends the decimal representation of `x` to `s`. Hand-written because it
// is very cheap for the common single-digit case.
fn push_usize(s: &mut String, x: usize) {
    let leading = x / 10;
    let last_digit = (x % 10) as u8;
    // Emit the higher-order digits first, if there are any.
    if leading > 0 {
        push_usize(s, leading);
    }
    s.push(char::from(b'0' + last_digit));
}
1709
// Whether `c` is one of the characters that `push_quoted` escapes with a
// backslash.
fn is_special(c: char) -> bool {
    matches!(
        c,
        '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$' | '#'
    )
}
1717
1718fn push_quoted(buf: &mut String, s: &str) {
1719 for c in s.chars() {
1720 if is_special(c) {
1721 buf.push('\\');
1722 }
1723 buf.push(c);
1724 }
1725}
1726
1727/// Escapes special characters in `text` with '\\'. Returns a string which, when interpreted
1728/// as a regex, matches exactly `text`.
1729pub fn escape(text: &str) -> Cow<'_, str> {
1730 // Using bytes() is OK because all special characters are single bytes.
1731 match text.bytes().filter(|&b| is_special(b as char)).count() {
1732 0 => Cow::Borrowed(text),
1733 n => {
1734 // The capacity calculation is exact because '\\' is a single byte.
1735 let mut buf = String::with_capacity(text.len() + n);
1736 push_quoted(&mut buf, text);
1737 Cow::Owned(buf)
1738 }
1739 }
1740}
1741
/// The kinds of assertion an expression can contain
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Assertion {
    /// The start of the input text
    StartText,
    /// The end of the input text
    EndText,
    /// The start of a line
    StartLine {
        /// Whether CRLF mode is on
        crlf: bool,
    },
    /// The end of a line
    EndLine {
        /// Whether CRLF mode is on
        crlf: bool,
    },
    /// A left word boundary
    LeftWordBoundary,
    /// A right word boundary
    RightWordBoundary,
    /// A word boundary on either side
    WordBoundary,
    /// Not a word boundary
    NotWordBoundary,
}
1768
1769impl Assertion {
1770 pub(crate) fn is_hard(&self) -> bool {
1771 use Assertion::*;
1772 matches!(
1773 self,
1774 // these will make regex-automata use PikeVM
1775 LeftWordBoundary | RightWordBoundary | WordBoundary | NotWordBoundary
1776 )
1777 }
1778}
1779
1780impl Expr {
1781 /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
1782 /// that are referenced by backrefs.
1783 pub fn parse_tree(re: &str) -> Result<ExprTree> {
1784 Parser::parse(re)
1785 }
1786
1787 /// Parse the regex and return an expression (AST)
1788 /// Flags should be bit based based on flags
1789 pub fn parse_tree_with_flags(re: &str, flags: u32) -> Result<ExprTree> {
1790 Parser::parse_with_flags(re, flags)
1791 }
1792
1793 /// Convert expression to a regex string in the regex crate's syntax.
1794 ///
1795 /// # Panics
1796 ///
1797 /// Panics for expressions that are hard, i.e. can not be handled by the regex crate.
1798 pub fn to_str(&self, buf: &mut String, precedence: u8) {
1799 match *self {
1800 Expr::Empty => (),
1801 Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
1802 Expr::Literal { ref val, casei } => {
1803 if casei {
1804 buf.push_str("(?i:");
1805 }
1806 push_quoted(buf, val);
1807 if casei {
1808 buf.push(')');
1809 }
1810 }
1811 Expr::Assertion(Assertion::StartText) => buf.push('^'),
1812 Expr::Assertion(Assertion::EndText) => buf.push('$'),
1813 Expr::Assertion(Assertion::StartLine { crlf: false }) => buf.push_str("(?m:^)"),
1814 Expr::Assertion(Assertion::EndLine { crlf: false }) => buf.push_str("(?m:$)"),
1815 Expr::Assertion(Assertion::StartLine { crlf: true }) => buf.push_str("(?Rm:^)"),
1816 Expr::Assertion(Assertion::EndLine { crlf: true }) => buf.push_str("(?Rm:$)"),
1817 Expr::Concat(ref children) => {
1818 if precedence > 1 {
1819 buf.push_str("(?:");
1820 }
1821 for child in children {
1822 child.to_str(buf, 2);
1823 }
1824 if precedence > 1 {
1825 buf.push(')')
1826 }
1827 }
1828 Expr::Alt(ref children) => {
1829 if precedence > 0 {
1830 buf.push_str("(?:");
1831 }
1832 for (i, child) in children.iter().enumerate() {
1833 if i != 0 {
1834 buf.push('|');
1835 }
1836 child.to_str(buf, 1);
1837 }
1838 if precedence > 0 {
1839 buf.push(')');
1840 }
1841 }
1842 Expr::Group(ref child) => {
1843 buf.push('(');
1844 child.to_str(buf, 0);
1845 buf.push(')');
1846 }
1847 Expr::Repeat {
1848 ref child,
1849 lo,
1850 hi,
1851 greedy,
1852 } => {
1853 if precedence > 2 {
1854 buf.push_str("(?:");
1855 }
1856 child.to_str(buf, 3);
1857 match (lo, hi) {
1858 (0, 1) => buf.push('?'),
1859 (0, usize::MAX) => buf.push('*'),
1860 (1, usize::MAX) => buf.push('+'),
1861 (lo, hi) => {
1862 buf.push('{');
1863 push_usize(buf, lo);
1864 if lo != hi {
1865 buf.push(',');
1866 if hi != usize::MAX {
1867 push_usize(buf, hi);
1868 }
1869 }
1870 buf.push('}');
1871 }
1872 }
1873 if !greedy {
1874 buf.push('?');
1875 }
1876 if precedence > 2 {
1877 buf.push(')');
1878 }
1879 }
1880 Expr::Delegate {
1881 ref inner, casei, ..
1882 } => {
1883 // at the moment, delegate nodes are just atoms
1884 if casei {
1885 buf.push_str("(?i:");
1886 }
1887 buf.push_str(inner);
1888 if casei {
1889 buf.push(')');
1890 }
1891 }
1892 _ => panic!("attempting to format hard expr {:?}", self),
1893 }
1894 }
1895}
1896
// Returns the index of the closest byte before `ix` that is not a UTF-8
// continuation byte.
// precondition: ix > 0
fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
    let bytes = s.as_bytes();
    ix -= 1;
    // Continuation bytes have the bit pattern 10xxxxxx; keep stepping back
    // until a byte of any other form is found.
    while bytes[ix] & 0xc0 == 0x80 {
        ix -= 1;
    }
    ix
}
1909
// Maps a byte to a sequence length of 1 to 4, by value range.
fn codepoint_len(b: u8) -> usize {
    match b {
        0x00..=0x7f => 1,
        0x80..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xff => 4,
    }
}
1918
1919/// Returns the smallest possible index of the next valid UTF-8 sequence
1920/// starting after `i`.
1921/// Adapted from a function with the same name in the `regex` crate.
1922fn next_utf8(text: &str, i: usize) -> usize {
1923 let b = match text.as_bytes().get(i) {
1924 None => return i + 1,
1925 Some(&b) => b,
1926 };
1927 i + codepoint_len(b)
1928}
1929
1930// If this returns false, then there is no possible backref in the re
1931
1932// Both potential implementations are turned off, because we currently
1933// always need to do a deeper analysis because of 1-character
1934// look-behind. If we could call a find_from_pos method of regex::Regex,
1935// it would make sense to bring this back.
1936/*
1937pub fn detect_possible_backref(re: &str) -> bool {
1938 let mut last = b'\x00';
1939 for b in re.as_bytes() {
1940 if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; }
1941 last = *b;
1942 }
1943 false
1944}
1945
1946pub fn detect_possible_backref(re: &str) -> bool {
1947 let mut bytes = re.as_bytes();
1948 loop {
1949 match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) {
1950 Some(i) => {
1951 bytes = &bytes[i + 1..];
1952 let c = bytes[0];
1953 if b'0' <= c && c <= b'9' { return true; }
1954 }
1955 None => return false
1956 }
1957 }
1958}
1959*/
1960
1961/// The internal module only exists so that the toy example can access internals for debugging and
1962/// experimenting.
1963#[doc(hidden)]
1964pub mod internal {
1965 pub use crate::analyze::{analyze, can_compile_as_anchored};
1966 pub use crate::compile::compile;
1967 pub use crate::optimize::optimize;
1968 pub use crate::parse_flags::{
1969 FLAG_CASEI, FLAG_DOTNL, FLAG_IGNORE_SPACE, FLAG_MULTI, FLAG_UNICODE,
1970 };
1971 pub use crate::vm::{run_default, run_trace, Insn, Prog};
1972}
1973
#[cfg(test)]
mod tests {
    use alloc::borrow::Cow;
    use alloc::boxed::Box;
    use alloc::string::String;
    use alloc::{format, vec};

    use crate::parse::make_literal;
    use crate::{Expr, Regex, RegexImpl};

    //use detect_possible_backref;

    // tests for to_str

    /// Renders `expr` into a fresh `String` via `Expr::to_str` with precedence 0.
    fn to_str(expr: Expr) -> String {
        let mut rendered = String::new();
        expr.to_str(&mut rendered, 0);
        rendered
    }

    /// An alternation nested inside a concatenation is wrapped in a non-capturing group.
    #[test]
    fn to_str_concat_alt() {
        let alternation = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let concat = Expr::Concat(vec![alternation, make_literal("c")]);
        assert_eq!(to_str(concat), "(?:a|b)c");
    }

    /// A repeated concatenation is wrapped in a non-capturing group.
    #[test]
    fn to_str_rep_concat() {
        let body = Expr::Concat(vec![make_literal("a"), make_literal("b")]);
        let repetition = Expr::Repeat {
            child: Box::new(body),
            lo: 2,
            hi: 3,
            greedy: true,
        };
        assert_eq!(to_str(repetition), "(?:ab){2,3}");
    }

    /// An alternation inside a capture group needs no extra grouping.
    #[test]
    fn to_str_group_alt() {
        let alternation = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let group = Expr::Group(Box::new(alternation));
        assert_eq!(to_str(group), "(a|b)");
    }

    /// `as_str` and the `Debug` output both reproduce the original pattern.
    #[test]
    fn as_str_debug() {
        let pattern = r"(a+)b\1";
        let re = Regex::new(pattern).unwrap();
        assert_eq!(pattern, re.as_str());
        assert_eq!(pattern, format!("{:?}", re));
    }

    /// The `Display` output reproduces the original pattern.
    #[test]
    fn display() {
        let pattern = r"(a+)b\1";
        let re = Regex::new(pattern).unwrap();
        assert_eq!(pattern, format!("{}", re));
    }

    /// Parsing a `Regex` from a string keeps the original pattern.
    #[test]
    fn from_str() {
        let pattern = r"(a+)b\1";
        let re = pattern.parse::<Regex>().unwrap();
        assert_eq!(re.as_str(), pattern);
    }

    /// Every combination of bounds and greediness renders with the expected quantifier.
    #[test]
    fn to_str_repeat() {
        fn repeat(lo: usize, hi: usize, greedy: bool) -> Expr {
            let child = Box::new(make_literal("a"));
            Expr::Repeat {
                child,
                lo,
                hi,
                greedy,
            }
        }

        // (lo, hi, greedy, expected rendering)
        let cases = [
            (2, 2, true, "a{2}"),
            (2, 2, false, "a{2}?"),
            (2, 3, true, "a{2,3}"),
            (2, 3, false, "a{2,3}?"),
            (2, usize::MAX, true, "a{2,}"),
            (2, usize::MAX, false, "a{2,}?"),
            (0, 1, true, "a?"),
            (0, 1, false, "a??"),
            (0, usize::MAX, true, "a*"),
            (0, usize::MAX, false, "a*?"),
            (1, usize::MAX, true, "a+"),
            (1, usize::MAX, false, "a+?"),
        ];
        for &(lo, hi, greedy, expected) in cases.iter() {
            assert_eq!(to_str(repeat(lo, hi, greedy)), expected);
        }
    }

    #[test]
    fn escape() {
        // A string that needs no quoting must come back borrowed, and non-special
        // punctuation must be left unquoted.
        if let Cow::Borrowed(unchanged) = crate::escape("@foo") {
            assert_eq!(unchanged, "@foo");
        } else {
            panic!("Value should be borrowed.");
        }

        // Typical usage.
        let escaped_ascii = crate::escape("fo*o").into_owned();
        assert_eq!(escaped_ascii, "fo\\*o");

        // Multibyte characters are handled correctly.
        let escaped_multibyte = crate::escape("fø*ø").into_owned();
        assert_eq!(escaped_multibyte, "fø\\*ø");
    }

    #[test]
    fn trailing_positive_lookahead_wrap_capture_group_fixup() {
        let pattern = r"a+(?=c)";
        let re = pattern.parse::<Regex>().unwrap();
        let wraps_with_explicit_group_0 = matches!(
            re.inner,
            RegexImpl::Wrap {
                explicit_capture_group_0: true,
                ..
            }
        );
        assert!(
            wraps_with_explicit_group_0,
            "trailing positive lookahead for an otherwise easy pattern should avoid going through the VM"
        );
        assert_eq!(pattern, re.as_str());
        assert_eq!(pattern, format!("{:?}", re));
    }

    #[test]
    fn easy_regex() {
        let pattern = r"(a+)b";
        let re = pattern.parse::<Regex>().unwrap();
        let wraps_with_implicit_group_0 = matches!(
            re.inner,
            RegexImpl::Wrap {
                explicit_capture_group_0: false,
                ..
            }
        );
        assert!(
            wraps_with_implicit_group_0,
            "easy pattern should avoid going through the VM, and capture group 0 should be implicit"
        );

        assert_eq!(pattern, re.as_str());
        assert_eq!(pattern, format!("{:?}", re));
    }

    #[test]
    fn hard_regex() {
        let pattern = r"(a+)(?>c)";
        let re = pattern.parse::<Regex>().unwrap();
        let is_fancy = matches!(re.inner, RegexImpl::Fancy { .. });
        assert!(is_fancy, "hard regex should be compiled into a VM");
        assert_eq!(pattern, re.as_str());
        assert_eq!(pattern, format!("{:?}", re));
    }

    /*
    #[test]
    fn detect_backref() {
        assert_eq!(detect_possible_backref("a0a1a2"), false);
        assert_eq!(detect_possible_backref("a0a1\\a2"), false);
        assert_eq!(detect_possible_backref("a0a\\1a2"), true);
        assert_eq!(detect_possible_backref("a0a1a2\\"), false);
    }
    */
}