fancy_regex/lib.rs
1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*!
22An implementation of regexes, supporting a relatively rich set of features, including backreferences
23and lookaround.
24
It builds on top of the excellent [regex] crate. If you are not
familiar with it, make sure you read its documentation; you may find that you don't even need fancy-regex.
27
If your regex, or parts of it, do not use any special features, the matching is delegated to the
regex crate. That means it has linear runtime. But if you use "fancy" features such as
backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex
can be slow and take exponential time to run because of what is called "catastrophic backtracking".
This depends on the regex and the input.
33
34# Usage
35
36The API should feel very similar to the regex crate, and involves compiling a regex and then using
37it to find matches in text.
38
39## Example: Matching text
40
41An example with backreferences to check if a text consists of two identical words:
42
43```rust
44use fancy_regex::Regex;
45
46let re = Regex::new(r"^(\w+) (\1)$").unwrap();
47let result = re.is_match("foo foo");
48
49assert!(result.is_ok());
50let did_match = result.unwrap();
51assert!(did_match);
52```
53
54Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the
55entire input text.
56
57## Example: Finding the position of matches
58
59```rust
60use fancy_regex::Regex;
61
62let re = Regex::new(r"(\d)\1").unwrap();
63let result = re.find("foo 22");
64
65assert!(result.is_ok(), "execution was successful");
66let match_option = result.unwrap();
67
68assert!(match_option.is_some(), "found a match");
69let m = match_option.unwrap();
70
71assert_eq!(m.start(), 4);
72assert_eq!(m.end(), 6);
73assert_eq!(m.as_str(), "22");
74```
75
76## Example: Capturing groups
77
78```rust
79use fancy_regex::Regex;
80
81let re = Regex::new(r"(?<!AU)\$(\d+)").unwrap();
82let result = re.captures("AU$10, $20");
83
84let captures = result.expect("Error running regex").expect("No match found");
85let group = captures.get(1).expect("No group");
86assert_eq!(group.as_str(), "20");
87```
88
89## Example: Splitting text
90
91```rust
92use fancy_regex::Regex;
93
94let re = Regex::new(r"[ \t]+").unwrap();
95let target = "a b \t c\td e";
96let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
97assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
98
99let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
100assert_eq!(fields, vec!["a", "b", "c\td e"]);
101```
102
103# Features
104
105This crate supports several optional features that can be enabled or disabled:
106
107- **`std`** (enabled by default): Enables standard library support. Disable for `no_std` environments.
108- **`unicode`** (enabled by default): Enables Unicode support for character classes and word boundaries.
109- **`perf`** (enabled by default): Enables performance optimizations in the underlying regex engine.
110- **`variable-lookbehinds`** (enabled by default): Enables support for variable-length lookbehind
111 assertions (e.g., `(?<=a+)`). Without this feature, only constant-length lookbehinds are supported.
112 This feature uses reverse DFA matching from the `regex-automata` crate to efficiently handle
113 variable-length patterns that don't use backreferences or other fancy features.
114
115# Syntax
116
117The regex syntax is based on the [regex] crate's, with some additional supported syntax.
118
119Escapes:
120
121`\h`
122: hex digit (`[0-9A-Fa-f]`) \
123`\H`
124: not hex digit (`[^0-9A-Fa-f]`) \
125`\e`
126: escape control character (`\x1B`) \
127`\K`
128: keep text matched so far out of the overall match ([docs](https://www.regular-expressions.info/keep.html))\
129`\G`
130: anchor to where the previous match ended ([docs](https://www.regular-expressions.info/continue.html))\
131`\Z`
132: anchor to the end of the text before any trailing newlines\
133`\O`
134: any character including newline
135
136Backreferences:
137
138`\1`
139: match the exact string that the first capture group matched \
140`\2`
141: backref to the second capture group, etc
142
143Named capture groups:
144
145`(?<name>exp)`
146: match *exp*, creating capture group named *name* \
147`\k<name>`
148: match the exact string that the capture group named *name* matched \
149`(?P<name>exp)`
150: same as `(?<name>exp)` for compatibility with Python, etc. \
151`(?P=name)`
152: same as `\k<name>` for compatibility with Python, etc.
153
154Look-around assertions for matching without changing the current position:
155
156`(?=exp)`
157: look-ahead, succeeds if *exp* matches to the right of the current position \
158`(?!exp)`
159: negative look-ahead, succeeds if *exp* doesn't match to the right \
160`(?<=exp)`
161: look-behind, succeeds if *exp* matches to the left of the current position \
162`(?<!exp)`
163: negative look-behind, succeeds if *exp* doesn't match to the left
164
165**Note**: Look-behind assertions with variable length (e.g., `(?<=a+)`) are supported with the
166`variable-lookbehinds` feature (enabled by default). Without this feature, only constant-length
167look-behinds are supported. Variable-length look-behinds with backreferences or other "fancy"
168features are not currently supported.
169
170Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
171
172```
173# use fancy_regex::Regex;
174let re = Regex::new(r"^a(?>bc|b)c$").unwrap();
175assert!(re.is_match("abcc").unwrap());
176// Doesn't match because `|b` is never tried because of the atomic group
177assert!(!re.is_match("abc").unwrap());
178```
179
180Conditionals - if/then/else:
181
182`(?(1))`
183: continue only if first capture group matched \
184`(?(<name>))`
185: continue only if capture group named *name* matched \
186`(?(1)true_branch|false_branch)`
187: if the first capture group matched then execute the true_branch regex expression, else execute false_branch ([docs](https://www.regular-expressions.info/conditional.html)) \
188`(?(condition)true_branch|false_branch)`
189: if the condition matches then execute the true_branch regex expression, else execute false_branch from the point just before the condition was evaluated
190
191[regex]: https://crates.io/crates/regex
192*/
193
194#![deny(missing_docs)]
195#![deny(missing_debug_implementations)]
196#![cfg_attr(not(feature = "std"), no_std)]
197
198extern crate alloc;
199
200use alloc::borrow::Cow;
201use alloc::boxed::Box;
202use alloc::string::{String, ToString};
203use alloc::sync::Arc;
204use alloc::vec;
205use alloc::vec::Vec;
206
207use core::convert::TryFrom;
208use core::fmt;
209use core::fmt::{Debug, Formatter};
210use core::ops::{Index, Range};
211use core::str::FromStr;
212use regex_automata::meta::Regex as RaRegex;
213use regex_automata::util::captures::Captures as RaCaptures;
214use regex_automata::util::syntax::Config as SyntaxConfig;
215use regex_automata::Input as RaInput;
216
217mod analyze;
218mod compile;
219mod error;
220mod expand;
221mod optimize;
222mod parse;
223mod parse_flags;
224mod replacer;
225mod vm;
226
227use crate::analyze::analyze;
228use crate::analyze::can_compile_as_anchored;
229use crate::compile::compile;
230use crate::optimize::optimize;
231use crate::parse::{ExprTree, NamedGroups, Parser};
232use crate::parse_flags::*;
233use crate::vm::{Prog, OPTION_SKIPPED_EMPTY_MATCH};
234
235pub use crate::error::{CompileError, Error, ParseError, Result, RuntimeError};
236pub use crate::expand::Expander;
237pub use crate::replacer::{NoExpand, Replacer, ReplacerRef};
238
// Fixed cap of 64 on a recursion depth. The code that consults this constant
// is outside this section of the file.
// NOTE(review): presumably bounds nested-expression traversal — confirm at the use sites.
const MAX_RECURSION: usize = 64;
240
241// the public API
242
243/// A builder for a `Regex` to allow configuring options.
244#[derive(Debug)]
245pub struct RegexBuilder(RegexOptions);
246
247/// A compiled regular expression.
248#[derive(Clone)]
249pub struct Regex {
250 inner: RegexImpl,
251 named_groups: Arc<NamedGroups>,
252}
253
254// Separate enum because we don't want to expose any of this
255#[derive(Clone)]
256enum RegexImpl {
257 // Do we want to box this? It's pretty big...
258 Wrap {
259 inner: RaRegex,
260 options: RegexOptions,
261 /// Some optimizations avoid the VM, but need to use an extra capture group to represent the match boundaries
262 explicit_capture_group_0: bool,
263 debug_pattern: String,
264 },
265 Fancy {
266 prog: Arc<Prog>,
267 n_groups: usize,
268 options: RegexOptions,
269 },
270}
271
/// One match of a regex (or of one of its groups) inside an input text.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Match<'t> {
    /// The full text that was searched.
    text: &'t str,
    /// Offset in `text` at which the match begins.
    start: usize,
    /// Offset in `text` at which the match ends.
    end: usize,
}
279
280/// An iterator over all non-overlapping matches for a particular string.
281///
282/// The iterator yields a `Result<Match>`. The iterator stops when no more
283/// matches can be found.
284///
285/// `'r` is the lifetime of the compiled regular expression and `'t` is the
286/// lifetime of the matched string.
287#[derive(Debug)]
288pub struct Matches<'r, 't> {
289 re: &'r Regex,
290 text: &'t str,
291 last_end: usize,
292 last_match: Option<usize>,
293}
294
295impl<'r, 't> Matches<'r, 't> {
296 /// Return the text being searched.
297 pub fn text(&self) -> &'t str {
298 self.text
299 }
300
301 /// Return the underlying regex.
302 pub fn regex(&self) -> &'r Regex {
303 self.re
304 }
305}
306
307impl<'r, 't> Iterator for Matches<'r, 't> {
308 type Item = Result<Match<'t>>;
309
310 /// Adapted from the `regex` crate. Calls `find_from_pos` repeatedly.
311 /// Ignores empty matches immediately after a match.
312 fn next(&mut self) -> Option<Self::Item> {
313 if self.last_end > self.text.len() {
314 return None;
315 }
316
317 let option_flags = if let Some(last_match) = self.last_match {
318 if self.last_end > last_match {
319 OPTION_SKIPPED_EMPTY_MATCH
320 } else {
321 0
322 }
323 } else {
324 0
325 };
326 let mat =
327 match self
328 .re
329 .find_from_pos_with_option_flags(self.text, self.last_end, option_flags)
330 {
331 Err(error) => {
332 // Stop on first error: If an error is encountered, return it, and set the "last match position"
333 // to the string length, so that the next next() call will return None, to prevent an infinite loop.
334 self.last_end = self.text.len() + 1;
335 return Some(Err(error));
336 }
337 Ok(None) => return None,
338 Ok(Some(mat)) => mat,
339 };
340
341 if mat.start == mat.end {
342 // This is an empty match. To ensure we make progress, start
343 // the next search at the smallest possible starting position
344 // of the next match following this one.
345 self.last_end = next_utf8(self.text, mat.end);
346 // Don't accept empty matches immediately following a match.
347 // Just move on to the next match.
348 if Some(mat.end) == self.last_match {
349 return self.next();
350 }
351 } else {
352 self.last_end = mat.end;
353 }
354
355 self.last_match = Some(mat.end);
356
357 Some(Ok(mat))
358 }
359}
360
361/// An iterator that yields all non-overlapping capture groups matching a
362/// particular regular expression.
363///
364/// The iterator stops when no more matches can be found.
365///
366/// `'r` is the lifetime of the compiled regular expression and `'t` is the
367/// lifetime of the matched string.
368#[derive(Debug)]
369pub struct CaptureMatches<'r, 't>(Matches<'r, 't>);
370
371impl<'r, 't> CaptureMatches<'r, 't> {
372 /// Return the text being searched.
373 pub fn text(&self) -> &'t str {
374 self.0.text
375 }
376
377 /// Return the underlying regex.
378 pub fn regex(&self) -> &'r Regex {
379 self.0.re
380 }
381}
382
383impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
384 type Item = Result<Captures<'t>>;
385
386 /// Adapted from the `regex` crate. Calls `captures_from_pos` repeatedly.
387 /// Ignores empty matches immediately after a match.
388 fn next(&mut self) -> Option<Self::Item> {
389 if self.0.last_end > self.0.text.len() {
390 return None;
391 }
392
393 let captures = match self.0.re.captures_from_pos(self.0.text, self.0.last_end) {
394 Err(error) => {
395 // Stop on first error: If an error is encountered, return it, and set the "last match position"
396 // to the string length, so that the next next() call will return None, to prevent an infinite loop.
397 self.0.last_end = self.0.text.len() + 1;
398 return Some(Err(error));
399 }
400 Ok(None) => return None,
401 Ok(Some(captures)) => captures,
402 };
403
404 let mat = captures
405 .get(0)
406 .expect("`Captures` is expected to have entire match at 0th position");
407 if mat.start == mat.end {
408 self.0.last_end = next_utf8(self.0.text, mat.end);
409 if Some(mat.end) == self.0.last_match {
410 return self.next();
411 }
412 } else {
413 self.0.last_end = mat.end;
414 }
415
416 self.0.last_match = Some(mat.end);
417
418 Some(Ok(captures))
419 }
420}
421
422/// A set of capture groups found for a regex.
423#[derive(Debug)]
424pub struct Captures<'t> {
425 inner: CapturesImpl<'t>,
426 named_groups: Arc<NamedGroups>,
427}
428
429#[derive(Debug)]
430enum CapturesImpl<'t> {
431 Wrap {
432 text: &'t str,
433 locations: RaCaptures,
434 /// Some optimizations avoid the VM but need an extra capture group to represent the match boundaries.
435 /// Therefore what is actually capture group 1 should be treated as capture group 0, and all other
436 /// capture groups should have their index reduced by one as well to line up with what the pattern specifies.
437 explicit_capture_group_0: bool,
438 },
439 Fancy {
440 text: &'t str,
441 saves: Vec<usize>,
442 },
443}
444
445/// Iterator for captured groups in order in which they appear in the regex.
446#[derive(Debug)]
447pub struct SubCaptureMatches<'c, 't> {
448 caps: &'c Captures<'t>,
449 i: usize,
450}
451
452/// An iterator over all substrings delimited by a regex.
453///
454/// This iterator yields `Result<&'h str>`, where each item is a substring of the
455/// target string that is delimited by matches of the regular expression. It stops when there
456/// are no more substrings to yield.
457///
458/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
459/// lifetime of the target string being split.
460///
461/// This iterator can be created by the [`Regex::split`] method.
462#[derive(Debug)]
463pub struct Split<'r, 'h> {
464 matches: Matches<'r, 'h>,
465 next_start: usize,
466 target: &'h str,
467}
468
469impl<'r, 'h> Iterator for Split<'r, 'h> {
470 type Item = Result<&'h str>;
471
472 /// Returns the next substring that results from splitting the target string by the regex.
473 ///
474 /// If no more matches are found, returns the remaining part of the string,
475 /// or `None` if all substrings have been yielded.
476 fn next(&mut self) -> Option<Result<&'h str>> {
477 match self.matches.next() {
478 None => {
479 let len = self.target.len();
480 if self.next_start > len {
481 // No more substrings to return
482 None
483 } else {
484 // Return the last part of the target string
485 // Next call will return None
486 let part = &self.target[self.next_start..len];
487 self.next_start = len + 1;
488 Some(Ok(part))
489 }
490 }
491 // Return the next substring
492 Some(Ok(m)) => {
493 let part = &self.target[self.next_start..m.start()];
494 self.next_start = m.end();
495 Some(Ok(part))
496 }
497 Some(Err(e)) => Some(Err(e)),
498 }
499 }
500}
501
502impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
503
504/// An iterator over at most `N` substrings delimited by a regex.
505///
506/// This iterator yields `Result<&'h str>`, where each item is a substring of the
507/// target that is delimited by matches of the regular expression. It stops either when
508/// there are no more substrings to yield, or after `N` substrings have been yielded.
509///
510/// The `N`th substring is the remaining part of the target.
511///
512/// `'r` is the lifetime of the compiled regular expression, and `'h` is the
513/// lifetime of the target string being split.
514///
515/// This iterator can be created by the [`Regex::splitn`] method.
516#[derive(Debug)]
517pub struct SplitN<'r, 'h> {
518 splits: Split<'r, 'h>,
519 limit: usize,
520}
521
522impl<'r, 'h> Iterator for SplitN<'r, 'h> {
523 type Item = Result<&'h str>;
524
525 /// Returns the next substring resulting from splitting the target by the regex,
526 /// limited to `N` splits.
527 ///
528 /// Returns `None` if no more matches are found or if the limit is reached after yielding
529 /// the remaining part of the target.
530 fn next(&mut self) -> Option<Result<&'h str>> {
531 if self.limit == 0 {
532 // Limit reached. No more substrings available.
533 return None;
534 }
535
536 // Decrement the limit for each split.
537 self.limit -= 1;
538 if self.limit > 0 {
539 return self.splits.next();
540 }
541
542 // Nth split
543 let len = self.splits.target.len();
544 if self.splits.next_start > len {
545 // No more substrings available.
546 None
547 } else {
548 // Return the remaining part of the target
549 let start = self.splits.next_start;
550 self.splits.next_start = len + 1;
551 Some(Ok(&self.splits.target[start..len]))
552 }
553 }
554
555 fn size_hint(&self) -> (usize, Option<usize>) {
556 (0, Some(self.limit))
557 }
558}
559
560impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
561
562#[derive(Clone, Debug)]
563struct RegexOptions {
564 pattern: String,
565 syntaxc: SyntaxConfig,
566 backtrack_limit: usize,
567 delegate_size_limit: Option<usize>,
568 delegate_dfa_size_limit: Option<usize>,
569 oniguruma_mode: bool,
570}
571
572impl RegexOptions {
573 fn get_flag_value(flag_value: bool, enum_value: u32) -> u32 {
574 if flag_value {
575 enum_value
576 } else {
577 0
578 }
579 }
580
581 fn compute_flags(&self) -> u32 {
582 let insensitive = Self::get_flag_value(self.syntaxc.get_case_insensitive(), FLAG_CASEI);
583 let multiline = Self::get_flag_value(self.syntaxc.get_multi_line(), FLAG_MULTI);
584 let whitespace =
585 Self::get_flag_value(self.syntaxc.get_ignore_whitespace(), FLAG_IGNORE_SPACE);
586 let dotnl = Self::get_flag_value(self.syntaxc.get_dot_matches_new_line(), FLAG_DOTNL);
587 let unicode = Self::get_flag_value(self.syntaxc.get_unicode(), FLAG_UNICODE);
588 let oniguruma_mode = Self::get_flag_value(self.oniguruma_mode, FLAG_ONIGURUMA_MODE);
589
590 insensitive | multiline | whitespace | dotnl | unicode | unicode | oniguruma_mode
591 }
592}
593
594impl Default for RegexOptions {
595 fn default() -> Self {
596 RegexOptions {
597 pattern: String::new(),
598 syntaxc: SyntaxConfig::default(),
599 backtrack_limit: 1_000_000,
600 delegate_size_limit: None,
601 delegate_dfa_size_limit: None,
602 oniguruma_mode: false,
603 }
604 }
605}
606
607impl RegexBuilder {
608 /// Create a new regex builder with a regex pattern.
609 ///
610 /// If the pattern is invalid, the call to `build` will fail later.
611 pub fn new(pattern: &str) -> Self {
612 let mut builder = RegexBuilder(RegexOptions::default());
613 builder.0.pattern = pattern.to_string();
614 builder
615 }
616
617 /// Build the `Regex`.
618 ///
619 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
620 pub fn build(&self) -> Result<Regex> {
621 Regex::new_options(self.0.clone())
622 }
623
624 fn set_config(&mut self, func: impl Fn(SyntaxConfig) -> SyntaxConfig) -> &mut Self {
625 self.0.syntaxc = func(self.0.syntaxc);
626 self
627 }
628
629 /// Override default case insensitive
630 /// this is to enable/disable casing via builder instead of a flag within
631 /// the raw string provided to the regex builder
632 ///
633 /// Default is false
634 pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
635 self.set_config(|x| x.case_insensitive(yes))
636 }
637
638 /// Enable multi-line regex
639 pub fn multi_line(&mut self, yes: bool) -> &mut Self {
640 self.set_config(|x| x.multi_line(yes))
641 }
642
643 /// Allow ignore whitespace
644 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
645 self.set_config(|x| x.ignore_whitespace(yes))
646 }
647
648 /// Enable or disable the "dot matches any character" flag.
649 /// When this is enabled, `.` will match any character. When it's disabled, then `.` will match any character
650 /// except for a new line character.
651 pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
652 self.set_config(|x| x.dot_matches_new_line(yes))
653 }
654
655 /// Enable verbose mode in the regular expression.
656 ///
657 /// The same as ignore_whitespace
658 ///
659 /// When enabled, verbose mode permits insigificant whitespace in many
660 /// places in the regular expression, as well as comments. Comments are
661 /// started using `#` and continue until the end of the line.
662 ///
663 /// By default, this is disabled. It may be selectively enabled in the
664 /// regular expression by using the `x` flag regardless of this setting.
665 pub fn verbose_mode(&mut self, yes: bool) -> &mut Self {
666 self.set_config(|x| x.ignore_whitespace(yes))
667 }
668
669 /// Enable or disable the Unicode flag (`u`) by default.
670 ///
671 /// By default this is **enabled**. It may alternatively be selectively
672 /// disabled in the regular expression itself via the `u` flag.
673 ///
674 /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by
675 /// default), a regular expression will fail to parse if Unicode mode is
676 /// disabled and a sub-expression could possibly match invalid UTF-8.
677 ///
678 /// **WARNING**: Unicode mode can greatly increase the size of the compiled
679 /// DFA, which can noticeably impact both memory usage and compilation
680 /// time. This is especially noticeable if your regex contains character
681 /// classes like `\w` that are impacted by whether Unicode is enabled or
682 /// not. If Unicode is not necessary, you are encouraged to disable it.
683 pub fn unicode_mode(&mut self, yes: bool) -> &mut Self {
684 self.set_config(|x| x.unicode(yes))
685 }
686
687 /// Limit for how many times backtracking should be attempted for fancy regexes (where
688 /// backtracking is used). If this limit is exceeded, execution returns an error with
689 /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded).
690 /// This is for preventing a regex with catastrophic backtracking to run for too long.
691 ///
692 /// Default is `1_000_000` (1 million).
693 pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
694 self.0.backtrack_limit = limit;
695 self
696 }
697
698 /// Set the approximate size limit of the compiled regular expression.
699 ///
700 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
701 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
702 /// such the actual limit is closer to `<number of delegated regexes> * delegate_size_limit`.
703 pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
704 self.0.delegate_size_limit = Some(limit);
705 self
706 }
707
708 /// Set the approximate size of the cache used by the DFA.
709 ///
710 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
711 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
712 /// such the actual limit is closer to `<number of delegated regexes> *
713 /// delegate_dfa_size_limit`.
714 pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
715 self.0.delegate_dfa_size_limit = Some(limit);
716 self
717 }
718
719 /// Attempts to better match [Oniguruma](https://github.com/kkos/oniguruma)'s default behavior
720 ///
721 /// Currently this amounts to changing behavior with:
722 ///
723 /// # Left and right word bounds
724 ///
725 /// `fancy-regex` follows the default of other regex engines such as the `regex` crate itself
726 /// where `\<` and `\>` correspond to a _left_ and _right_ word-bound respectively. This
727 /// differs from Oniguruma's defaults which treat them as matching the literals `<` and `>`.
728 /// When this option is set using `\<` and `\>` in the pattern will match the literals
729 /// `<` and `>` instead of word bounds.
730 ///
731 /// ## Example
732 ///
733 /// ```
734 /// use fancy_regex::{Regex, RegexBuilder};
735 ///
736 /// let haystack = "turbo::<Fish>";
737 /// let regex = r"\<\w*\>";
738 ///
739 /// // By default `\<` and `\>` will match the start and end of a word boundary
740 /// let word_bounds_regex = Regex::new(regex).unwrap();
741 /// let word_bounds = word_bounds_regex.find(haystack).unwrap().unwrap();
742 /// assert_eq!(word_bounds.as_str(), "turbo");
743 ///
744 /// // With the option set they instead match the literal `<` and `>` characters
745 /// let literals_regex = RegexBuilder::new(regex).oniguruma_mode(true).build().unwrap();
746 /// let literals = literals_regex.find(haystack).unwrap().unwrap();
747 /// assert_eq!(literals.as_str(), "<Fish>");
748 /// ```
749 pub fn oniguruma_mode(&mut self, yes: bool) -> &mut Self {
750 self.0.oniguruma_mode = yes;
751 self
752 }
753}
754
755impl fmt::Debug for Regex {
756 /// Shows the original regular expression.
757 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
758 write!(f, "{}", self.as_str())
759 }
760}
761
762impl fmt::Display for Regex {
763 /// Shows the original regular expression
764 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
765 write!(f, "{}", self.as_str())
766 }
767}
768
769impl FromStr for Regex {
770 type Err = Error;
771
772 /// Attempts to parse a string into a regular expression
773 fn from_str(s: &str) -> Result<Regex> {
774 Regex::new(s)
775 }
776}
777
778impl Regex {
779 /// Parse and compile a regex with default options, see `RegexBuilder`.
780 ///
781 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
782 pub fn new(re: &str) -> Result<Regex> {
783 let options = RegexOptions {
784 pattern: re.to_string(),
785 ..RegexOptions::default()
786 };
787 Self::new_options(options)
788 }
789
790 fn new_options(options: RegexOptions) -> Result<Regex> {
791 let mut tree = Expr::parse_tree_with_flags(&options.pattern, options.compute_flags())?;
792
793 // try to optimize the expression tree
794 let requires_capture_group_fixup = optimize(&mut tree);
795 let info = analyze(&tree, requires_capture_group_fixup)?;
796
797 if !info.hard {
798 // easy case, wrap regex
799
800 // we do our own to_str because escapes are different
801 // NOTE: there is a good opportunity here to use Hir to avoid regex-automata re-parsing it
802 let mut re_cooked = String::new();
803 tree.expr.to_str(&mut re_cooked, 0);
804 let inner = compile::compile_inner(&re_cooked, &options)?;
805 return Ok(Regex {
806 inner: RegexImpl::Wrap {
807 inner,
808 options: RegexOptions {
809 pattern: options.pattern,
810 ..options
811 },
812 explicit_capture_group_0: requires_capture_group_fixup,
813 debug_pattern: re_cooked,
814 },
815 named_groups: Arc::new(tree.named_groups),
816 });
817 }
818
819 let prog = compile(&info, can_compile_as_anchored(&tree.expr))?;
820 Ok(Regex {
821 inner: RegexImpl::Fancy {
822 prog: Arc::new(prog),
823 n_groups: info.end_group(),
824 options,
825 },
826 named_groups: Arc::new(tree.named_groups),
827 })
828 }
829
830 /// Returns the original string of this regex.
831 pub fn as_str(&self) -> &str {
832 match &self.inner {
833 RegexImpl::Wrap { options, .. } => &options.pattern,
834 RegexImpl::Fancy { options, .. } => &options.pattern,
835 }
836 }
837
838 /// Check if the regex matches the input text.
839 ///
840 /// # Example
841 ///
842 /// Test if some text contains the same word twice:
843 ///
844 /// ```rust
845 /// # use fancy_regex::Regex;
846 ///
847 /// let re = Regex::new(r"(\w+) \1").unwrap();
848 /// assert!(re.is_match("mirror mirror on the wall").unwrap());
849 /// ```
850 pub fn is_match(&self, text: &str) -> Result<bool> {
851 match &self.inner {
852 RegexImpl::Wrap { inner, .. } => Ok(inner.is_match(text)),
853 RegexImpl::Fancy { prog, options, .. } => {
854 let result = vm::run(prog, text, 0, 0, options)?;
855 Ok(result.is_some())
856 }
857 }
858 }
859
860 /// Returns an iterator for each successive non-overlapping match in `text`.
861 ///
862 /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures_iter()]
863 /// method.
864 ///
865 /// # Example
866 ///
867 /// Find all words followed by an exclamation point:
868 ///
869 /// ```rust
870 /// # use fancy_regex::Regex;
871 ///
872 /// let re = Regex::new(r"\w+(?=!)").unwrap();
873 /// let mut matches = re.find_iter("so fancy! even with! iterators!");
874 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "fancy");
875 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "with");
876 /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "iterators");
877 /// assert!(matches.next().is_none());
878 /// ```
879 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
880 Matches {
881 re: self,
882 text,
883 last_end: 0,
884 last_match: None,
885 }
886 }
887
888 /// Find the first match in the input text.
889 ///
890 /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures()]
891 /// method.
892 ///
893 /// # Example
894 ///
895 /// Find a word that is followed by an exclamation point:
896 ///
897 /// ```rust
898 /// # use fancy_regex::Regex;
899 ///
900 /// let re = Regex::new(r"\w+(?=!)").unwrap();
901 /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy");
902 /// ```
903 pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
904 self.find_from_pos(text, 0)
905 }
906
907 /// Returns the first match in `text`, starting from the specified byte position `pos`.
908 ///
909 /// # Examples
910 ///
911 /// Finding match starting at a position:
912 ///
913 /// ```
914 /// # use fancy_regex::Regex;
915 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
916 /// let text = "1 test 123\n2 foo";
917 /// let mat = re.find_from_pos(text, 7).unwrap().unwrap();
918 ///
919 /// assert_eq!(mat.start(), 11);
920 /// assert_eq!(mat.end(), 12);
921 /// ```
922 ///
923 /// Note that in some cases this is not the same as using the `find`
924 /// method and passing a slice of the string, see [Regex::captures_from_pos()] for details.
925 pub fn find_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Match<'t>>> {
926 self.find_from_pos_with_option_flags(text, pos, 0)
927 }
928
929 fn find_from_pos_with_option_flags<'t>(
930 &self,
931 text: &'t str,
932 pos: usize,
933 option_flags: u32,
934 ) -> Result<Option<Match<'t>>> {
935 match &self.inner {
936 RegexImpl::Wrap {
937 inner,
938 explicit_capture_group_0,
939 ..
940 } => {
941 if !*explicit_capture_group_0 {
942 Ok(inner
943 .search(&RaInput::new(text).span(pos..text.len()))
944 .map(|m| Match::new(text, m.start(), m.end())))
945 } else {
946 let mut locations = inner.create_captures();
947 inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
948 Ok(locations.is_match().then(|| {
949 Match::new(
950 text,
951 locations.get_group(1).unwrap().start,
952 locations.get_group(1).unwrap().end,
953 )
954 }))
955 }
956 }
957 RegexImpl::Fancy { prog, options, .. } => {
958 let result = vm::run(prog, text, pos, option_flags, options)?;
959 Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
960 }
961 }
962 }
963
964 /// Returns an iterator over all the non-overlapping capture groups matched in `text`.
965 ///
966 /// # Examples
967 ///
968 /// Finding all matches and capturing parts of each:
969 ///
970 /// ```rust
971 /// # use fancy_regex::Regex;
972 ///
973 /// let re = Regex::new(r"(\d{4})-(\d{2})").unwrap();
974 /// let text = "It was between 2018-04 and 2020-01";
975 /// let mut all_captures = re.captures_iter(text);
976 ///
977 /// let first = all_captures.next().unwrap().unwrap();
978 /// assert_eq!(first.get(1).unwrap().as_str(), "2018");
979 /// assert_eq!(first.get(2).unwrap().as_str(), "04");
980 /// assert_eq!(first.get(0).unwrap().as_str(), "2018-04");
981 ///
982 /// let second = all_captures.next().unwrap().unwrap();
983 /// assert_eq!(second.get(1).unwrap().as_str(), "2020");
984 /// assert_eq!(second.get(2).unwrap().as_str(), "01");
985 /// assert_eq!(second.get(0).unwrap().as_str(), "2020-01");
986 ///
987 /// assert!(all_captures.next().is_none());
988 /// ```
989 pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CaptureMatches<'r, 't> {
990 CaptureMatches(self.find_iter(text))
991 }
992
993 /// Returns the capture groups for the first match in `text`.
994 ///
995 /// If no match is found, then `Ok(None)` is returned.
996 ///
997 /// # Examples
998 ///
999 /// Finding matches and capturing parts of the match:
1000 ///
1001 /// ```rust
1002 /// # use fancy_regex::Regex;
1003 ///
1004 /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
1005 /// let text = "The date was 2018-04-07";
1006 /// let captures = re.captures(text).unwrap().unwrap();
1007 ///
1008 /// assert_eq!(captures.get(1).unwrap().as_str(), "2018");
1009 /// assert_eq!(captures.get(2).unwrap().as_str(), "04");
1010 /// assert_eq!(captures.get(3).unwrap().as_str(), "07");
1011 /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07");
1012 /// ```
1013 pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
1014 self.captures_from_pos(text, 0)
1015 }
1016
1017 /// Returns the capture groups for the first match in `text`, starting from
1018 /// the specified byte position `pos`.
1019 ///
1020 /// # Examples
1021 ///
1022 /// Finding captures starting at a position:
1023 ///
1024 /// ```
1025 /// # use fancy_regex::Regex;
1026 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
1027 /// let text = "1 test 123\n2 foo";
1028 /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap();
1029 ///
1030 /// let group = captures.get(1).unwrap();
1031 /// assert_eq!(group.as_str(), "2");
1032 /// assert_eq!(group.start(), 11);
1033 /// assert_eq!(group.end(), 12);
1034 /// ```
1035 ///
1036 /// Note that in some cases this is not the same as using the `captures`
1037 /// method and passing a slice of the string, see the capture that we get
1038 /// when we do this:
1039 ///
1040 /// ```
1041 /// # use fancy_regex::Regex;
1042 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
1043 /// let text = "1 test 123\n2 foo";
1044 /// let captures = re.captures(&text[7..]).unwrap().unwrap();
1045 /// assert_eq!(captures.get(1).unwrap().as_str(), "123");
1046 /// ```
1047 ///
1048 /// This matched the number "123" because it's at the beginning of the text
1049 /// of the string slice.
1050 ///
1051 pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
1052 let named_groups = self.named_groups.clone();
1053 match &self.inner {
1054 RegexImpl::Wrap {
1055 inner,
1056 explicit_capture_group_0,
1057 ..
1058 } => {
1059 let mut locations = inner.create_captures();
1060 inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
1061 if locations.is_match() {
1062 Ok(Some(Captures {
1063 inner: CapturesImpl::Wrap {
1064 text,
1065 locations,
1066 explicit_capture_group_0: *explicit_capture_group_0,
1067 },
1068 named_groups,
1069 }))
1070 } else {
1071 Ok(None)
1072 }
1073 }
1074 RegexImpl::Fancy {
1075 prog,
1076 n_groups,
1077 options,
1078 ..
1079 } => {
1080 let result = vm::run(prog, text, pos, 0, options)?;
1081 Ok(result.map(|mut saves| {
1082 saves.truncate(n_groups * 2);
1083 Captures {
1084 inner: CapturesImpl::Fancy { text, saves },
1085 named_groups,
1086 }
1087 }))
1088 }
1089 }
1090 }
1091
1092 /// Returns the number of captures, including the implicit capture of the entire expression.
1093 pub fn captures_len(&self) -> usize {
1094 match &self.inner {
1095 RegexImpl::Wrap {
1096 inner,
1097 explicit_capture_group_0,
1098 ..
1099 } => inner.captures_len() - if *explicit_capture_group_0 { 1 } else { 0 },
1100 RegexImpl::Fancy { n_groups, .. } => *n_groups,
1101 }
1102 }
1103
1104 /// Returns an iterator over the capture names.
1105 pub fn capture_names(&self) -> CaptureNames<'_> {
1106 let mut names = Vec::new();
1107 names.resize(self.captures_len(), None);
1108 for (name, &i) in self.named_groups.iter() {
1109 names[i] = Some(name.as_str());
1110 }
1111 CaptureNames(names.into_iter())
1112 }
1113
1114 // for debugging only
1115 #[doc(hidden)]
1116 pub fn debug_print(&self, writer: &mut Formatter<'_>) -> fmt::Result {
1117 match &self.inner {
1118 RegexImpl::Wrap {
1119 debug_pattern,
1120 explicit_capture_group_0,
1121 ..
1122 } => {
1123 write!(
1124 writer,
1125 "wrapped Regex {:?}, explicit_capture_group_0: {:}",
1126 debug_pattern, *explicit_capture_group_0
1127 )
1128 }
1129 RegexImpl::Fancy { prog, .. } => prog.debug_print(writer),
1130 }
1131 }
1132
1133 /// Replaces the leftmost-first match with the replacement provided.
1134 /// The replacement can be a regular string (where `$N` and `$name` are
1135 /// expanded to match capture groups) or a function that takes the matches'
1136 /// `Captures` and returns the replaced string.
1137 ///
1138 /// If no match is found, then a copy of the string is returned unchanged.
1139 ///
1140 /// # Replacement string syntax
1141 ///
1142 /// All instances of `$name` in the replacement text is replaced with the
1143 /// corresponding capture group `name`.
1144 ///
1145 /// `name` may be an integer corresponding to the index of the
1146 /// capture group (counted by order of opening parenthesis where `0` is the
1147 /// entire match) or it can be a name (consisting of letters, digits or
1148 /// underscores) corresponding to a named capture group.
1149 ///
1150 /// If `name` isn't a valid capture group (whether the name doesn't exist
1151 /// or isn't a valid index), then it is replaced with the empty string.
1152 ///
1153 /// The longest possible name is used. e.g., `$1a` looks up the capture
1154 /// group named `1a` and not the capture group at index `1`. To exert more
1155 /// precise control over the name, use braces, e.g., `${1}a`.
1156 ///
1157 /// To write a literal `$` use `$$`.
1158 ///
1159 /// # Examples
1160 ///
1161 /// Note that this function is polymorphic with respect to the replacement.
1162 /// In typical usage, this can just be a normal string:
1163 ///
1164 /// ```rust
1165 /// # use fancy_regex::Regex;
1166 /// let re = Regex::new("[^01]+").unwrap();
1167 /// assert_eq!(re.replace("1078910", ""), "1010");
1168 /// ```
1169 ///
1170 /// But anything satisfying the `Replacer` trait will work. For example,
1171 /// a closure of type `|&Captures| -> String` provides direct access to the
1172 /// captures corresponding to a match. This allows one to access
1173 /// capturing group matches easily:
1174 ///
1175 /// ```rust
1176 /// # use fancy_regex::{Regex, Captures};
1177 /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
1178 /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
1179 /// format!("{} {}", &caps[2], &caps[1])
1180 /// });
1181 /// assert_eq!(result, "Bruce Springsteen");
1182 /// ```
1183 ///
1184 /// But this is a bit cumbersome to use all the time. Instead, a simple
1185 /// syntax is supported that expands `$name` into the corresponding capture
1186 /// group. Here's the last example, but using this expansion technique
1187 /// with named capture groups:
1188 ///
1189 /// ```rust
1190 /// # use fancy_regex::Regex;
1191 /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap();
1192 /// let result = re.replace("Springsteen, Bruce", "$first $last");
1193 /// assert_eq!(result, "Bruce Springsteen");
1194 /// ```
1195 ///
1196 /// Note that using `$2` instead of `$first` or `$1` instead of `$last`
1197 /// would produce the same result. To write a literal `$` use `$$`.
1198 ///
1199 /// Sometimes the replacement string requires use of curly braces to
1200 /// delineate a capture group replacement and surrounding literal text.
1201 /// For example, if we wanted to join two words together with an
1202 /// underscore:
1203 ///
1204 /// ```rust
1205 /// # use fancy_regex::Regex;
1206 /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap();
1207 /// let result = re.replace("deep fried", "${first}_$second");
1208 /// assert_eq!(result, "deep_fried");
1209 /// ```
1210 ///
1211 /// Without the curly braces, the capture group name `first_` would be
1212 /// used, and since it doesn't exist, it would be replaced with the empty
1213 /// string.
1214 ///
1215 /// Finally, sometimes you just want to replace a literal string with no
1216 /// regard for capturing group expansion. This can be done by wrapping a
1217 /// byte string with `NoExpand`:
1218 ///
1219 /// ```rust
1220 /// # use fancy_regex::Regex;
1221 /// use fancy_regex::NoExpand;
1222 ///
1223 /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap();
1224 /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last"));
1225 /// assert_eq!(result, "$2 $last");
1226 /// ```
1227 pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1228 self.replacen(text, 1, rep)
1229 }
1230
1231 /// Replaces all non-overlapping matches in `text` with the replacement
1232 /// provided. This is the same as calling `replacen` with `limit` set to
1233 /// `0`.
1234 ///
1235 /// See the documentation for `replace` for details on how to access
1236 /// capturing group matches in the replacement string.
1237 pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
1238 self.replacen(text, 0, rep)
1239 }
1240
1241 /// Replaces at most `limit` non-overlapping matches in `text` with the
1242 /// replacement provided. If `limit` is 0, then all non-overlapping matches
1243 /// are replaced.
1244 ///
1245 /// Will panic if any errors are encountered. Use `try_replacen`, which this
1246 /// function unwraps, if you want to handle errors.
1247 ///
1248 /// See the documentation for `replace` for details on how to access
1249 /// capturing group matches in the replacement string.
1250 ///
1251 pub fn replacen<'t, R: Replacer>(&self, text: &'t str, limit: usize, rep: R) -> Cow<'t, str> {
1252 self.try_replacen(text, limit, rep).unwrap()
1253 }
1254
1255 /// Replaces at most `limit` non-overlapping matches in `text` with the
1256 /// replacement provided. If `limit` is 0, then all non-overlapping matches
1257 /// are replaced.
1258 ///
1259 /// Propagates any errors encountered, such as `RuntimeError::BacktrackLimitExceeded`.
1260 ///
1261 /// See the documentation for `replace` for details on how to access
1262 /// capturing group matches in the replacement string.
1263 pub fn try_replacen<'t, R: Replacer>(
1264 &self,
1265 text: &'t str,
1266 limit: usize,
1267 mut rep: R,
1268 ) -> Result<Cow<'t, str>> {
1269 // If we know that the replacement doesn't have any capture expansions,
1270 // then we can fast path. The fast path can make a tremendous
1271 // difference:
1272 //
1273 // 1) We use `find_iter` instead of `captures_iter`. Not asking for
1274 // captures generally makes the regex engines faster.
1275 // 2) We don't need to look up all of the capture groups and do
1276 // replacements inside the replacement string. We just push it
1277 // at each match and be done with it.
1278 if let Some(rep) = rep.no_expansion() {
1279 let mut it = self.find_iter(text).enumerate().peekable();
1280 if it.peek().is_none() {
1281 return Ok(Cow::Borrowed(text));
1282 }
1283 let mut new = String::with_capacity(text.len());
1284 let mut last_match = 0;
1285 for (i, m) in it {
1286 let m = m?;
1287
1288 if limit > 0 && i >= limit {
1289 break;
1290 }
1291 new.push_str(&text[last_match..m.start()]);
1292 new.push_str(&rep);
1293 last_match = m.end();
1294 }
1295 new.push_str(&text[last_match..]);
1296 return Ok(Cow::Owned(new));
1297 }
1298
1299 // The slower path, which we use if the replacement needs access to
1300 // capture groups.
1301 let mut it = self.captures_iter(text).enumerate().peekable();
1302 if it.peek().is_none() {
1303 return Ok(Cow::Borrowed(text));
1304 }
1305 let mut new = String::with_capacity(text.len());
1306 let mut last_match = 0;
1307 for (i, cap) in it {
1308 let cap = cap?;
1309
1310 if limit > 0 && i >= limit {
1311 break;
1312 }
1313 // unwrap on 0 is OK because captures only reports matches
1314 let m = cap.get(0).unwrap();
1315 new.push_str(&text[last_match..m.start()]);
1316 rep.replace_append(&cap, &mut new);
1317 last_match = m.end();
1318 }
1319 new.push_str(&text[last_match..]);
1320 Ok(Cow::Owned(new))
1321 }
1322
1323 /// Splits the string by matches of the regex.
1324 ///
1325 /// Returns an iterator over the substrings of the target string
1326 /// that *aren't* matched by the regex.
1327 ///
1328 /// # Example
1329 ///
1330 /// To split a string delimited by arbitrary amounts of spaces or tabs:
1331 ///
1332 /// ```rust
1333 /// # use fancy_regex::Regex;
1334 /// let re = Regex::new(r"[ \t]+").unwrap();
1335 /// let target = "a b \t c\td e";
1336 /// let fields: Vec<&str> = re.split(target).map(|x| x.unwrap()).collect();
1337 /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]);
1338 /// ```
1339 pub fn split<'r, 'h>(&'r self, target: &'h str) -> Split<'r, 'h> {
1340 Split {
1341 matches: self.find_iter(target),
1342 next_start: 0,
1343 target,
1344 }
1345 }
1346
1347 /// Splits the string by matches of the regex at most `limit` times.
1348 ///
1349 /// Returns an iterator over the substrings of the target string
1350 /// that *aren't* matched by the regex.
1351 ///
1352 /// The `N`th substring is the remaining part of the target.
1353 ///
1354 /// # Example
1355 ///
1356 /// To split a string delimited by arbitrary amounts of spaces or tabs
1357 /// 3 times:
1358 ///
1359 /// ```rust
1360 /// # use fancy_regex::Regex;
1361 /// let re = Regex::new(r"[ \t]+").unwrap();
1362 /// let target = "a b \t c\td e";
1363 /// let fields: Vec<&str> = re.splitn(target, 3).map(|x| x.unwrap()).collect();
1364 /// assert_eq!(fields, vec!["a", "b", "c\td e"]);
1365 /// ```
1366 pub fn splitn<'r, 'h>(&'r self, target: &'h str, limit: usize) -> SplitN<'r, 'h> {
1367 SplitN {
1368 splits: self.split(target),
1369 limit,
1370 }
1371 }
1372}
1373
1374impl TryFrom<&str> for Regex {
1375 type Error = Error;
1376
1377 /// Attempts to parse a string into a regular expression
1378 fn try_from(s: &str) -> Result<Self> {
1379 Self::new(s)
1380 }
1381}
1382
1383impl TryFrom<String> for Regex {
1384 type Error = Error;
1385
1386 /// Attempts to parse a string into a regular expression
1387 fn try_from(s: String) -> Result<Self> {
1388 Self::new(&s)
1389 }
1390}
1391
1392impl<'t> Match<'t> {
1393 /// Returns the starting byte offset of the match in the text.
1394 #[inline]
1395 pub fn start(&self) -> usize {
1396 self.start
1397 }
1398
1399 /// Returns the ending byte offset of the match in the text.
1400 #[inline]
1401 pub fn end(&self) -> usize {
1402 self.end
1403 }
1404
1405 /// Returns the range over the starting and ending byte offsets of the match in text.
1406 #[inline]
1407 pub fn range(&self) -> Range<usize> {
1408 self.start..self.end
1409 }
1410
1411 /// Returns the matched text.
1412 #[inline]
1413 pub fn as_str(&self) -> &'t str {
1414 &self.text[self.start..self.end]
1415 }
1416
1417 /// Creates a new match from the given text and byte offsets.
1418 fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
1419 Match { text, start, end }
1420 }
1421}
1422
1423impl<'t> From<Match<'t>> for &'t str {
1424 fn from(m: Match<'t>) -> &'t str {
1425 m.as_str()
1426 }
1427}
1428
1429impl<'t> From<Match<'t>> for Range<usize> {
1430 fn from(m: Match<'t>) -> Range<usize> {
1431 m.range()
1432 }
1433}
1434
1435#[allow(clippy::len_without_is_empty)] // follow regex's API
1436impl<'t> Captures<'t> {
1437 /// Get the capture group by its index in the regex.
1438 ///
1439 /// If there is no match for that group or the index does not correspond to a group, `None` is
1440 /// returned. The index 0 returns the whole match.
1441 pub fn get(&self, i: usize) -> Option<Match<'t>> {
1442 match &self.inner {
1443 CapturesImpl::Wrap {
1444 text,
1445 locations,
1446 explicit_capture_group_0,
1447 } => locations
1448 .get_group(i + if *explicit_capture_group_0 { 1 } else { 0 })
1449 .map(|span| Match {
1450 text,
1451 start: span.start,
1452 end: span.end,
1453 }),
1454 CapturesImpl::Fancy { text, saves } => {
1455 let slot = i * 2;
1456 if slot >= saves.len() {
1457 return None;
1458 }
1459 let lo = saves[slot];
1460 if lo == usize::MAX {
1461 return None;
1462 }
1463 let hi = saves[slot + 1];
1464 Some(Match {
1465 text,
1466 start: lo,
1467 end: hi,
1468 })
1469 }
1470 }
1471 }
1472
1473 /// Returns the match for a named capture group. Returns `None` the capture
1474 /// group did not match or if there is no group with the given name.
1475 pub fn name(&self, name: &str) -> Option<Match<'t>> {
1476 self.named_groups.get(name).and_then(|i| self.get(*i))
1477 }
1478
1479 /// Expands all instances of `$group` in `replacement` to the corresponding
1480 /// capture group `name`, and writes them to the `dst` buffer given.
1481 ///
1482 /// `group` may be an integer corresponding to the index of the
1483 /// capture group (counted by order of opening parenthesis where `\0` is the
1484 /// entire match) or it can be a name (consisting of letters, digits or
1485 /// underscores) corresponding to a named capture group.
1486 ///
1487 /// If `group` isn't a valid capture group (whether the name doesn't exist
1488 /// or isn't a valid index), then it is replaced with the empty string.
1489 ///
1490 /// The longest possible name is used. e.g., `$1a` looks up the capture
1491 /// group named `1a` and not the capture group at index `1`. To exert more
1492 /// precise control over the name, use braces, e.g., `${1}a`.
1493 ///
1494 /// To write a literal `$`, use `$$`.
1495 ///
1496 /// For more control over expansion, see [`Expander`].
1497 ///
1498 /// [`Expander`]: expand/struct.Expander.html
1499 pub fn expand(&self, replacement: &str, dst: &mut String) {
1500 Expander::default().append_expansion(dst, replacement, self);
1501 }
1502
1503 /// Iterate over the captured groups in order in which they appeared in the regex. The first
1504 /// capture corresponds to the whole match.
1505 pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
1506 SubCaptureMatches { caps: self, i: 0 }
1507 }
1508
1509 /// How many groups were captured. This is always at least 1 because group 0 returns the whole
1510 /// match.
1511 pub fn len(&self) -> usize {
1512 match &self.inner {
1513 CapturesImpl::Wrap {
1514 locations,
1515 explicit_capture_group_0,
1516 ..
1517 } => locations.group_len() - if *explicit_capture_group_0 { 1 } else { 0 },
1518 CapturesImpl::Fancy { saves, .. } => saves.len() / 2,
1519 }
1520 }
1521}
1522
1523/// Get a group by index.
1524///
1525/// `'t` is the lifetime of the matched text.
1526///
1527/// The text can't outlive the `Captures` object if this method is
1528/// used, because of how `Index` is defined (normally `a[i]` is part
1529/// of `a` and can't outlive it); to do that, use `get()` instead.
1530///
1531/// # Panics
1532///
1533/// If there is no group at the given index.
1534impl<'t> Index<usize> for Captures<'t> {
1535 type Output = str;
1536
1537 fn index(&self, i: usize) -> &str {
1538 self.get(i)
1539 .map(|m| m.as_str())
1540 .unwrap_or_else(|| panic!("no group at index '{}'", i))
1541 }
1542}
1543
1544/// Get a group by name.
1545///
1546/// `'t` is the lifetime of the matched text and `'i` is the lifetime
1547/// of the group name (the index).
1548///
1549/// The text can't outlive the `Captures` object if this method is
1550/// used, because of how `Index` is defined (normally `a[i]` is part
1551/// of `a` and can't outlive it); to do that, use `name` instead.
1552///
1553/// # Panics
1554///
1555/// If there is no group named by the given value.
1556impl<'t, 'i> Index<&'i str> for Captures<'t> {
1557 type Output = str;
1558
1559 fn index<'a>(&'a self, name: &'i str) -> &'a str {
1560 self.name(name)
1561 .map(|m| m.as_str())
1562 .unwrap_or_else(|| panic!("no group named '{}'", name))
1563 }
1564}
1565
1566impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
1567 type Item = Option<Match<'t>>;
1568
1569 fn next(&mut self) -> Option<Option<Match<'t>>> {
1570 if self.i < self.caps.len() {
1571 let result = self.caps.get(self.i);
1572 self.i += 1;
1573 Some(result)
1574 } else {
1575 None
1576 }
1577 }
1578}
1579
1580// TODO: might be nice to implement ExactSizeIterator etc for SubCaptures
1581
/// Regular expression AST. This is public for now but may change.
///
/// Some variants can be rendered back into the regex crate's syntax by [`Expr::to_str`];
/// the remaining ones make that method panic.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Expr {
    /// An empty expression, e.g. the last branch in `(a|b|)`
    Empty,
    /// Any character, regex `.`
    Any {
        /// Whether it also matches newlines or not
        newline: bool,
    },
    /// An assertion, e.g. start/end of text or a word boundary
    Assertion(Assertion),
    /// The string as a literal, e.g. `a`
    Literal {
        /// The string to match
        val: String,
        /// Whether match is case-insensitive or not
        casei: bool,
    },
    /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of
    /// the literal `a` and `.` for any character
    Concat(Vec<Expr>),
    /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative
    /// where either the literal `a` or `b` must match
    Alt(Vec<Expr>),
    /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures"
    /// (remembers) the match
    Group(Box<Expr>),
    /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g.
    /// `(?=a)` means the next character must be `a` (but the match is not consumed)
    LookAround(Box<Expr>, LookAround),
    /// Repeat of an expression, e.g. `a*` or `a+` or `a{1,3}`
    Repeat {
        /// The expression that is being repeated
        child: Box<Expr>,
        /// The minimum number of repetitions
        lo: usize,
        /// The maximum number of repetitions (or `usize::MAX` if there is no upper bound)
        hi: usize,
        /// Greedy means as much as possible is matched, e.g. `.*b` would match all of `abab`.
        /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`.
        greedy: bool,
    },
    /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have
    /// to represent all the expressions in the AST, e.g. character classes.
    Delegate {
        /// The regex, in the regex crate's syntax
        inner: String,
        /// How many characters the regex matches
        size: usize, // TODO: move into analysis result
        /// Whether the matching is case-insensitive or not
        casei: bool,
    },
    /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group
    /// and the whole regex matches either `abcabc` or `defdef`.
    Backref {
        /// The capture group number being referenced
        group: usize,
        /// Whether the matching is case-insensitive or not
        casei: bool,
    },
    /// Back reference to a capture group at the given specified relative recursion level.
    BackrefWithRelativeRecursionLevel {
        /// The capture group number being referenced
        group: usize,
        /// Relative recursion level
        relative_level: isize,
        /// Whether the matching is case-insensitive or not
        casei: bool,
    },
    /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and
    /// never backtrack and try `a`, even if matching fails after the atomic group.
    AtomicGroup(Box<Expr>),
    /// Keep matched text so far out of overall match
    KeepOut,
    /// Anchor to match at the position where the previous match ended
    ContinueFromPreviousMatchEnd,
    /// Conditional expression based on whether the numbered capture group matched or not
    BackrefExistsCondition(usize),
    /// If/Then/Else Condition. If there is no Then/Else, these will just be empty expressions.
    Conditional {
        /// The conditional expression to evaluate
        condition: Box<Expr>,
        /// What to execute if the condition is true
        true_branch: Box<Expr>,
        /// What to execute if the condition is false
        false_branch: Box<Expr>,
    },
    /// Subroutine call to the specified group number
    SubroutineCall(usize),
    /// Unresolved subroutine call to the specified group name
    UnresolvedNamedSubroutineCall {
        /// The capture group name
        name: String,
        /// The position in the original regex pattern where the subroutine call is made
        ix: usize,
    },
}
1680
/// Type of look-around assertion as used for a look-around expression
/// (the second field of [`Expr::LookAround`]).
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum LookAround {
    /// Look-ahead assertion, e.g. `(?=a)`
    LookAhead,
    /// Negative look-ahead assertion, e.g. `(?!a)`
    LookAheadNeg,
    /// Look-behind assertion, e.g. `(?<=a)`
    LookBehind,
    /// Negative look-behind assertion, e.g. `(?<!a)`
    LookBehindNeg,
}
1693
/// An iterator over capture names in a [Regex]. The iterator
/// returns the name of each group, or [None] if the group has
/// no name. Because capture group 0 cannot have a name, the
/// first item returned is always [None].
///
/// `'r` is the lifetime of the borrowed group names.
pub struct CaptureNames<'r>(vec::IntoIter<Option<&'r str>>);
1699
1700impl Debug for CaptureNames<'_> {
1701 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
1702 f.write_str("<CaptureNames>")
1703 }
1704}
1705
1706impl<'r> Iterator for CaptureNames<'r> {
1707 type Item = Option<&'r str>;
1708
1709 fn next(&mut self) -> Option<Self::Item> {
1710 self.0.next()
1711 }
1712}
1713
// Appends the decimal representation of `x` to `s`.
//
// Hand-written rather than going through the formatting machinery, because this is
// very cheap for the common single-digit case.
fn push_usize(s: &mut String, x: usize) {
    // Find the place value of the most significant digit.
    let mut place = 1;
    while x / place >= 10 {
        place *= 10;
    }
    // Emit the digits from most to least significant.
    while place > 0 {
        let digit = (x / place) % 10;
        s.push((b'0' + digit as u8) as char);
        place /= 10;
    }
}
1724
// Returns true for the characters that `push_quoted` and `escape` prefix with a backslash.
fn is_special(c: char) -> bool {
    const SPECIAL_CHARS: &str = r"\.+*?()|[]{}^$#";
    SPECIAL_CHARS.contains(c)
}
1731
1732fn push_quoted(buf: &mut String, s: &str) {
1733 for c in s.chars() {
1734 if is_special(c) {
1735 buf.push('\\');
1736 }
1737 buf.push(c);
1738 }
1739}
1740
1741/// Escapes special characters in `text` with '\\'. Returns a string which, when interpreted
1742/// as a regex, matches exactly `text`.
1743pub fn escape(text: &str) -> Cow<'_, str> {
1744 // Using bytes() is OK because all special characters are single bytes.
1745 match text.bytes().filter(|&b| is_special(b as char)).count() {
1746 0 => Cow::Borrowed(text),
1747 n => {
1748 // The capacity calculation is exact because '\\' is a single byte.
1749 let mut buf = String::with_capacity(text.len() + n);
1750 push_quoted(&mut buf, text);
1751 Cow::Owned(buf)
1752 }
1753 }
1754}
1755
/// Type of assertions, as carried by [`Expr::Assertion`].
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum Assertion {
    /// Start of input text
    StartText,
    /// End of input text
    EndText,
    /// Start of a line
    StartLine {
        /// CRLF mode
        crlf: bool,
    },
    /// End of a line
    EndLine {
        /// CRLF mode
        crlf: bool,
    },
    /// Left word boundary
    LeftWordBoundary,
    /// Left word half boundary
    LeftWordHalfBoundary,
    /// Right word boundary
    RightWordBoundary,
    /// Right word half boundary
    RightWordHalfBoundary,
    /// Both word boundaries
    WordBoundary,
    /// Not word boundary
    NotWordBoundary,
}
1786
1787impl Assertion {
1788 pub(crate) fn is_hard(&self) -> bool {
1789 use Assertion::*;
1790 matches!(
1791 self,
1792 // these will make regex-automata use PikeVM
1793 LeftWordBoundary
1794 | LeftWordHalfBoundary
1795 | RightWordBoundary
1796 | RightWordHalfBoundary
1797 | WordBoundary
1798 | NotWordBoundary
1799 )
1800 }
1801}
1802
1803impl Expr {
1804 /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
1805 /// that are referenced by backrefs.
1806 pub fn parse_tree(re: &str) -> Result<ExprTree> {
1807 Parser::parse(re)
1808 }
1809
1810 /// Parse the regex and return an expression (AST)
1811 /// Flags should be bit based based on flags
1812 pub fn parse_tree_with_flags(re: &str, flags: u32) -> Result<ExprTree> {
1813 Parser::parse_with_flags(re, flags)
1814 }
1815
1816 /// Convert expression to a regex string in the regex crate's syntax.
1817 ///
1818 /// # Panics
1819 ///
1820 /// Panics for expressions that are hard, i.e. can not be handled by the regex crate.
1821 pub fn to_str(&self, buf: &mut String, precedence: u8) {
1822 match *self {
1823 Expr::Empty => (),
1824 Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
1825 Expr::Literal { ref val, casei } => {
1826 if casei {
1827 buf.push_str("(?i:");
1828 }
1829 push_quoted(buf, val);
1830 if casei {
1831 buf.push(')');
1832 }
1833 }
1834 Expr::Assertion(Assertion::StartText) => buf.push('^'),
1835 Expr::Assertion(Assertion::EndText) => buf.push('$'),
1836 Expr::Assertion(Assertion::StartLine { crlf: false }) => buf.push_str("(?m:^)"),
1837 Expr::Assertion(Assertion::EndLine { crlf: false }) => buf.push_str("(?m:$)"),
1838 Expr::Assertion(Assertion::StartLine { crlf: true }) => buf.push_str("(?Rm:^)"),
1839 Expr::Assertion(Assertion::EndLine { crlf: true }) => buf.push_str("(?Rm:$)"),
1840 Expr::Concat(ref children) => {
1841 if precedence > 1 {
1842 buf.push_str("(?:");
1843 }
1844 for child in children {
1845 child.to_str(buf, 2);
1846 }
1847 if precedence > 1 {
1848 buf.push(')')
1849 }
1850 }
1851 Expr::Alt(ref children) => {
1852 if precedence > 0 {
1853 buf.push_str("(?:");
1854 }
1855 for (i, child) in children.iter().enumerate() {
1856 if i != 0 {
1857 buf.push('|');
1858 }
1859 child.to_str(buf, 1);
1860 }
1861 if precedence > 0 {
1862 buf.push(')');
1863 }
1864 }
1865 Expr::Group(ref child) => {
1866 buf.push('(');
1867 child.to_str(buf, 0);
1868 buf.push(')');
1869 }
1870 Expr::Repeat {
1871 ref child,
1872 lo,
1873 hi,
1874 greedy,
1875 } => {
1876 if precedence > 2 {
1877 buf.push_str("(?:");
1878 }
1879 child.to_str(buf, 3);
1880 match (lo, hi) {
1881 (0, 1) => buf.push('?'),
1882 (0, usize::MAX) => buf.push('*'),
1883 (1, usize::MAX) => buf.push('+'),
1884 (lo, hi) => {
1885 buf.push('{');
1886 push_usize(buf, lo);
1887 if lo != hi {
1888 buf.push(',');
1889 if hi != usize::MAX {
1890 push_usize(buf, hi);
1891 }
1892 }
1893 buf.push('}');
1894 }
1895 }
1896 if !greedy {
1897 buf.push('?');
1898 }
1899 if precedence > 2 {
1900 buf.push(')');
1901 }
1902 }
1903 Expr::Delegate {
1904 ref inner, casei, ..
1905 } => {
1906 // at the moment, delegate nodes are just atoms
1907 if casei {
1908 buf.push_str("(?i:");
1909 }
1910 buf.push_str(inner);
1911 if casei {
1912 buf.push(')');
1913 }
1914 }
1915 _ => panic!("attempting to format hard expr {:?}", self),
1916 }
1917 }
1918}
1919
// Returns the index of the nearest byte before `ix` that is not a UTF-8 continuation
// byte, i.e. the start of the code point that ends at or covers `ix - 1`.
//
// precondition: ix > 0
fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
    let bytes = s.as_bytes();
    ix -= 1;
    // Continuation bytes have the bit pattern 10xxxxxx; step back over them.
    while (bytes[ix] & 0xC0) == 0x80 {
        ix -= 1;
    }
    ix
}
1932
/// Returns the byte length of a UTF-8 sequence, judged solely from the value
/// of its first byte `b`.
///
/// The byte is not validated: every value below `0x80` yields 1, every value
/// in `0x80..=0xdf` yields 2, every value in `0xe0..=0xef` yields 3, and
/// everything else yields 4.
fn codepoint_len(b: u8) -> usize {
    match b {
        0x00..=0x7f => 1,
        0x80..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xff => 4,
    }
}
1941
1942/// Returns the smallest possible index of the next valid UTF-8 sequence
1943/// starting after `i`.
1944/// Adapted from a function with the same name in the `regex` crate.
1945fn next_utf8(text: &str, i: usize) -> usize {
1946 let b = match text.as_bytes().get(i) {
1947 None => return i + 1,
1948 Some(&b) => b,
1949 };
1950 i + codepoint_len(b)
1951}
1952
// If this returns false, then there is no possible backref in the regex.
//
// Both potential implementations below are turned off: we currently always
// need to do a deeper analysis because of 1-character look-behind. If we
// could call a `find_from_pos` method of `regex::Regex`, it would make sense
// to bring this back.
1959/*
1960pub fn detect_possible_backref(re: &str) -> bool {
1961 let mut last = b'\x00';
1962 for b in re.as_bytes() {
1963 if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; }
1964 last = *b;
1965 }
1966 false
1967}
1968
1969pub fn detect_possible_backref(re: &str) -> bool {
1970 let mut bytes = re.as_bytes();
1971 loop {
1972 match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) {
1973 Some(i) => {
1974 bytes = &bytes[i + 1..];
1975 let c = bytes[0];
1976 if b'0' <= c && c <= b'9' { return true; }
1977 }
1978 None => return false
1979 }
1980 }
1981}
1982*/
1983
1984/// The internal module only exists so that the toy example can access internals for debugging and
1985/// experimenting.
1986#[doc(hidden)]
1987pub mod internal {
1988 pub use crate::analyze::{analyze, can_compile_as_anchored};
1989 pub use crate::compile::compile;
1990 pub use crate::optimize::optimize;
1991 pub use crate::parse_flags::{
1992 FLAG_CASEI, FLAG_DOTNL, FLAG_IGNORE_SPACE, FLAG_MULTI, FLAG_ONIGURUMA_MODE, FLAG_UNICODE,
1993 };
1994 pub use crate::vm::{run_default, run_trace, Insn, Prog};
1995}
1996
#[cfg(test)]
mod tests {
    use alloc::borrow::Cow;
    use alloc::boxed::Box;
    use alloc::string::String;
    use alloc::{format, vec};

    use crate::parse::make_literal;
    use crate::{Expr, Regex, RegexImpl};

    //use detect_possible_backref;

    // ---- Expr::to_str rendering ----

    /// Renders `expr` into a fresh string by calling `Expr::to_str` with precedence 0.
    fn to_str(expr: Expr) -> String {
        let mut rendered = String::new();
        expr.to_str(&mut rendered, 0);
        rendered
    }

    /// An alternation nested inside a concatenation must be wrapped in a non-capturing group.
    #[test]
    fn to_str_concat_alt() {
        let alternation = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let concat = Expr::Concat(vec![alternation, make_literal("c")]);
        assert_eq!(to_str(concat), "(?:a|b)c");
    }

    /// A repeated concatenation must be wrapped in a non-capturing group.
    #[test]
    fn to_str_rep_concat() {
        let body = Expr::Concat(vec![make_literal("a"), make_literal("b")]);
        let repeated = Expr::Repeat {
            child: Box::new(body),
            lo: 2,
            hi: 3,
            greedy: true,
        };
        assert_eq!(to_str(repeated), "(?:ab){2,3}");
    }

    /// A capture group already delimits its child, so no extra group is emitted.
    #[test]
    fn to_str_group_alt() {
        let alternation = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let group = Expr::Group(Box::new(alternation));
        assert_eq!(to_str(group), "(a|b)");
    }

    // ---- Round-tripping the pattern text through `Regex` ----

    #[test]
    fn as_str_debug() {
        let pattern = r"(a+)b\1";
        let compiled = Regex::new(pattern).unwrap();
        assert_eq!(pattern, compiled.as_str());
        assert_eq!(pattern, format!("{:?}", compiled));
    }

    #[test]
    fn display() {
        let pattern = r"(a+)b\1";
        let compiled = Regex::new(pattern).unwrap();
        assert_eq!(pattern, format!("{}", compiled));
    }

    #[test]
    fn from_str() {
        let pattern = r"(a+)b\1";
        let compiled: Regex = pattern.parse().unwrap();
        assert_eq!(compiled.as_str(), pattern);
    }

    /// Checks every quantifier spelling: `{n}`, `{n,m}`, `{n,}`, `?`, `*`, `+`, each in its
    /// greedy and lazy form.
    #[test]
    fn to_str_repeat() {
        // (lo, hi, greedy, expected rendering of a repeated literal "a")
        let cases = [
            (2, 2, true, "a{2}"),
            (2, 2, false, "a{2}?"),
            (2, 3, true, "a{2,3}"),
            (2, 3, false, "a{2,3}?"),
            (2, usize::MAX, true, "a{2,}"),
            (2, usize::MAX, false, "a{2,}?"),
            (0, 1, true, "a?"),
            (0, 1, false, "a??"),
            (0, usize::MAX, true, "a*"),
            (0, usize::MAX, false, "a*?"),
            (1, usize::MAX, true, "a+"),
            (1, usize::MAX, false, "a+?"),
        ];

        for &(lo, hi, greedy, expected) in cases.iter() {
            let repeated = Expr::Repeat {
                child: Box::new(make_literal("a")),
                lo,
                hi,
                greedy,
            };
            assert_eq!(to_str(repeated), expected);
        }
    }

    #[test]
    fn escape() {
        // Input that needs no quoting comes back borrowed, and non-special punctuation is
        // left unquoted.
        if let Cow::Borrowed(unchanged) = crate::escape("@foo") {
            assert_eq!(unchanged, "@foo");
        } else {
            panic!("Value should be borrowed.");
        }

        // Typical usage: a metacharacter gets a backslash.
        let escaped = crate::escape("fo*o").into_owned();
        assert_eq!(escaped, "fo\\*o");

        // Multibyte characters pass through intact.
        let escaped_multibyte = crate::escape("fø*ø").into_owned();
        assert_eq!(escaped_multibyte, "fø\\*ø");
    }

    // ---- Which backend a pattern is compiled to ----

    #[test]
    fn trailing_positive_lookahead_wrap_capture_group_fixup() {
        let pattern = r"a+(?=c)";
        let compiled: Regex = pattern.parse().unwrap();
        let wrapped_with_explicit_group_0 = matches!(
            compiled.inner,
            RegexImpl::Wrap {
                explicit_capture_group_0: true,
                ..
            }
        );
        assert!(
            wrapped_with_explicit_group_0,
            "trailing positive lookahead for an otherwise easy pattern should avoid going through the VM"
        );
        assert_eq!(pattern, compiled.as_str());
        assert_eq!(pattern, format!("{:?}", compiled));
    }

    #[test]
    fn easy_regex() {
        let pattern = r"(a+)b";
        let compiled: Regex = pattern.parse().unwrap();
        let wrapped_with_implicit_group_0 = matches!(
            compiled.inner,
            RegexImpl::Wrap {
                explicit_capture_group_0: false,
                ..
            }
        );
        assert!(
            wrapped_with_implicit_group_0,
            "easy pattern should avoid going through the VM, and capture group 0 should be implicit"
        );

        assert_eq!(pattern, compiled.as_str());
        assert_eq!(pattern, format!("{:?}", compiled));
    }

    #[test]
    fn hard_regex() {
        let pattern = r"(a+)(?>c)";
        let compiled: Regex = pattern.parse().unwrap();
        let uses_vm = matches!(compiled.inner, RegexImpl::Fancy { .. });
        assert!(uses_vm, "hard regex should be compiled into a VM");
        assert_eq!(pattern, compiled.as_str());
        assert_eq!(pattern, format!("{:?}", compiled));
    }

    // Disabled together with the commented-out `detect_possible_backref` implementations.
    /*
    #[test]
    fn detect_backref() {
        assert_eq!(detect_possible_backref("a0a1a2"), false);
        assert_eq!(detect_possible_backref("a0a1\\a2"), false);
        assert_eq!(detect_possible_backref("a0a\\1a2"), true);
        assert_eq!(detect_possible_backref("a0a1a2\\"), false);
    }
    */
}