fancy_regex_fork_pb/lib.rs
1// Copyright 2016 The Fancy Regex Authors.
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy
4// of this software and associated documentation files (the "Software"), to deal
5// in the Software without restriction, including without limitation the rights
6// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7// copies of the Software, and to permit persons to whom the Software is
8// furnished to do so, subject to the following conditions:
9//
10// The above copyright notice and this permission notice shall be included in
11// all copies or substantial portions of the Software.
12//
13// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19// THE SOFTWARE.
20
21/*!
22An implementation of regexes, supporting a relatively rich set of features, including backreferences
23and lookaround.
24
25It builds on top of the excellent [regex] crate. If you are not
26familiar with it, make sure you read its documentation and maybe you don't even need fancy-regex.
27
28If your regex or parts of it does not use any special features, the matching is delegated to the
29regex crate. That means it has linear runtime. But if you use "fancy" features such as
30backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex
31can be slow and take exponential time to run because of what is called "catastrophic backtracking".
32This depends on the regex and the input.
33
34# Usage
35
36The API should feel very similar to the regex crate, and involves compiling a regex and then using
37it to find matches in text.
38
39## Example: Matching text
40
41An example with backreferences to check if a text consists of two identical words:
42
43```rust
44use fancy_regex::Regex;
45
46let re = Regex::new(r"^(\w+) (\1)$").unwrap();
47let result = re.is_match("foo foo");
48
49assert!(result.is_ok());
50let did_match = result.unwrap();
51assert!(did_match);
52```
53
54Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the
55entire input text.
56
57## Example: Finding the position of matches
58
59```rust
60use fancy_regex::Regex;
61
62let re = Regex::new(r"(\d)\1").unwrap();
63let result = re.find("foo 22");
64
65assert!(result.is_ok(), "execution was successful");
66let match_option = result.unwrap();
67
68assert!(match_option.is_some(), "found a match");
69let m = match_option.unwrap();
70
71assert_eq!(m.start(), 4);
72assert_eq!(m.end(), 6);
73assert_eq!(m.as_str(), "22");
74```
75
76## Example: Capturing groups
77
78```rust
79use fancy_regex::Regex;
80
81let re = Regex::new(r"(?<!AU)\$(\d+)").unwrap();
82let result = re.captures("AU$10, $20");
83
84let captures = result.expect("Error running regex").expect("No match found");
85let group = captures.get(1).expect("No group");
86assert_eq!(group.as_str(), "20");
87```
88
89# Syntax
90
91The regex syntax is based on the [regex] crate's, with some additional supported syntax. Escapes:
92
93```norun
94\h hex digit ([0-9A-Fa-f])
95\H not hex digit ([^0-9A-Fa-f])
96\e escape control character (\x1B)
97```
98
99Backreferences:
100
101```norun
102\1 match the exact string that the first capture group matched
103\2 backref to the second capture group, etc
104```
105
106Look-around assertions for matching without changing the current position:
107
108```norun
109(?=exp) look-ahead, succeeds if exp matches to the right of the current position
110(?!exp) negative look-ahead, succeeds if exp doesn't match to the right
111(?<=exp) look-behind, succeeds if exp matches to the left of the current position
112(?<!exp) negative look-behind, succeeds if exp doesn't match to the left
113```
114
115Atomic groups using `(?>exp)` to prevent backtracking within `exp`, e.g.:
116
117```
118# use fancy_regex::Regex;
119let re = Regex::new(r"^a(?>bc|b)c$").unwrap();
120assert!(re.is_match("abcc").unwrap());
121// Doesn't match because `|b` is never tried because of the atomic group
122assert!(!re.is_match("abc").unwrap());
123```
124
125[regex]: https://crates.io/crates/regex
126*/
127
128#![doc(html_root_url = "https://docs.rs/fancy-regex/0.3.1")]
129#![deny(missing_docs)]
130#![deny(missing_debug_implementations)]
131
132use bit_set::BitSet;
133use std::fmt;
134use std::usize;
135
136mod analyze;
137mod compile;
138mod error;
139mod parse;
140mod vm;
141
142use crate::analyze::analyze;
143use crate::compile::compile;
144use crate::parse::Parser;
145use crate::vm::Prog;
146
147pub use crate::error::{Error, Result};
148
149const MAX_RECURSION: usize = 64;
150
151// the public API
152
153/// A builder for a `Regex` to allow configuring options.
154#[derive(Debug)]
155pub struct RegexBuilder(RegexOptions);
156
157/// A compiled regular expression.
158pub struct Regex(RegexImpl);
159
160// Separate enum because we don't want to expose any of this
161enum RegexImpl {
162 // Do we want to box this? It's pretty big...
163 Wrap {
164 inner: regex::Regex,
165 inner1: Option<Box<regex::Regex>>,
166 options: RegexOptions,
167 },
168 Fancy {
169 prog: Prog,
170 n_groups: usize,
171 options: RegexOptions,
172 },
173}
174
175/// A single match of a regex or group in an input text
176#[derive(Copy, Clone, Debug, Eq, PartialEq)]
177pub struct Match<'t> {
178 text: &'t str,
179 start: usize,
180 end: usize,
181}
182
183/// A set of capture groups found for a regex.
184#[derive(Debug)]
185pub struct Captures<'t>(CapturesImpl<'t>);
186
187#[derive(Debug)]
188enum CapturesImpl<'t> {
189 Wrap {
190 text: &'t str,
191 inner: regex::Captures<'t>,
192
193 // starting position, in _from_pos variants
194 offset: usize,
195
196 enclosing_groups: usize,
197 },
198 Fancy {
199 text: &'t str,
200 saves: Vec<usize>,
201 },
202}
203
204/// Iterator for captured groups in order in which they appear in the regex.
205#[derive(Debug)]
206pub struct SubCaptureMatches<'c, 't> {
207 caps: &'c Captures<'t>,
208 i: usize,
209}
210
211#[derive(Clone, Debug)]
212struct RegexOptions {
213 pattern: String,
214 backtrack_limit: usize,
215 delegate_size_limit: Option<usize>,
216 delegate_dfa_size_limit: Option<usize>,
217 case_insensitive: bool,
218 multi_line: bool,
219 dot_matches_new_line: bool,
220 unicode: bool,
221}
222
223impl Default for RegexOptions {
224 fn default() -> Self {
225 RegexOptions {
226 pattern: String::new(),
227 backtrack_limit: 1_000_000,
228 delegate_size_limit: None,
229 delegate_dfa_size_limit: None,
230 case_insensitive: false,
231 multi_line: false,
232 dot_matches_new_line: false,
233 unicode: false,
234 }
235 }
236}
237
238impl RegexBuilder {
239 /// Create a new regex builder with a regex pattern.
240 ///
241 /// If the pattern is invalid, the call to `build` will fail later.
242 pub fn new(pattern: &str) -> Self {
243 let mut builder = RegexBuilder(RegexOptions::default());
244 if builder.0.case_insensitive || builder.0.multi_line || builder.0.dot_matches_new_line || builder.0.unicode {
245 let flags = format!(
246 "{}{}{}{}",
247 if builder.0.case_insensitive { "i" } else { "" },
248 if builder.0.multi_line { "m" } else { "" },
249 if builder.0.dot_matches_new_line { "s" } else { "" },
250 if builder.0.unicode { "u" } else { "" },
251 );
252 builder.0.pattern = format!("(?{}){}", &flags, pattern);
253 } else {
254 builder.0.pattern = pattern.to_string();
255 }
256 builder
257 }
258
259 /// Build the `Regex`.
260 ///
261 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
262 pub fn build(&self) -> Result<Regex> {
263 Regex::new_options(self.0.clone())
264 }
265
266 /// Limit for how many times backtracking should be attempted for fancy regexes (where
267 /// backtracking is used). If this limit is exceeded, execution returns an error with
268 /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded).
269 /// This is for preventing a regex with catastrophic backtracking to run for too long.
270 ///
271 /// Default is `1_000_000` (1 million).
272 pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
273 self.0.backtrack_limit = limit;
274 self
275 }
276
277 /// Set the approximate size limit of the compiled regular expression.
278 ///
279 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
280 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
281 /// such the actual limit is closer to `<number of delegated regexes> * delegate_size_limit`.
282 pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
283 self.0.delegate_size_limit = Some(limit);
284 self
285 }
286
287 /// Set the approximate size of the cache used by the DFA.
288 ///
289 /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used
290 /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As
291 /// such the actual limit is closer to `<number of delegated regexes> *
292 /// delegate_dfa_size_limit`.
293 pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
294 self.0.delegate_dfa_size_limit = Some(limit);
295 self
296 }
297
298 /// Set the case insensitivity flag.
299 pub fn case_insensitive(&mut self, value: bool) -> &mut Self {
300 self.0.case_insensitive = value;
301 self
302 }
303
304 /// Set the multi-line flag.
305 pub fn multi_line(&mut self, value: bool) -> &mut Self {
306 self.0.multi_line = value;
307 self
308 }
309
310 /// Set the dot-matches-new-line flag.
311 pub fn dot_matches_new_line(&mut self, value: bool) -> &mut Self {
312 self.0.dot_matches_new_line = value;
313 self
314 }
315
316 /// Set the unicode flag.
317 pub fn unicode(&mut self, value: bool) -> &mut Self {
318 self.0.unicode = value;
319 self
320 }
321}
322
323impl fmt::Debug for Regex {
324 /// Shows the original regular expression.
325 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
326 write!(f, "{}", self.as_str())
327 }
328}
329
330impl Regex {
331 /// Parse and compile a regex with default options, see `RegexBuilder`.
332 ///
333 /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed.
334 pub fn new(re: &str) -> Result<Regex> {
335 let options = RegexOptions {
336 pattern: re.to_string(),
337 ..RegexOptions::default()
338 };
339 Self::new_options(options)
340 }
341
342 fn new_options(options: RegexOptions) -> Result<Regex> {
343 let (raw_e, backrefs) = Expr::parse(&options.pattern)?;
344
345 // wrapper to search for re at arbitrary start position,
346 // and to capture the match bounds
347 let e = Expr::Concat(vec![
348 Expr::Repeat {
349 child: Box::new(Expr::Any { newline: true }),
350 lo: 0,
351 hi: usize::MAX,
352 greedy: false,
353 },
354 Expr::Group(Box::new(raw_e)),
355 ]);
356
357 let info = analyze(&e, &backrefs)?;
358
359 let inner_info = &info.children[1].children[0]; // references inner expr
360 if !inner_info.hard {
361 // easy case, wrap regex
362
363 // we do our own to_str because escapes are different
364 let mut re_cooked = String::new();
365 // same as raw_e above, but it was moved, so traverse to find it
366 let raw_e = match e {
367 Expr::Concat(ref v) => match v[1] {
368 Expr::Group(ref child) => child,
369 _ => unreachable!(),
370 },
371 _ => unreachable!(),
372 };
373 raw_e.to_str(&mut re_cooked, 0);
374 let inner = compile::compile_inner(&re_cooked, &options)?;
375 let inner1 = if inner_info.looks_left {
376 // create regex to handle 1-char look-behind
377 let re1 = ["^(?s:.)+?(", re_cooked.as_str(), ")"].concat();
378 let compiled = compile::compile_inner(&re1, &options)?;
379 Some(Box::new(compiled))
380 } else {
381 None
382 };
383 return Ok(Regex(RegexImpl::Wrap {
384 inner,
385 inner1,
386 options,
387 }));
388 }
389
390 let prog = compile(&info)?;
391 Ok(Regex(RegexImpl::Fancy {
392 prog,
393 n_groups: info.end_group,
394 options,
395 }))
396 }
397
398 /// Returns the original string of this regex.
399 pub fn as_str(&self) -> &str {
400 match &self.0 {
401 RegexImpl::Wrap { options, .. } => &options.pattern,
402 RegexImpl::Fancy { options, .. } => &options.pattern,
403 }
404 }
405
406 /// Check if the regex matches the input text.
407 ///
408 /// # Example
409 ///
410 /// Test if some text contains the same word twice:
411 ///
412 /// ```rust
413 /// # use fancy_regex::Regex;
414 ///
415 /// let re = Regex::new(r"(\w+) \1").unwrap();
416 /// assert!(re.is_match("mirror mirror on the wall").unwrap());
417 /// ```
418 pub fn is_match(&self, text: &str) -> Result<bool> {
419 match &self.0 {
420 RegexImpl::Wrap { ref inner, .. } => Ok(inner.is_match(text)),
421 RegexImpl::Fancy {
422 ref prog, options, ..
423 } => {
424 let result = vm::run(prog, text, 0, 0, options)?;
425 Ok(result.is_some())
426 }
427 }
428 }
429
430 /// Find the first match in the input text.
431 ///
432 /// If you have capturing groups in your regex that you want to extract, use the [captures()]
433 /// method.
434 ///
435 /// # Example
436 ///
437 /// Find a word that is followed by an exclamation point:
438 ///
439 /// ```rust
440 /// # use fancy_regex::Regex;
441 ///
442 /// let re = Regex::new(r"\w+(?=!)").unwrap();
443 /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy");
444 /// ```
445 pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
446 match &self.0 {
447 RegexImpl::Wrap { inner, .. } => Ok(inner
448 .find(text)
449 .map(|m| Match::new(text, m.start(), m.end()))),
450 RegexImpl::Fancy { prog, options, .. } => {
451 let result = vm::run(prog, text, 0, 0, options)?;
452 Ok(result.map(|saves| Match::new(text, saves[0], saves[1])))
453 }
454 }
455 }
456
457 /// Returns the capture groups for the first match in `text`.
458 ///
459 /// If no match is found, then `Ok(None)` is returned.
460 ///
461 /// # Examples
462 ///
463 /// Finding matches and capturing parts of the match:
464 ///
465 /// ```rust
466 /// # use fancy_regex::Regex;
467 ///
468 /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
469 /// let text = "The date was 2018-04-07";
470 /// let captures = re.captures(text).unwrap().unwrap();
471 ///
472 /// assert_eq!(captures.get(1).unwrap().as_str(), "2018");
473 /// assert_eq!(captures.get(2).unwrap().as_str(), "04");
474 /// assert_eq!(captures.get(3).unwrap().as_str(), "07");
475 /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07");
476 /// ```
477 pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
478 match &self.0 {
479 RegexImpl::Wrap { inner, .. } => Ok(inner.captures(text).map(|caps| {
480 Captures(CapturesImpl::Wrap {
481 text,
482 inner: caps,
483 offset: 0,
484 enclosing_groups: 0,
485 })
486 })),
487 RegexImpl::Fancy {
488 prog,
489 n_groups,
490 options,
491 ..
492 } => {
493 let result = vm::run(prog, text, 0, 0, options)?;
494 Ok(result.map(|mut saves| {
495 saves.truncate(n_groups * 2);
496 Captures(CapturesImpl::Fancy { text, saves })
497 }))
498 }
499 }
500 }
501
502 /// Returns the capture groups for the first match in `text`, starting from
503 /// the specified byte position `pos`.
504 ///
505 /// # Examples
506 ///
507 /// Finding captures starting at a position:
508 ///
509 /// ```
510 /// # use fancy_regex::Regex;
511 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
512 /// let text = "1 test 123\n2 foo";
513 /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap();
514 ///
515 /// let group = captures.get(1).unwrap();
516 /// assert_eq!(group.as_str(), "2");
517 /// assert_eq!(group.start(), 11);
518 /// assert_eq!(group.end(), 12);
519 /// ```
520 ///
521 /// Note that in some cases this is not the same as using the `captures`
522 /// methods and passing a slice of the string, see the capture that we get
523 /// when we do this:
524 ///
525 /// ```
526 /// # use fancy_regex::Regex;
527 /// let re = Regex::new(r"(?m:^)(\d+)").unwrap();
528 /// let text = "1 test 123\n2 foo";
529 /// let captures = re.captures(&text[7..]).unwrap().unwrap();
530 /// assert_eq!(captures.get(1).unwrap().as_str(), "123");
531 /// ```
532 ///
533 /// This matched the number "123" because it's at the beginning of the text
534 /// of the string slice.
535 ///
536 pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
537 match &self.0 {
538 RegexImpl::Wrap { inner, inner1, .. } => {
539 if inner1.is_none() || pos == 0 {
540 let result = inner.captures(&text[pos..]);
541 Ok(result.map(|caps| {
542 Captures(CapturesImpl::Wrap {
543 text,
544 inner: caps,
545 offset: pos,
546 enclosing_groups: 0,
547 })
548 }))
549 } else {
550 let ix = prev_codepoint_ix(text, pos);
551 let inner1 = inner1.as_ref().unwrap();
552 let result = inner1.captures(&text[ix..]);
553 Ok(result.map(|caps| {
554 Captures(CapturesImpl::Wrap {
555 text,
556 inner: caps,
557 offset: ix,
558 enclosing_groups: 1,
559 })
560 }))
561 }
562 }
563 RegexImpl::Fancy {
564 prog,
565 n_groups,
566 options,
567 ..
568 } => {
569 let result = vm::run(prog, text, pos, 0, options)?;
570 Ok(result.map(|mut saves| {
571 saves.truncate(n_groups * 2);
572 Captures(CapturesImpl::Fancy { text, saves })
573 }))
574 }
575 }
576 }
577
578 // for debugging only
579 #[doc(hidden)]
580 pub fn debug_print(&self) {
581 match &self.0 {
582 RegexImpl::Wrap { inner, .. } => println!("wrapped {:?}", inner),
583 RegexImpl::Fancy { prog, .. } => prog.debug_print(),
584 }
585 }
586}
587
588impl<'t> Match<'t> {
589 /// Returns the starting byte offset of the match in the text.
590 #[inline]
591 pub fn start(&self) -> usize {
592 self.start
593 }
594
595 /// Returns the ending byte offset of the match in the text.
596 #[inline]
597 pub fn end(&self) -> usize {
598 self.end
599 }
600
601 /// Returns the matched text.
602 #[inline]
603 pub fn as_str(&self) -> &'t str {
604 &self.text[self.start..self.end]
605 }
606
607 fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
608 Match { text, start, end }
609 }
610}
611
612impl<'t> Captures<'t> {
613 /// Get the capture group by its index in the regex.
614 ///
615 /// If there is no match for that group or the index does not correspond to a group, `None` is
616 /// returned. The index 0 returns the whole match.
617 pub fn get(&self, i: usize) -> Option<Match<'t>> {
618 match &self.0 {
619 CapturesImpl::Wrap {
620 text,
621 inner,
622 offset,
623 enclosing_groups,
624 } => inner.get(i + *enclosing_groups).map(|m| Match {
625 text,
626 start: m.start() + *offset,
627 end: m.end() + *offset,
628 }),
629 CapturesImpl::Fancy { text, ref saves } => {
630 let slot = i * 2;
631 if slot >= saves.len() {
632 return None;
633 }
634 let lo = saves[slot];
635 if lo == std::usize::MAX {
636 return None;
637 }
638 let hi = saves[slot + 1];
639 Some(Match {
640 text,
641 start: lo,
642 end: hi,
643 })
644 }
645 }
646 }
647
648 /// Iterate over the captured groups in order in which they appeared in the regex. The first
649 /// capture corresponds to the whole match.
650 pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
651 SubCaptureMatches { caps: self, i: 0 }
652 }
653
654 /// How many groups were captured.
655 pub fn len(&self) -> usize {
656 match self.0 {
657 CapturesImpl::Wrap {
658 ref inner,
659 enclosing_groups,
660 ..
661 } => inner.len() - enclosing_groups,
662 CapturesImpl::Fancy { ref saves, .. } => saves.len() / 2,
663 }
664 }
665}
666
667impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
668 type Item = Option<Match<'t>>;
669
670 fn next(&mut self) -> Option<Option<Match<'t>>> {
671 if self.i < self.caps.len() {
672 let result = self.caps.get(self.i);
673 self.i += 1;
674 Some(result)
675 } else {
676 None
677 }
678 }
679}
680
681// TODO: might be nice to implement ExactSizeIterator etc for SubCaptures
682
683/// Regular expression AST. This is public for now but may change.
684#[derive(Debug, PartialEq, Eq)]
685pub enum Expr {
686 /// An empty expression, e.g. the last branch in `(a|b|)`
687 Empty,
688 /// Any character, regex `.`
689 Any {
690 /// Whether it also matches newlines or not
691 newline: bool,
692 },
693 /// Start of input text
694 StartText,
695 /// End of input text
696 EndText,
697 /// Start of a line
698 StartLine,
699 /// End of a line
700 EndLine,
701 /// The string as a literal, e.g. `a`
702 Literal {
703 /// The string to match
704 val: String,
705 /// Whether match is case-insensitive or not
706 casei: bool,
707 },
708 /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of
709 /// the literal `a` and `.` for any character
710 Concat(Vec<Expr>),
711 /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative
712 /// where either the literal `a` or `b` must match
713 Alt(Vec<Expr>),
714 /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures"
715 /// (remembers) the match
716 Group(Box<Expr>),
717 /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g.
718 /// `(?=a)` means the next character must be `a` (but the match is not consumed)
719 LookAround(Box<Expr>, LookAround),
720 /// Repeat of an expression, e.g. `a*` or `a+` or `a{1,3}`
721 Repeat {
722 /// The expression that is being repeated
723 child: Box<Expr>,
724 /// The minimum number of repetitions
725 lo: usize,
726 /// The maximum number of repetitions (or `usize::MAX`)
727 hi: usize,
728 /// Greedy means as much as possible is matched, e.g. `.*b` would match all of `abab`.
729 /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`.
730 greedy: bool,
731 },
732 /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have
733 /// to represent all the expressions in the AST, e.g. character classes.
734 Delegate {
735 /// The regex
736 inner: String,
737 /// How many characters the regex matches
738 size: usize, // TODO: move into analysis result
739 /// Whether the matching is case-insensitive or not
740 casei: bool,
741 },
742 /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group
743 /// and the whole regex matches either `abcabc` or `defdef`.
744 Backref(usize),
745 /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and
746 /// never backtrack and try `a`, even if matching fails after the atomic group.
747 AtomicGroup(Box<Expr>),
748}
749
750/// Type of look-around assertion as used for a look-around expression.
751#[derive(Debug, PartialEq, Eq, Clone, Copy)]
752pub enum LookAround {
753 /// Look-ahead assertion, e.g. `(?=a)`
754 LookAhead,
755 /// Negative look-ahead assertion, e.g. `(?!a)`
756 LookAheadNeg,
757 /// Look-behind assertion, e.g. `(?<=a)`
758 LookBehind,
759 /// Negative look-behind assertion, e.g. `(?<!a)`
760 LookBehindNeg,
761}
762
763// silly to write my own, but this is super-fast for the common 1-digit
764// case.
765fn push_usize(s: &mut String, x: usize) {
766 if x >= 10 {
767 push_usize(s, x / 10);
768 s.push((b'0' + (x % 10) as u8) as char);
769 } else {
770 s.push((b'0' + (x as u8)) as char);
771 }
772}
773
774fn push_quoted(buf: &mut String, s: &str) {
775 for c in s.chars() {
776 match c {
777 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$'
778 | '#' => buf.push('\\'),
779 _ => (),
780 }
781 buf.push(c);
782 }
783}
784
785impl Expr {
786 /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
787 /// that are referenced by backrefs.
788 pub fn parse(re: &str) -> Result<(Expr, BitSet)> {
789 Parser::parse(re)
790 }
791
792 /// Convert expression to a regex string in the regex crate's syntax.
793 ///
794 /// # Panics
795 ///
796 /// Panics for expressions that are hard, i.e. can not be handled by the regex crate.
797 pub fn to_str(&self, buf: &mut String, precedence: u8) {
798 match *self {
799 Expr::Empty => (),
800 Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." }),
801 Expr::Literal { ref val, casei } => {
802 if casei {
803 buf.push_str("(?i:");
804 }
805 push_quoted(buf, val);
806 if casei {
807 buf.push_str(")");
808 }
809 }
810 Expr::StartText => buf.push('^'),
811 Expr::EndText => buf.push('$'),
812 Expr::StartLine => buf.push_str("(?m:^)"),
813 Expr::EndLine => buf.push_str("(?m:$)"),
814 Expr::Concat(ref children) => {
815 if precedence > 1 {
816 buf.push_str("(?:");
817 }
818 for child in children {
819 child.to_str(buf, 2);
820 }
821 if precedence > 1 {
822 buf.push(')')
823 }
824 }
825 Expr::Alt(ref children) => {
826 if precedence > 0 {
827 buf.push_str("(?:");
828 }
829
830 let is_empty = |e: &Expr| match e {
831 Expr::Empty => true,
832 _ => false,
833 };
834 let contains_empty = children.iter().any(&is_empty);
835 if contains_empty {
836 buf.push_str("(?:");
837 }
838 for (i, child) in children.iter().filter(|&c| !is_empty(c)).enumerate() {
839 if i != 0 {
840 buf.push('|');
841 }
842 child.to_str(buf, 1);
843 }
844 if contains_empty {
845 // regex fails with `(a|b|)`, so transform to `((?:a|b)?)`
846 buf.push_str(")?");
847 }
848
849 if precedence > 0 {
850 buf.push(')');
851 }
852 }
853 Expr::Group(ref child) => {
854 buf.push('(');
855 child.to_str(buf, 0);
856 buf.push(')');
857 }
858 Expr::Repeat {
859 ref child,
860 lo,
861 hi,
862 greedy,
863 } => {
864 if precedence > 2 {
865 buf.push_str("(?:");
866 }
867 child.to_str(buf, 3);
868 match (lo, hi) {
869 (0, 1) => buf.push('?'),
870 (0, usize::MAX) => buf.push('*'),
871 (1, usize::MAX) => buf.push('+'),
872 (lo, hi) => {
873 buf.push('{');
874 push_usize(buf, lo);
875 if lo != hi {
876 buf.push(',');
877 if hi != usize::MAX {
878 push_usize(buf, hi);
879 }
880 }
881 buf.push('}');
882 }
883 }
884 if !greedy {
885 buf.push('?');
886 }
887 if precedence > 2 {
888 buf.push(')');
889 }
890 }
891 Expr::Delegate {
892 ref inner, casei, ..
893 } => {
894 // at the moment, delegate nodes are just atoms
895 if casei {
896 buf.push_str("(?i:");
897 }
898 buf.push_str(inner);
899 if casei {
900 buf.push_str(")");
901 }
902 }
903 _ => panic!("attempting to format hard expr"),
904 }
905 }
906}
907
908// precondition: ix > 0
909fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
910 let bytes = s.as_bytes();
911 loop {
912 ix -= 1;
913 // fancy bit magic for ranges 0..0x80 + 0xc0..
914 if (bytes[ix] as i8) >= -0x40 {
915 break;
916 }
917 }
918 ix
919}
920
921fn codepoint_len(b: u8) -> usize {
922 match b {
923 b if b < 0x80 => 1,
924 b if b < 0xe0 => 2,
925 b if b < 0xf0 => 3,
926 _ => 4,
927 }
928}
929
930// If this returns false, then there is no possible backref in the re
931
932// Both potential implementations are turned off, because we currently
933// always need to do a deeper analysis because of 1-character
934// look-behind. If we could call a find_from_pos method of regex::Regex,
935// it would make sense to bring this back.
936/*
937pub fn detect_possible_backref(re: &str) -> bool {
938 let mut last = b'\x00';
939 for b in re.as_bytes() {
940 if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; }
941 last = *b;
942 }
943 false
944}
945
946pub fn detect_possible_backref(re: &str) -> bool {
947 let mut bytes = re.as_bytes();
948 loop {
949 match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) {
950 Some(i) => {
951 bytes = &bytes[i + 1..];
952 let c = bytes[0];
953 if b'0' <= c && c <= b'9' { return true; }
954 }
955 None => return false
956 }
957 }
958}
959*/
960
961/// The internal module only exists so that the toy example can access internals for debugging and
962/// experimenting.
963#[doc(hidden)]
964pub mod internal {
965 pub use crate::analyze::analyze;
966 pub use crate::compile::compile;
967 pub use crate::vm::{run_default, run_trace, Insn, Prog};
968}
969
970#[cfg(test)]
971mod tests {
972 use crate::parse::make_literal;
973 use crate::Expr;
974 use crate::Regex;
975 use std::usize;
976 //use detect_possible_backref;
977
978 // tests for to_str
979
980 fn to_str(e: Expr) -> String {
981 let mut s = String::new();
982 e.to_str(&mut s, 0);
983 s
984 }
985
986 #[test]
987 fn to_str_concat_alt() {
988 let e = Expr::Concat(vec![
989 Expr::Alt(vec![make_literal("a"), make_literal("b")]),
990 make_literal("c"),
991 ]);
992 assert_eq!(to_str(e), "(?:a|b)c");
993 }
994
995 #[test]
996 fn to_str_rep_concat() {
997 let e = Expr::Repeat {
998 child: Box::new(Expr::Concat(vec![make_literal("a"), make_literal("b")])),
999 lo: 2,
1000 hi: 3,
1001 greedy: true,
1002 };
1003 assert_eq!(to_str(e), "(?:ab){2,3}");
1004 }
1005
1006 #[test]
1007 fn to_str_group_alt() {
1008 let e = Expr::Group(Box::new(Expr::Alt(vec![
1009 make_literal("a"),
1010 make_literal("b"),
1011 ])));
1012 assert_eq!(to_str(e), "(a|b)");
1013 }
1014
1015 #[test]
1016 fn as_str_debug() {
1017 let s = r"(a+)b\1";
1018 let regex = Regex::new(s).unwrap();
1019 assert_eq!(s, regex.as_str());
1020 assert_eq!(s, format!("{:?}", regex));
1021 }
1022
1023 #[test]
1024 fn to_str_repeat() {
1025 fn repeat(lo: usize, hi: usize, greedy: bool) -> Expr {
1026 Expr::Repeat {
1027 child: Box::new(make_literal("a")),
1028 lo,
1029 hi,
1030 greedy,
1031 }
1032 }
1033
1034 assert_eq!(to_str(repeat(2, 2, true)), "a{2}");
1035 assert_eq!(to_str(repeat(2, 2, false)), "a{2}?");
1036 assert_eq!(to_str(repeat(2, 3, true)), "a{2,3}");
1037 assert_eq!(to_str(repeat(2, 3, false)), "a{2,3}?");
1038 assert_eq!(to_str(repeat(2, usize::MAX, true)), "a{2,}");
1039 assert_eq!(to_str(repeat(2, usize::MAX, false)), "a{2,}?");
1040 assert_eq!(to_str(repeat(0, 1, true)), "a?");
1041 assert_eq!(to_str(repeat(0, 1, false)), "a??");
1042 assert_eq!(to_str(repeat(0, usize::MAX, true)), "a*");
1043 assert_eq!(to_str(repeat(0, usize::MAX, false)), "a*?");
1044 assert_eq!(to_str(repeat(1, usize::MAX, true)), "a+");
1045 assert_eq!(to_str(repeat(1, usize::MAX, false)), "a+?");
1046 }
1047
1048 /*
1049 #[test]
1050 fn detect_backref() {
1051 assert_eq!(detect_possible_backref("a0a1a2"), false);
1052 assert_eq!(detect_possible_backref("a0a1\\a2"), false);
1053 assert_eq!(detect_possible_backref("a0a\\1a2"), true);
1054 assert_eq!(detect_possible_backref("a0a1a2\\"), false);
1055 }
1056 */
1057}