#![doc = include_str!("../docs/main.md")]
#![doc = include_str!("../docs/features.md")]
#![doc = include_str!("../docs/syntax.md")]
#![doc = include_str!("../docs/subroutines/1_intro.md")]
#![doc = include_str!("../docs/subroutines/2_flags.md")]
#![doc = include_str!("../docs/subroutines/3_left_recursion.md")]
#![doc = include_str!("../docs/subroutines/4_recursion.md")]
#![doc = include_str!("../docs/absent.md")]
#![deny(missing_docs)]
#![deny(missing_debug_implementations)]
#![cfg_attr(not(feature = "std"), no_std)]
extern crate alloc;
use alloc::borrow::Cow;
use alloc::boxed::Box;
use alloc::string::{String, ToString};
use alloc::sync::Arc;
use alloc::vec;
use alloc::vec::Vec;
use core::convert::TryFrom;
use core::fmt;
use core::fmt::{Debug, Formatter};
use core::ops::{Index, Range};
use core::str::FromStr;
use regex_automata::meta::Regex as RaRegex;
use regex_automata::util::captures::Captures as RaCaptures;
use regex_automata::util::syntax::Config as SyntaxConfig;
use regex_automata::Input as RaInput;
mod analyze;
mod compile;
mod error;
mod expand;
mod optimize;
mod parse;
mod parse_flags;
mod replacer;
mod vm;
use crate::analyze::can_compile_as_anchored;
use crate::analyze::{analyze, AnalyzeContext};
use crate::compile::{compile, CompileOptions};
use crate::optimize::optimize;
use crate::parse::{ExprTree, NamedGroups, Parser};
use crate::parse_flags::*;
use crate::vm::{Prog, OPTION_FIND_NOT_EMPTY, OPTION_SKIPPED_EMPTY_MATCH};
pub use crate::error::{CompileError, Error, ParseError, Result, RuntimeError};
pub use crate::expand::Expander;
pub use crate::replacer::{NoExpand, Replacer, ReplacerRef};
// Crate-wide limit of 64, named for recursion.
// NOTE(review): not referenced anywhere in this file; presumably consulted by the
// sibling modules (parser/analyzer/compiler) to bound nesting depth — confirm there.
const MAX_RECURSION: usize = 64;
/// Builder that pairs a pattern string with a set of options and compiles
/// them into a [`Regex`] via [`RegexBuilder::build`].
#[derive(Debug)]
pub struct RegexBuilder {
// Source text of the pattern that `build` will compile.
pattern: String,
// Every option setter on this builder forwards to this inner options builder.
options: RegexOptionsBuilder,
}
/// Builder for compile options that is independent of any particular
/// pattern; the pattern is supplied when [`RegexOptionsBuilder::build`] is called.
#[derive(Debug)]
pub struct RegexOptionsBuilder {
// The option set accumulated so far by the setter methods.
options: RegexOptions,
}
/// A compiled regular expression.
///
/// At construction time the pattern is analyzed; it is then either handed
/// to `regex_automata` as a whole or compiled into a program for this
/// crate's own VM (see `RegexImpl`).
#[derive(Clone)]
pub struct Regex {
// The concrete matching backend chosen at construction time.
inner: RegexImpl,
// Name -> group index table produced by the parser; shared with every
// `Captures` value created from this regex.
named_groups: Arc<NamedGroups>,
}
// The two backends a `Regex` can be compiled to.
#[derive(Clone)]
enum RegexImpl {
// The whole pattern was delegated to `regex_automata`.
Wrap {
// The delegate regex that performs the matching.
inner: RaRegex,
// The pattern exactly as the caller supplied it (returned by `as_str`).
pattern: String,
// When true, the delegated pattern carries one extra leading capture
// group: the delegate's group 1 is reported as this crate's group 0 and
// all group indices are shifted by one.
explicit_capture_group_0: bool,
// The pattern text that was actually given to the delegate
// (the re-serialized expression tree); shown by `debug_print`.
delegated_pattern: String,
},
// The pattern was compiled to a program for this crate's VM.
Fancy {
// The compiled VM program.
prog: Arc<Prog>,
// Number of capture groups; `Captures` keeps `2 * n_groups` save slots.
n_groups: usize,
// The pattern exactly as the caller supplied it (returned by `as_str`).
pattern: String,
// Runtime options passed to every VM run.
options: HardRegexRuntimeOptions,
},
}
/// A single match: a byte range within the searched text.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct Match<'t> {
// The full text that was searched.
text: &'t str,
// Byte offset where the match starts (inclusive).
start: usize,
// Byte offset where the match ends (exclusive); `as_str` slices `text[start..end]`.
end: usize,
}
/// Iterator over successive matches of a [`Regex`] in a text, created by
/// [`Regex::find_iter`]. Each item is a `Result` because a search can fail.
#[derive(Debug)]
pub struct Matches<'r, 't> {
// The regex being searched with.
re: &'r Regex,
// The text being searched.
text: &'t str,
// Byte offset at which the next search starts. Set to `text.len() + 1`
// after an error so that iteration stops.
last_end: usize,
// End offset of the most recently yielded match, if any; used to drop an
// empty match that sits directly at the end of the previous match.
last_match: Option<usize>,
// True when the previous search found an empty match exactly at its start
// position; the next search is then run with `OPTION_SKIPPED_EMPTY_MATCH`.
last_skipped_empty: bool,
}
impl<'r, 't> Matches<'r, 't> {
    /// Returns the text this iterator searches in.
    pub fn text(&self) -> &'t str {
        self.text
    }

    /// Returns the regex this iterator searches with.
    pub fn regex(&self) -> &'r Regex {
        self.re
    }

    /// Shared stepping logic for match-like iterators.
    ///
    /// `search` is called with the regex, the start position and the option
    /// flags, and returns the item to yield together with the overall match
    /// it corresponds to. The overall match drives the bookkeeping: where
    /// the next search starts and whether an empty match has to be dropped.
    fn next_with<F, R>(&mut self, mut search: F) -> Option<Result<R>>
    where
        F: FnMut(&Regex, usize, u32) -> Result<Option<(R, Match<'t>)>>,
    {
        loop {
            if self.last_end > self.text.len() {
                return None;
            }

            let mut flags = 0;
            if self.last_skipped_empty {
                flags = OPTION_SKIPPED_EMPTY_MATCH;
            }
            let start_pos = self.last_end;

            let (item, found) = match search(self.re, start_pos, flags) {
                Ok(Some(pair)) => pair,
                Ok(None) => return None,
                Err(error) => {
                    // Move past the end of the text so the following call
                    // returns `None` instead of hitting the same error again.
                    self.last_end = self.text.len() + 1;
                    return Some(Err(error));
                }
            };

            if found.start != found.end {
                self.last_end = found.end;
                self.last_skipped_empty = false;
            } else {
                // Empty match: step to the next code point so the search
                // always makes progress.
                self.last_end = next_utf8(self.text, found.end);
                self.last_skipped_empty = found.end == start_pos;
                // An empty match directly at the end of the previous match
                // is not reported; search again from the advanced position.
                if self.last_match == Some(found.end) {
                    continue;
                }
            }

            self.last_match = Some(found.end);
            return Some(Ok(item));
        }
    }
}
impl<'r, 't> Iterator for Matches<'r, 't> {
    type Item = Result<Match<'t>>;

    /// Yields the next match, or the error that stopped the search.
    fn next(&mut self) -> Option<Self::Item> {
        let haystack = self.text;
        self.next_with(move |re, pos, flags| {
            let found = re.find_from_pos_with_option_flags(haystack, pos, flags)?;
            // The match is both the yielded item and the bookkeeping match.
            Ok(found.map(|m| (m, m)))
        })
    }
}
/// Iterator over the captures of successive matches, created by
/// [`Regex::captures_iter`]. Wraps a [`Matches`] iterator and reuses its
/// stepping state.
#[derive(Debug)]
pub struct CaptureMatches<'r, 't>(Matches<'r, 't>);
impl<'r, 't> CaptureMatches<'r, 't> {
    /// Returns the text this iterator searches in.
    pub fn text(&self) -> &'t str {
        self.0.text()
    }

    /// Returns the regex this iterator searches with.
    pub fn regex(&self) -> &'r Regex {
        self.0.regex()
    }
}
impl<'r, 't> Iterator for CaptureMatches<'r, 't> {
    type Item = Result<Captures<'t>>;

    /// Yields the captures of the next match, or the error that stopped the search.
    fn next(&mut self) -> Option<Self::Item> {
        let haystack = self.0.text;
        self.0.next_with(move |re, pos, flags| {
            match re.captures_from_pos_with_option_flags(haystack, pos, flags)? {
                None => Ok(None),
                Some(caps) => {
                    // Group 0 (the overall match) drives the iterator's bookkeeping.
                    let whole = caps
                        .get(0)
                        .expect("`Captures` is expected to have entire match at 0th position");
                    Ok(Some((caps, whole)))
                }
            }
        })
    }
}
/// The capture groups of a single match. Group 0 is the overall match;
/// groups can be looked up by index or by name.
#[derive(Debug)]
pub struct Captures<'t> {
// Backend-specific storage of the group positions.
inner: CapturesImpl<'t>,
// Name -> group index table shared with the `Regex` that produced this value.
named_groups: Arc<NamedGroups>,
}
// Backend-specific capture storage; mirrors the two variants of `RegexImpl`.
#[derive(Debug)]
enum CapturesImpl<'t> {
// Captures produced by the delegate regex.
Wrap {
// The text that was searched.
text: &'t str,
// Group spans as recorded by `regex_automata`.
locations: RaCaptures,
// When true, delegate group `i + 1` is this crate's group `i`.
explicit_capture_group_0: bool,
},
// Captures produced by the VM.
Fancy {
// The text that was searched.
text: &'t str,
// Flat list of offsets: group `i` occupies slots `2 * i` (start) and
// `2 * i + 1` (end). A start of `usize::MAX` marks a group that did not
// participate in the match.
saves: Vec<usize>,
},
}
/// Iterator over every group of a [`Captures`] in index order, created by
/// [`Captures::iter`]. Groups that did not participate yield `None`.
#[derive(Debug)]
pub struct SubCaptureMatches<'c, 't> {
// The captures being iterated.
caps: &'c Captures<'t>,
// Index of the next group to yield.
i: usize,
}
/// Iterator over the pieces of a text that lie between matches of a regex,
/// created by [`Regex::split`].
#[derive(Debug)]
pub struct Split<'r, 'h> {
// Iterator over the separator matches.
matches: Matches<'r, 'h>,
// Byte offset where the next piece starts; set to `target.len() + 1`
// once the final piece has been yielded.
next_start: usize,
// The text being split.
target: &'h str,
}
impl<'r, 'h> Iterator for Split<'r, 'h> {
    type Item = Result<&'h str>;

    /// Yields the text up to the next separator match, then the remainder
    /// after the last match, then `None`. Search errors are passed through.
    fn next(&mut self) -> Option<Result<&'h str>> {
        match self.matches.next() {
            Some(Err(e)) => Some(Err(e)),
            Some(Ok(separator)) => {
                let piece = &self.target[self.next_start..separator.start()];
                self.next_start = separator.end();
                Some(Ok(piece))
            }
            None => {
                let end = self.target.len();
                if self.next_start <= end {
                    let tail = &self.target[self.next_start..];
                    // Mark the iterator as finished.
                    self.next_start = end + 1;
                    Some(Ok(tail))
                } else {
                    None
                }
            }
        }
    }
}
// Marker impl: declares that `Split` keeps returning `None` once it has returned `None`.
impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}
/// Like [`Split`], but yields at most `limit` pieces; the last piece is the
/// unsplit remainder of the text. Created by [`Regex::splitn`].
#[derive(Debug)]
pub struct SplitN<'r, 'h> {
// The underlying unlimited splitter.
splits: Split<'r, 'h>,
// Number of pieces that may still be yielded.
limit: usize,
}
impl<'r, 'h> Iterator for SplitN<'r, 'h> {
    type Item = Result<&'h str>;

    /// Yields the next piece. The final permitted piece is everything that
    /// has not been consumed yet, without searching for further separators.
    fn next(&mut self) -> Option<Result<&'h str>> {
        match self.limit {
            0 => None,
            1 => {
                self.limit = 0;
                let end = self.splits.target.len();
                let from = self.splits.next_start;
                if from > end {
                    // The inner splitter has already yielded its final piece.
                    return None;
                }
                self.splits.next_start = end + 1;
                Some(Ok(&self.splits.target[from..end]))
            }
            _ => {
                self.limit -= 1;
                self.splits.next()
            }
        }
    }

    /// At most `limit` more items; possibly none.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let upper = self.limit;
        (0, Some(upper))
    }
}
// Marker impl: declares that `SplitN` keeps returning `None` once it has returned `None`.
impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}
// The full set of compile-time and run-time options for building a `Regex`.
#[derive(Clone, Debug, Default)]
struct RegexOptions {
// Syntax flags (case-insensitive, multi-line, ...) in `regex_automata` form;
// translated to this crate's parser flags by `compute_flags`.
syntaxc: SyntaxConfig,
// Optional size limit for the delegate regex.
// NOTE(review): consumed outside this file (presumably by `compile::compile_inner`) — confirm.
delegate_size_limit: Option<usize>,
// Optional DFA size limit for the delegate regex.
// NOTE(review): consumed outside this file — confirm.
delegate_dfa_size_limit: Option<usize>,
// Sets `FLAG_ONIGURUMA_MODE` for the parser.
oniguruma_mode: bool,
// Sets `FLAG_IGNORE_NUMBERED_GROUPS_WHEN_NAMED_GROUPS_EXIST` for the parser.
ignore_numbered_groups_when_named_groups_exist: bool,
// Options that are carried into a VM-backed regex and used on every run.
hard_regex_runtime_options: HardRegexRuntimeOptions,
}
// Options stored inside a VM-backed ("fancy") regex and handed to `vm::run`.
#[derive(Copy, Clone, Debug)]
struct HardRegexRuntimeOptions {
// Backtracking budget for a single VM run (default 1_000_000).
// NOTE(review): the exact unit is defined by the VM, not visible here.
backtrack_limit: usize,
// When true, `OPTION_FIND_NOT_EMPTY` is passed to the VM on every search.
find_not_empty: bool,
}
impl RegexOptions {
    /// Returns `enum_value` when `flag_value` is set, otherwise `0`.
    fn get_flag_value(flag_value: bool, enum_value: u32) -> u32 {
        match flag_value {
            true => enum_value,
            false => 0,
        }
    }

    /// Translates the configured options into the bit set understood by the
    /// parser (`FLAG_*` constants OR-ed together).
    fn compute_flags(&self) -> u32 {
        let syntax = &self.syntaxc;
        let table = [
            (syntax.get_case_insensitive(), FLAG_CASEI),
            (syntax.get_multi_line(), FLAG_MULTI),
            (syntax.get_ignore_whitespace(), FLAG_IGNORE_SPACE),
            (syntax.get_dot_matches_new_line(), FLAG_DOTNL),
            (syntax.get_unicode(), FLAG_UNICODE),
            (self.oniguruma_mode, FLAG_ONIGURUMA_MODE),
            (syntax.get_crlf(), FLAG_CRLF),
            (
                self.ignore_numbered_groups_when_named_groups_exist,
                FLAG_IGNORE_NUMBERED_GROUPS_WHEN_NAMED_GROUPS_EXIST,
            ),
        ];
        table
            .iter()
            .fold(0, |bits, &(enabled, flag)| bits | Self::get_flag_value(enabled, flag))
    }
}
impl Default for HardRegexRuntimeOptions {
// Defaults: a backtrack limit of one million and empty matches allowed.
fn default() -> Self {
HardRegexRuntimeOptions {
backtrack_limit: 1_000_000,
find_not_empty: false,
}
}
}
impl Default for RegexOptionsBuilder {
// Same as `RegexOptionsBuilder::new()`.
fn default() -> Self {
Self::new()
}
}
impl RegexOptionsBuilder {
/// Creates a builder with all options at their default values.
pub fn new() -> Self {
RegexOptionsBuilder {
options: RegexOptions::default(),
}
}
/// Compiles `pattern` with the options configured so far.
///
/// # Errors
///
/// Returns whatever error `Regex` construction reports (parse or compile failure).
pub fn build(&self, pattern: String) -> Result<Regex> {
Regex::new_options(pattern, &self.options)
}
// Applies `func` to the current syntax configuration and stores the result.
// `SyntaxConfig` setters take and return the config by value, hence the closure.
fn set_config(&mut self, func: impl Fn(SyntaxConfig) -> SyntaxConfig) -> &mut Self {
self.options.syntaxc = func(self.options.syntaxc);
self
}
/// Sets the case-insensitive syntax flag (`FLAG_CASEI` for the parser).
pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
self.set_config(|x| x.case_insensitive(yes))
}
/// Sets the multi-line syntax flag (`FLAG_MULTI` for the parser).
pub fn multi_line(&mut self, yes: bool) -> &mut Self {
self.set_config(|x| x.multi_line(yes))
}
/// Sets the ignore-whitespace syntax flag (`FLAG_IGNORE_SPACE` for the parser).
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
self.set_config(|x| x.ignore_whitespace(yes))
}
/// Sets the "dot matches new line" syntax flag (`FLAG_DOTNL` for the parser).
pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
self.set_config(|x| x.dot_matches_new_line(yes))
}
/// Sets the CRLF syntax flag (`FLAG_CRLF` for the parser).
pub fn crlf(&mut self, yes: bool) -> &mut Self {
self.set_config(|x| x.crlf(yes))
}
/// Alias of [`ignore_whitespace`](Self::ignore_whitespace): sets the same syntax flag.
pub fn verbose_mode(&mut self, yes: bool) -> &mut Self {
self.set_config(|x| x.ignore_whitespace(yes))
}
/// Sets the Unicode syntax flag (`FLAG_UNICODE` for the parser).
pub fn unicode_mode(&mut self, yes: bool) -> &mut Self {
self.set_config(|x| x.unicode(yes))
}
/// Sets the backtracking budget used when the pattern runs on the VM.
/// The default is 1,000,000.
pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
self.options.hard_regex_runtime_options.backtrack_limit = limit;
self
}
/// Sets the size limit for the delegate regex.
// NOTE(review): the value is only stored here; how it is applied is decided in `compile`.
pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
self.options.delegate_size_limit = Some(limit);
self
}
/// Sets the DFA size limit for the delegate regex.
// NOTE(review): the value is only stored here; how it is applied is decided in `compile`.
pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
self.options.delegate_dfa_size_limit = Some(limit);
self
}
/// When enabled, searches are run with the "find not empty" option.
/// Building a pattern that analysis shows can only ever match the empty
/// string then fails with `CompileError::PatternCanNeverMatch`.
pub fn find_not_empty(&mut self, yes: bool) -> &mut Self {
self.options.hard_regex_runtime_options.find_not_empty = yes;
self
}
/// Sets the parser flag `FLAG_IGNORE_NUMBERED_GROUPS_WHEN_NAMED_GROUPS_EXIST`.
pub fn ignore_numbered_groups_when_named_groups_exist(&mut self, yes: bool) -> &mut Self {
self.options.ignore_numbered_groups_when_named_groups_exist = yes;
self
}
/// Sets the parser flag `FLAG_ONIGURUMA_MODE`.
pub fn oniguruma_mode(&mut self, yes: bool) -> &mut Self {
self.options.oniguruma_mode = yes;
self
}
}
impl RegexBuilder {
/// Creates a builder for `pattern` with all options at their default values.
pub fn new(pattern: &str) -> Self {
RegexBuilder {
pattern: pattern.to_string(),
options: RegexOptionsBuilder::new(),
}
}
/// Compiles the stored pattern with the options configured so far.
/// The builder is not consumed and can be reused.
///
/// # Errors
///
/// Returns whatever error `Regex` construction reports (parse or compile failure).
pub fn build(&self) -> Result<Regex> {
self.options.build(self.pattern.clone())
}
/// Replaces the stored pattern.
pub fn pattern(&mut self, pattern: String) -> &mut Self {
self.pattern = pattern;
self
}
/// See [`RegexOptionsBuilder::case_insensitive`].
pub fn case_insensitive(&mut self, yes: bool) -> &mut Self {
self.options.case_insensitive(yes);
self
}
/// See [`RegexOptionsBuilder::multi_line`].
pub fn multi_line(&mut self, yes: bool) -> &mut Self {
self.options.multi_line(yes);
self
}
/// See [`RegexOptionsBuilder::ignore_whitespace`].
pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Self {
self.options.ignore_whitespace(yes);
self
}
/// See [`RegexOptionsBuilder::dot_matches_new_line`].
pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Self {
self.options.dot_matches_new_line(yes);
self
}
/// Alias of [`ignore_whitespace`](Self::ignore_whitespace): sets the same option.
pub fn verbose_mode(&mut self, yes: bool) -> &mut Self {
self.options.ignore_whitespace(yes);
self
}
/// See [`RegexOptionsBuilder::unicode_mode`].
pub fn unicode_mode(&mut self, yes: bool) -> &mut Self {
self.options.unicode_mode(yes);
self
}
/// See [`RegexOptionsBuilder::backtrack_limit`].
pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self {
self.options.backtrack_limit(limit);
self
}
/// See [`RegexOptionsBuilder::delegate_size_limit`].
pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self {
self.options.delegate_size_limit(limit);
self
}
/// See [`RegexOptionsBuilder::delegate_dfa_size_limit`].
pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self {
self.options.delegate_dfa_size_limit(limit);
self
}
/// See [`RegexOptionsBuilder::oniguruma_mode`].
pub fn oniguruma_mode(&mut self, yes: bool) -> &mut Self {
self.options.oniguruma_mode(yes);
self
}
/// See [`RegexOptionsBuilder::crlf`].
pub fn crlf(&mut self, yes: bool) -> &mut Self {
self.options.crlf(yes);
self
}
/// See [`RegexOptionsBuilder::find_not_empty`].
pub fn find_not_empty(&mut self, yes: bool) -> &mut Self {
self.options.find_not_empty(yes);
self
}
/// See [`RegexOptionsBuilder::ignore_numbered_groups_when_named_groups_exist`].
pub fn ignore_numbered_groups_when_named_groups_exist(&mut self, yes: bool) -> &mut Self {
self.options
.ignore_numbered_groups_when_named_groups_exist(yes);
self
}
}
impl fmt::Debug for Regex {
    /// Shows the original pattern string, unquoted and unescaped.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let pattern = self.as_str();
        f.write_str(pattern)
    }
}
impl fmt::Display for Regex {
    /// Shows the original pattern string.
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        let pattern = self.as_str();
        f.write_str(pattern)
    }
}
impl FromStr for Regex {
type Err = Error;
/// Parses and compiles `s` with default options; same as [`Regex::new`].
fn from_str(s: &str) -> Result<Regex> {
Regex::new(s)
}
}
impl Regex {
/// Parses and compiles `re` with default options.
///
/// # Errors
///
/// Returns an error when the pattern cannot be parsed or compiled.
pub fn new(re: &str) -> Result<Regex> {
Self::new_options(re.to_string(), &RegexOptions::default())
}
// Builds a `Regex` from `pattern` using `options`.
//
// Pipeline: parse -> (optionally) optimize -> analyze -> either delegate the
// whole pattern to `regex_automata` (when analysis reports it is not "hard")
// or compile it to a VM program.
fn new_options(pattern: String, options: &RegexOptions) -> Result<Regex> {
let mut tree = Expr::parse_tree_with_flags(&pattern, options.compute_flags())?;
let find_not_empty = options.hard_regex_runtime_options.find_not_empty;
// The optimizer is skipped in find-not-empty mode. Its boolean result says
// whether the rewritten tree needs an explicit capture group 0.
let requires_capture_group_fixup = if find_not_empty {
false
} else {
optimize(&mut tree)
};
let info = analyze(
&tree,
AnalyzeContext {
explicit_capture_group_0: requires_capture_group_fixup,
find_not_empty,
},
)?;
// A pattern whose size is constant and zero can only produce empty matches,
// which find-not-empty mode rejects: it could never match.
if find_not_empty && info.const_size && info.min_size == 0 {
return Err(CompileError::PatternCanNeverMatch.into());
}
if !info.hard {
// Easy pattern: serialize the (possibly optimized) tree back to regex
// syntax and let the delegate compile it.
let mut re_cooked = String::new();
tree.expr.to_str(&mut re_cooked, 0);
let inner = compile::compile_inner(&re_cooked, options)?;
return Ok(Regex {
inner: RegexImpl::Wrap {
inner,
pattern,
explicit_capture_group_0: requires_capture_group_fixup,
delegated_pattern: re_cooked,
},
named_groups: Arc::new(tree.named_groups),
});
}
// Hard pattern: compile to a VM program.
let prog = compile(
&info,
CompileOptions {
anchored: can_compile_as_anchored(&tree.expr),
contains_subroutines: tree.contains_subroutines,
},
)?;
Ok(Regex {
inner: RegexImpl::Fancy {
prog: Arc::new(prog),
n_groups: info.end_group(),
options: options.hard_regex_runtime_options,
pattern,
},
named_groups: Arc::new(tree.named_groups),
})
}
/// Returns the pattern string this regex was created from.
pub fn as_str(&self) -> &str {
    // Both backends keep the caller's original pattern.
    let (RegexImpl::Wrap { pattern, .. } | RegexImpl::Fancy { pattern, .. }) = &self.inner;
    pattern
}
/// Reports whether the regex matches anywhere in `text`.
///
/// # Errors
///
/// A VM-backed regex can fail at run time; a delegated one never does.
pub fn is_match(&self, text: &str) -> Result<bool> {
    if let RegexImpl::Wrap { inner, .. } = &self.inner {
        return Ok(inner.is_match(text));
    }
    let found = self.find(text)?;
    Ok(found.is_some())
}
/// Returns an iterator over successive matches in `text`, starting at offset 0.
/// Nothing is searched until the iterator is advanced.
pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> {
Matches {
re: self,
text,
last_end: 0,
last_match: None,
last_skipped_empty: false,
}
}
/// Finds the first match in `text`, searching from offset 0.
/// Returns `Ok(None)` when there is no match.
pub fn find<'t>(&self, text: &'t str) -> Result<Option<Match<'t>>> {
self.find_from_pos(text, 0)
}
/// Finds the first match in `text`, starting the search at byte offset `pos`.
/// The whole of `text` remains visible to the regex; the returned offsets
/// are relative to the start of `text`. Returns `Ok(None)` when there is no
/// match or when `pos` is past the end of `text`.
pub fn find_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Match<'t>>> {
self.find_from_pos_with_option_flags(text, pos, 0)
}
// Core search routine: finds the first match at or after `pos`.
// `option_flags` are extra VM option bits; they are ignored by the
// delegate backend.
fn find_from_pos_with_option_flags<'t>(
    &self,
    text: &'t str,
    pos: usize,
    option_flags: u32,
) -> Result<Option<Match<'t>>> {
    if pos > text.len() {
        return Ok(None);
    }
    match &self.inner {
        RegexImpl::Wrap {
            inner,
            explicit_capture_group_0,
            ..
        } => {
            let input = RaInput::new(text).span(pos..text.len());
            if *explicit_capture_group_0 {
                // The delegate's group 1 is the real overall match.
                let mut locations = inner.create_captures();
                inner.captures(input, &mut locations);
                let whole = locations.get_group(1);
                Ok(whole.map(|span| Match::new(text, span.start, span.end)))
            } else {
                let found = inner.search(&input);
                Ok(found.map(|m| Match::new(text, m.start(), m.end())))
            }
        }
        RegexImpl::Fancy { prog, options, .. } => {
            let mut flags = option_flags;
            if options.find_not_empty {
                flags |= OPTION_FIND_NOT_EMPTY;
            }
            let outcome = vm::run(prog, text, pos, flags, options)?;
            // Save slots 0 and 1 hold the bounds of the overall match.
            Ok(outcome.map(|saves| Match::new(text, saves[0], saves[1])))
        }
    }
}
/// Returns an iterator over the captures of successive matches in `text`.
pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CaptureMatches<'r, 't> {
CaptureMatches(self.find_iter(text))
}
/// Returns the capture groups of the first match in `text`, or `Ok(None)`
/// when there is no match.
pub fn captures<'t>(&self, text: &'t str) -> Result<Option<Captures<'t>>> {
self.captures_from_pos(text, 0)
}
/// Like [`captures`](Self::captures), but starts the search at byte offset
/// `pos`. Offsets in the result are relative to the start of `text`.
/// Returns `Ok(None)` when `pos` is past the end of `text`.
pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result<Option<Captures<'t>>> {
self.captures_from_pos_with_option_flags(text, pos, 0)
}
// Core capture routine: returns the groups of the first match at or after
// `pos`. `option_flags` are extra VM option bits; they are ignored by the
// delegate backend.
fn captures_from_pos_with_option_flags<'t>(
    &self,
    text: &'t str,
    pos: usize,
    option_flags: u32,
) -> Result<Option<Captures<'t>>> {
    if pos > text.len() {
        return Ok(None);
    }
    let named_groups = self.named_groups.clone();
    let inner = match &self.inner {
        RegexImpl::Wrap {
            inner,
            explicit_capture_group_0,
            ..
        } => {
            let mut locations = inner.create_captures();
            inner.captures(RaInput::new(text).span(pos..text.len()), &mut locations);
            if !locations.is_match() {
                return Ok(None);
            }
            CapturesImpl::Wrap {
                text,
                locations,
                explicit_capture_group_0: *explicit_capture_group_0,
            }
        }
        RegexImpl::Fancy {
            prog,
            n_groups,
            options,
            ..
        } => {
            let mut flags = option_flags;
            if options.find_not_empty {
                flags |= OPTION_FIND_NOT_EMPTY;
            }
            let mut saves = match vm::run(prog, text, pos, flags, options)? {
                Some(saves) => saves,
                None => return Ok(None),
            };
            // Keep only the slots that belong to capture groups.
            saves.truncate(n_groups * 2);
            CapturesImpl::Fancy { text, saves }
        }
    };
    Ok(Some(Captures {
        inner,
        named_groups,
    }))
}
/// Returns the number of capture groups, counting group 0 (the overall match).
pub fn captures_len(&self) -> usize {
    match &self.inner {
        RegexImpl::Fancy { n_groups, .. } => *n_groups,
        RegexImpl::Wrap {
            inner,
            explicit_capture_group_0,
            ..
        } => {
            // The extra wrapper group of the delegate is not a user-visible group.
            let hidden = if *explicit_capture_group_0 { 1 } else { 0 };
            inner.captures_len() - hidden
        }
    }
}
/// Returns an iterator with one entry per capture group, in index order:
/// `Some(name)` for named groups and `None` for unnamed ones.
pub fn capture_names(&self) -> CaptureNames<'_> {
    let mut slots: Vec<Option<&str>> = vec![None; self.captures_len()];
    for (group_name, &index) in self.named_groups.iter() {
        slots[index] = Some(group_name.as_str());
    }
    CaptureNames(slots.into_iter())
}
/// Writes a debugging description of the compiled regex to `writer`:
/// the delegated pattern for a wrapped regex, or the VM program otherwise.
/// Hidden from the public documentation.
#[doc(hidden)]
pub fn debug_print(&self, writer: &mut Formatter<'_>) -> fmt::Result {
match &self.inner {
RegexImpl::Wrap {
delegated_pattern,
explicit_capture_group_0,
..
} => {
write!(
writer,
"wrapped Regex {:?}, explicit_capture_group_0: {:}",
delegated_pattern, *explicit_capture_group_0
)
}
RegexImpl::Fancy { prog, .. } => prog.debug_print(writer),
}
}
/// Replaces the first match in `text` with `rep`. Returns `text` borrowed
/// and unchanged when there is no match.
///
/// # Panics
///
/// Panics when the search fails at run time (see [`replacen`](Self::replacen)).
pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
self.replacen(text, 1, rep)
}
/// Replaces every non-overlapping match in `text` with `rep`. Returns
/// `text` borrowed and unchanged when there is no match.
///
/// # Panics
///
/// Panics when the search fails at run time (see [`replacen`](Self::replacen)).
pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> {
self.replacen(text, 0, rep)
}
/// Replaces at most `limit` matches in `text` with `rep`; a `limit` of 0
/// means "replace all".
///
/// # Panics
///
/// Panics when the underlying search returns an error. Use
/// [`try_replacen`](Self::try_replacen) to handle the error instead.
pub fn replacen<'t, R: Replacer>(&self, text: &'t str, limit: usize, rep: R) -> Cow<'t, str> {
self.try_replacen(text, limit, rep).unwrap()
}
/// Replaces at most `limit` non-overlapping matches in `text` with `rep`;
/// a `limit` of 0 means "replace all".
///
/// Returns `text` borrowed and unchanged when there is no match at all.
/// When the replacer reports (via `no_expansion`) that it is a fixed
/// string, only match positions are computed; otherwise full captures are
/// computed for every replaced match.
///
/// # Errors
///
/// Returns the first error produced while searching for one of the
/// matches that are actually replaced.
pub fn try_replacen<'t, R: Replacer>(
    &self,
    text: &'t str,
    limit: usize,
    mut rep: R,
) -> Result<Cow<'t, str>> {
    if let Some(rep) = rep.no_expansion() {
        let mut it = self.find_iter(text).enumerate().peekable();
        if it.peek().is_none() {
            return Ok(Cow::Borrowed(text));
        }
        let mut new = String::with_capacity(text.len());
        let mut last_match = 0;
        for (i, m) in it {
            let m = m?;
            new.push_str(&text[last_match..m.start()]);
            new.push_str(&rep);
            last_match = m.end();
            // Stop right after the `limit`-th replacement. Checking the limit
            // at the top of the next iteration would first run one more
            // (possibly expensive, possibly failing) search whose result is
            // then thrown away.
            if limit > 0 && i + 1 >= limit {
                break;
            }
        }
        new.push_str(&text[last_match..]);
        return Ok(Cow::Owned(new));
    }

    let mut it = self.captures_iter(text).enumerate().peekable();
    if it.peek().is_none() {
        return Ok(Cow::Borrowed(text));
    }
    let mut new = String::with_capacity(text.len());
    let mut last_match = 0;
    for (i, cap) in it {
        let cap = cap?;
        // Group 0 is always present for a successful match.
        let m = cap.get(0).unwrap();
        new.push_str(&text[last_match..m.start()]);
        rep.replace_append(&cap, &mut new);
        last_match = m.end();
        // Same early exit as in the no-expansion path.
        if limit > 0 && i + 1 >= limit {
            break;
        }
    }
    new.push_str(&text[last_match..]);
    Ok(Cow::Owned(new))
}
/// Returns an iterator over the pieces of `target` separated by matches of
/// this regex. Nothing is searched until the iterator is advanced.
pub fn split<'r, 'h>(&'r self, target: &'h str) -> Split<'r, 'h> {
Split {
matches: self.find_iter(target),
next_start: 0,
target,
}
}
/// Like [`split`](Self::split), but yields at most `limit` pieces; the last
/// piece is the unsplit remainder. A `limit` of 0 yields nothing.
pub fn splitn<'r, 'h>(&'r self, target: &'h str, limit: usize) -> SplitN<'r, 'h> {
SplitN {
splits: self.split(target),
limit,
}
}
}
impl TryFrom<&str> for Regex {
type Error = Error;
/// Compiles `s` with default options; same as [`Regex::new`].
fn try_from(s: &str) -> Result<Self> {
Self::new(s)
}
}
impl TryFrom<String> for Regex {
type Error = Error;
/// Compiles `s` with default options; same as [`Regex::new`].
fn try_from(s: String) -> Result<Self> {
Self::new(&s)
}
}
impl<'t> Match<'t> {
/// Byte offset in the searched text where the match starts.
#[inline]
pub fn start(&self) -> usize {
self.start
}
/// Byte offset in the searched text just past the end of the match.
#[inline]
pub fn end(&self) -> usize {
self.end
}
/// The byte range `start..end` of the match.
#[inline]
pub fn range(&self) -> Range<usize> {
self.start..self.end
}
/// The matched text.
#[inline]
pub fn as_str(&self) -> &'t str {
&self.text[self.start..self.end]
}
// Creates a match over `text[start..end]`; the offsets are not validated here.
fn new(text: &'t str, start: usize, end: usize) -> Match<'t> {
Match { text, start, end }
}
}
impl<'t> From<Match<'t>> for &'t str {
/// Converts a match into the matched text.
fn from(m: Match<'t>) -> &'t str {
m.as_str()
}
}
impl<'t> From<Match<'t>> for Range<usize> {
/// Converts a match into its byte range.
fn from(m: Match<'t>) -> Range<usize> {
m.range()
}
}
// `len` has no `is_empty` counterpart, hence the clippy allowance.
#[allow(clippy::len_without_is_empty)]
impl<'t> Captures<'t> {
    /// Returns the match for capture group `i`, or `None` when the group
    /// does not exist or did not participate in the match. Group 0 is the
    /// overall match.
    ///
    /// Any index is accepted: the internal index arithmetic is checked, so
    /// an out-of-range `i` (up to `usize::MAX`) yields `None` instead of
    /// overflowing.
    pub fn get(&self, i: usize) -> Option<Match<'t>> {
        match &self.inner {
            CapturesImpl::Wrap {
                text,
                locations,
                explicit_capture_group_0,
            } => {
                // With an explicit group 0, the delegate's groups are shifted by one.
                let shift = if *explicit_capture_group_0 { 1 } else { 0 };
                let delegate_index = i.checked_add(shift)?;
                locations.get_group(delegate_index).map(|span| Match {
                    text,
                    start: span.start,
                    end: span.end,
                })
            }
            CapturesImpl::Fancy { text, saves } => {
                // Group `i` occupies save slots `2 * i` and `2 * i + 1`.
                let slot = i.checked_mul(2)?;
                let lo = *saves.get(slot)?;
                // `usize::MAX` marks a group that did not participate.
                if lo == usize::MAX {
                    return None;
                }
                let hi = *saves.get(slot + 1)?;
                Some(Match {
                    text,
                    start: lo,
                    end: hi,
                })
            }
        }
    }

    /// Returns the match for the group called `name`, or `None` when there
    /// is no such group or it did not participate in the match.
    pub fn name(&self, name: &str) -> Option<Match<'t>> {
        self.named_groups.get(name).and_then(|i| self.get(*i))
    }

    /// Expands `replacement` against these captures using the default
    /// [`Expander`] and appends the result to `dst`.
    pub fn expand(&self, replacement: &str, dst: &mut String) {
        Expander::default().append_expansion(dst, replacement, self);
    }

    /// Iterates over all groups in index order, starting with group 0.
    /// Groups that did not participate yield `None`.
    pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> {
        SubCaptureMatches { caps: self, i: 0 }
    }

    /// Number of capture groups, counting group 0 and groups that did not
    /// participate in the match.
    pub fn len(&self) -> usize {
        match &self.inner {
            CapturesImpl::Wrap {
                locations,
                explicit_capture_group_0,
                ..
            } => locations.group_len() - if *explicit_capture_group_0 { 1 } else { 0 },
            CapturesImpl::Fancy { saves, .. } => saves.len() / 2,
        }
    }
}
impl<'t> Index<usize> for Captures<'t> {
    type Output = str;

    /// Returns the text of group `i`.
    ///
    /// # Panics
    ///
    /// Panics when the group does not exist or did not participate in the match.
    fn index(&self, i: usize) -> &str {
        match self.get(i) {
            Some(group) => group.as_str(),
            None => panic!("no group at index '{}'", i),
        }
    }
}
impl<'t, 'i> Index<&'i str> for Captures<'t> {
    type Output = str;

    /// Returns the text of the group called `name`.
    ///
    /// # Panics
    ///
    /// Panics when there is no such group or it did not participate in the match.
    fn index<'a>(&'a self, name: &'i str) -> &'a str {
        match self.name(name) {
            Some(group) => group.as_str(),
            None => panic!("no group named '{}'", name),
        }
    }
}
impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> {
    type Item = Option<Match<'t>>;

    /// Yields the next group; the inner `Option` is `None` for a group that
    /// did not participate in the match.
    fn next(&mut self) -> Option<Option<Match<'t>>> {
        if self.i >= self.caps.len() {
            return None;
        }
        let group = self.caps.get(self.i);
        self.i += 1;
        Some(group)
    }
}
/// Regular expression syntax tree node.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Expr {
/// The empty expression; serialized as nothing.
Empty,
/// Any single character.
Any {
/// Whether the newline character is matched as well (serialized as `(?s:.)`).
newline: bool,
/// CRLF mode; with `newline` false this is serialized as `(?R-s:.)`.
crlf: bool,
},
/// An assertion; see [`Assertion`].
Assertion(Assertion),
/// A general newline sequence.
// NOTE(review): presumably the `\R` escape — confirm in the parser.
GeneralNewline {
/// Unicode mode flag for this node.
unicode: bool,
},
/// A literal string.
Literal {
/// The text to match.
val: String,
/// Whether the text is matched case-insensitively.
casei: bool,
},
/// A sequence of expressions matched one after another.
Concat(Vec<Expr>),
/// Alternatives; serialized with `|` between the children.
Alt(Vec<Expr>),
/// A capture group around the inner expression.
Group(Arc<Expr>),
/// A look-around of the given kind around the inner expression.
LookAround(Box<Expr>, LookAround),
/// Repetition of the child expression.
Repeat {
/// The expression being repeated.
child: Box<Expr>,
/// Minimum number of repetitions.
lo: usize,
/// Maximum number of repetitions; `usize::MAX` means unbounded.
hi: usize,
/// Greedy when true; non-greedy repetitions are serialized with a trailing `?`.
greedy: bool,
},
/// A piece of pattern text that is passed to the delegate regex as is.
Delegate {
/// The pattern text.
inner: String,
/// Whether the text is matched case-insensitively.
casei: bool,
},
/// A back reference to a capture group.
Backref {
/// Index of the referenced group.
group: usize,
/// Whether the comparison is case-insensitive.
casei: bool,
},
/// A back reference to a capture group at a relative recursion level.
BackrefWithRelativeRecursionLevel {
/// Index of the referenced group.
group: usize,
/// Recursion level relative to the current one.
relative_level: isize,
/// Whether the comparison is case-insensitive.
casei: bool,
},
/// An atomic (non-backtracking) group around the inner expression.
AtomicGroup(Box<Expr>),
/// Keeps the text matched so far out of the overall match.
// NOTE(review): presumably the `\K` escape — confirm in the parser.
KeepOut,
/// Anchors the match at the end of the previous match.
// NOTE(review): presumably the `\G` escape — confirm in the parser.
ContinueFromPreviousMatchEnd,
/// Condition that tests whether a capture group has matched.
BackrefExistsCondition {
/// Index of the tested group.
group: usize,
/// Optional recursion level, relative to the current one.
relative_recursion_level: Option<isize>,
},
/// If/then/else conditional.
Conditional {
/// The condition to evaluate.
condition: Box<Expr>,
/// Matched when the condition holds.
true_branch: Box<Expr>,
/// Matched when the condition does not hold.
false_branch: Box<Expr>,
},
/// A subroutine call to the capture group with the given index.
SubroutineCall(usize),
/// A backtracking control verb; see [`BacktrackingControlVerb`].
BacktrackingControlVerb(BacktrackingControlVerb),
/// An absence operator; see [`Absent`].
Absent(Absent),
/// A group whose content only provides definitions; serialized as nothing.
DefineGroup {
/// The defined expressions.
definitions: Box<Expr>,
},
/// A syntactic node that still carries an unresolved group target; see [`AstNode`].
// NOTE(review): the meaning of the `usize` is not visible in this file
// (possibly a source position) — confirm in the parser.
AstNode(AstNode, usize),
}
/// How a back reference or subroutine call designates its capture group.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum CaptureGroupTarget {
/// By absolute group number.
ByNumber(usize),
/// By group name.
ByName(String),
/// By an offset relative to the current position in the pattern.
// NOTE(review): sign convention is not visible in this file — confirm in the parser.
Relative(isize),
}
/// Syntactic nodes whose group targets are not yet resolved to indices.
/// They are carried inside [`Expr::AstNode`].
// NOTE(review): presumably produced by the parser and resolved to the
// corresponding `Expr` variants later — confirm in `parse`.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum AstNode {
/// A capture group.
AstGroup {
/// The group's name, if it has one.
name: Option<String>,
/// The group's content.
inner: Box<Expr>,
},
/// A back reference.
Backref {
/// The referenced group.
target: CaptureGroupTarget,
/// Whether the comparison is case-insensitive.
casei: bool,
/// Optional recursion level, relative to the current one.
relative_recursion_level: Option<isize>,
},
/// A subroutine call to the given group.
SubroutineCall(CaptureGroupTarget),
/// Condition that tests whether a capture group has matched.
BackrefExistsCondition {
/// The tested group.
target: CaptureGroupTarget,
/// Optional recursion level, relative to the current one.
relative_recursion_level: Option<isize>,
},
}
/// The kind of a look-around assertion.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum LookAround {
/// Positive look-ahead.
LookAhead,
/// Negative look-ahead.
LookAheadNeg,
/// Positive look-behind.
LookBehind,
/// Negative look-behind.
LookBehindNeg,
}
/// The forms of the absence operator carried by [`Expr::Absent`].
// NOTE(review): the variant names follow Oniguruma's absent-function terminology;
// the exact semantics are implemented outside this file.
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum Absent {
/// Absent repeater over the inner expression.
Repeater(Box<Expr>),
/// Absent expression: `exp` constrained by `absent`.
Expression {
/// The expression that must be absent.
absent: Box<Expr>,
/// The expression to match.
exp: Box<Expr>,
},
/// Absent stopper for the inner expression.
Stopper(Box<Expr>),
/// Range clear; has no child expression.
Clear,
}
/// Backtracking control verbs carried by [`Expr::BacktrackingControlVerb`].
// NOTE(review): presumably the `(*FAIL)`, `(*ACCEPT)`, `(*COMMIT)`, `(*SKIP)` and
// `(*PRUNE)` verbs — confirm the accepted spellings in the parser.
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum BacktrackingControlVerb {
/// The "fail" verb.
Fail,
/// The "accept" verb.
Accept,
/// The "commit" verb.
Commit,
/// The "skip" verb.
Skip,
/// The "prune" verb.
Prune,
}
/// Iterator over the names of a regex's capture groups in index order,
/// created by [`Regex::capture_names`]. Unnamed groups yield `None`.
pub struct CaptureNames<'r>(vec::IntoIter<Option<&'r str>>);
impl Debug for CaptureNames<'_> {
// Prints a fixed placeholder; the remaining names are not shown.
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
f.write_str("<CaptureNames>")
}
}
impl<'r> Iterator for CaptureNames<'r> {
type Item = Option<&'r str>;
// Forwards to the underlying vector iterator.
fn next(&mut self) -> Option<Self::Item> {
self.0.next()
}
}
// Appends the decimal representation of `x` to `s` without allocating.
fn push_usize(s: &mut String, x: usize) {
    // Find the place value of the most significant digit.
    let mut place = 1;
    while x / place >= 10 {
        place *= 10;
    }
    // Emit the digits from most to least significant.
    while place > 0 {
        let digit = (x / place % 10) as u8;
        s.push((b'0' + digit) as char);
        place /= 10;
    }
}
// Reports whether `c` is one of the characters that `push_quoted` and
// `escape` prefix with a backslash.
fn is_special(c: char) -> bool {
    const SPECIAL: &str = "\\.+*?()|[]{}^$#";
    SPECIAL.contains(c)
}
// Appends `s` to `buf`, putting a backslash in front of every special character.
fn push_quoted(buf: &mut String, s: &str) {
    s.chars().for_each(|ch| {
        match is_special(ch) {
            true => buf.push('\\'),
            false => (),
        }
        buf.push(ch);
    });
}
/// Escapes the special characters in `text` with a backslash so that the
/// result can be used as a pattern matching `text` literally.
///
/// Returns `text` borrowed when it contains no special characters.
pub fn escape(text: &str) -> Cow<'_, str> {
    // All special characters are ASCII, so they can be counted on the bytes.
    let extra = text
        .bytes()
        .filter(|&byte| is_special(byte as char))
        .count();
    if extra == 0 {
        return Cow::Borrowed(text);
    }
    // One extra byte (the backslash) per special character.
    let mut quoted = String::with_capacity(text.len() + extra);
    push_quoted(&mut quoted, text);
    Cow::Owned(quoted)
}
/// Assertions carried by [`Expr::Assertion`].
#[derive(Debug, PartialEq, Eq, Clone, Copy)]
pub enum Assertion {
    /// Start of the text; serialized as `^` for the delegate regex.
    StartText,
    /// End of the text; serialized as `$` for the delegate regex.
    EndText,
    /// End of the text, ignoring trailing newlines.
    EndTextIgnoreTrailingNewlines {
        /// CRLF mode.
        crlf: bool,
    },
    /// Start of a line; serialized as `(?m:^)`, or `(?Rm:^)` in CRLF mode.
    StartLine {
        /// CRLF mode.
        crlf: bool,
    },
    /// End of a line; serialized as `(?m:$)`, or `(?Rm:$)` in CRLF mode.
    EndLine {
        /// CRLF mode.
        crlf: bool,
    },
    /// Left word boundary.
    LeftWordBoundary,
    /// Left word half boundary.
    LeftWordHalfBoundary,
    /// Right word boundary.
    RightWordBoundary,
    /// Right word half boundary.
    RightWordHalfBoundary,
    /// Word boundary.
    WordBoundary,
    /// Not a word boundary.
    NotWordBoundary,
}

impl Assertion {
    /// Reports whether this assertion counts as "hard". The assertions that
    /// are not hard are exactly the ones `Expr::to_str` can serialize.
    pub(crate) fn is_hard(&self) -> bool {
        match self {
            Assertion::StartText
            | Assertion::EndText
            | Assertion::StartLine { .. }
            | Assertion::EndLine { .. } => false,
            Assertion::EndTextIgnoreTrailingNewlines { .. }
            | Assertion::LeftWordBoundary
            | Assertion::LeftWordHalfBoundary
            | Assertion::RightWordBoundary
            | Assertion::RightWordHalfBoundary
            | Assertion::WordBoundary
            | Assertion::NotWordBoundary => true,
        }
    }
}
/// Iterator over shared references to the direct children of an [`Expr`],
/// created by [`Expr::children_iter`].
#[derive(Debug)]
pub enum ExprChildrenIter<'a> {
/// The node has no children.
Empty,
/// The node has one child; `None` once it has been yielded.
Single(Option<&'a Expr>),
/// The node stores its children in a vector.
Vec(alloc::slice::Iter<'a, Expr>),
/// The node has up to three children, yielded in field order.
Triple {
/// First child, `None` once yielded.
first: Option<&'a Expr>,
/// Second child, `None` once yielded.
second: Option<&'a Expr>,
/// Third child, `None` once yielded or when the node has only two children.
third: Option<&'a Expr>,
},
}
/// Iterator over mutable references to the direct children of an [`Expr`],
/// created by [`Expr::children_iter_mut`].
#[derive(Debug)]
pub enum ExprChildrenIterMut<'a> {
/// The node has no children.
Empty,
/// The node has one child; `None` once it has been yielded.
Single(Option<&'a mut Expr>),
/// The node stores its children in a vector.
Vec(alloc::slice::IterMut<'a, Expr>),
/// The node has up to three children, yielded in field order.
Triple {
/// First child, `None` once yielded.
first: Option<&'a mut Expr>,
/// Second child, `None` once yielded.
second: Option<&'a mut Expr>,
/// Third child, `None` once yielded or when the node has only two children.
third: Option<&'a mut Expr>,
},
}
impl<'a> Iterator for ExprChildrenIter<'a> {
type Item = &'a Expr;
fn next(&mut self) -> Option<Self::Item> {
match self {
ExprChildrenIter::Empty => None,
ExprChildrenIter::Single(ref mut child) => child.take(),
ExprChildrenIter::Vec(ref mut iter) => iter.next(),
ExprChildrenIter::Triple {
ref mut first,
ref mut second,
ref mut third,
} => first
.take()
.or_else(|| second.take())
.or_else(|| third.take()),
}
}
}
impl<'a> Iterator for ExprChildrenIterMut<'a> {
type Item = &'a mut Expr;
fn next(&mut self) -> Option<Self::Item> {
match self {
ExprChildrenIterMut::Empty => None,
ExprChildrenIterMut::Single(ref mut child) => child.take(),
ExprChildrenIterMut::Vec(ref mut iter) => iter.next(),
ExprChildrenIterMut::Triple {
ref mut first,
ref mut second,
ref mut third,
} => first
.take()
.or_else(|| second.take())
.or_else(|| third.take()),
}
}
}
// Shared body of `Expr::children_iter` and `Expr::children_iter_mut`.
//
// Arguments:
// - `$self`: the expression whose children are iterated,
// - `$iter`: the iterator enum to build (`ExprChildrenIter` or `ExprChildrenIterMut`),
// - `$vec_method`: method producing a slice iterator (`iter` / `iter_mut`),
// - `$single_method`: method turning a `Box<Expr>` into a reference (`as_ref` / `as_mut`),
// - `$group_method`: associated function of `Arc` used for `Expr::Group`
//   (`as_ref` / `make_mut`).
macro_rules! children_iter_match {
($self:expr, $iter:ident, $vec_method:ident, $single_method:ident, $group_method:ident) => {
match $self {
Expr::Concat(children) | Expr::Alt(children) => $iter::Vec(children.$vec_method()),
Expr::Group(child) => $iter::Single(Some(Arc::$group_method(child))),
Expr::Absent(Absent::Repeater(child))
| Expr::Absent(Absent::Stopper(child))
| Expr::LookAround(child, _)
| Expr::AtomicGroup(child)
| Expr::Repeat { child, .. } => $iter::Single(Some(child.$single_method())),
Expr::Conditional {
condition,
true_branch,
false_branch,
} => $iter::Triple {
first: Some(condition.$single_method()),
second: Some(true_branch.$single_method()),
third: Some(false_branch.$single_method()),
},
// Only two children: the third slot stays empty.
Expr::Absent(Absent::Expression { absent, exp }) => $iter::Triple {
first: Some(absent.$single_method()),
second: Some(exp.$single_method()),
third: None,
},
Expr::DefineGroup { definitions } => $iter::Single(Some(definitions.$single_method())),
_ if $self.is_leaf_node() => $iter::Empty,
// A variant that is neither handled above nor listed in `is_leaf_node`.
_ => unimplemented!(),
}
};
}
impl Expr {
/// Parses `re` into an expression tree using the parser's default flags.
///
/// # Errors
///
/// Returns the parser's error when `re` is not a valid pattern.
pub fn parse_tree(re: &str) -> Result<ExprTree> {
Parser::parse(re)
}
/// Parses `re` into an expression tree with the given parser flags
/// (a combination of the `FLAG_*` constants).
///
/// # Errors
///
/// Returns the parser's error when `re` is not a valid pattern.
pub fn parse_tree_with_flags(re: &str, flags: u32) -> Result<ExprTree> {
Parser::parse_with_flags(re, flags)
}
/// Reports whether this node is one of the variants treated as having no
/// child expressions. `children_iter` and `children_iter_mut` yield nothing
/// for these.
///
/// Note that `Expr::AstNode` is listed here, so the expression inside an
/// `AstNode::AstGroup` is not visited by the children iterators.
pub fn is_leaf_node(&self) -> bool {
matches!(
self,
Expr::Empty
| Expr::Any { .. }
| Expr::Assertion(_)
| Expr::GeneralNewline { .. }
| Expr::Literal { .. }
| Expr::Delegate { .. }
| Expr::Backref { .. }
| Expr::BackrefWithRelativeRecursionLevel { .. }
| Expr::KeepOut
| Expr::ContinueFromPreviousMatchEnd
| Expr::BackrefExistsCondition { .. }
| Expr::BacktrackingControlVerb(_)
| Expr::SubroutineCall(_)
| Expr::Absent(Absent::Clear)
| Expr::AstNode(..),
)
}
/// Reports whether `predicate` holds for any descendant of this node.
/// The node itself is not tested, only its children and their descendants.
pub fn has_descendant(&self, predicate: impl Fn(&Expr) -> bool) -> bool {
    // Explicit work list instead of recursion.
    let mut pending: Vec<&Expr> = self.children_iter().collect();
    loop {
        match pending.pop() {
            None => return false,
            Some(node) if predicate(node) => return true,
            Some(node) => pending.extend(node.children_iter()),
        }
    }
}
/// Returns an iterator over shared references to the direct children of
/// this node.
///
/// # Panics
///
/// Panics (`unimplemented!`) for a variant that has no arm in
/// `children_iter_match!` and is not listed in `is_leaf_node`.
pub fn children_iter(&self) -> ExprChildrenIter<'_> {
children_iter_match!(self, ExprChildrenIter, iter, as_ref, as_ref)
}
/// Returns an iterator over mutable references to the direct children of
/// this node.
///
/// For `Expr::Group` the child is obtained with `Arc::make_mut`, which
/// clones the inner expression first if the `Arc` is shared.
///
/// # Panics
///
/// Panics (`unimplemented!`) for a variant that has no arm in
/// `children_iter_match!` and is not listed in `is_leaf_node`.
pub fn children_iter_mut(&mut self) -> ExprChildrenIterMut<'_> {
children_iter_match!(self, ExprChildrenIterMut, iter_mut, as_mut, make_mut)
}
/// Serializes this expression as regex syntax and appends it to `buf`.
/// The result is what gets handed to the delegate regex.
///
/// `precedence` describes the context the expression is printed in and
/// decides whether a non-capturing group `(?:...)` is needed around it:
/// alternations are wrapped when `precedence > 0`, concatenations when
/// `precedence > 1`, repetitions when `precedence > 2`.
///
/// # Panics
///
/// Panics for any variant not handled below, i.e. expressions that the
/// delegate regex cannot express.
pub fn to_str(&self, buf: &mut String, precedence: u8) {
match *self {
Expr::Empty => (),
// The flags are spelled out inline so the output does not depend on the
// flags of the surrounding pattern.
Expr::Any { newline, crlf } => buf.push_str(match (newline, crlf) {
(true, _) => "(?s:.)",
(false, true) => "(?R-s:.)",
(false, false) => ".",
}),
Expr::Literal { ref val, casei } => {
if casei {
buf.push_str("(?i:");
}
push_quoted(buf, val);
if casei {
buf.push(')');
}
}
Expr::Assertion(Assertion::StartText) => buf.push('^'),
Expr::Assertion(Assertion::EndText) => buf.push('$'),
Expr::Assertion(Assertion::StartLine { crlf: false }) => buf.push_str("(?m:^)"),
Expr::Assertion(Assertion::EndLine { crlf: false }) => buf.push_str("(?m:$)"),
Expr::Assertion(Assertion::StartLine { crlf: true }) => buf.push_str("(?Rm:^)"),
Expr::Assertion(Assertion::EndLine { crlf: true }) => buf.push_str("(?Rm:$)"),
Expr::Concat(ref children) => {
if precedence > 1 {
buf.push_str("(?:");
}
for child in children {
child.to_str(buf, 2);
}
if precedence > 1 {
buf.push(')')
}
}
Expr::Alt(_) => {
if precedence > 0 {
buf.push_str("(?:");
}
// First alternative without a separator, then `|` before each further one.
let mut children = self.children_iter();
if let Some(first) = children.next() {
first.to_str(buf, 1);
for child in children {
buf.push('|');
child.to_str(buf, 1);
}
}
if precedence > 0 {
buf.push(')');
}
}
Expr::Group(ref child) => {
buf.push('(');
child.to_str(buf, 0);
buf.push(')');
}
Expr::Repeat {
ref child,
lo,
hi,
greedy,
} => {
if precedence > 2 {
buf.push_str("(?:");
}
child.to_str(buf, 3);
// Use the short operators where they exist, `{lo}`, `{lo,}` or
// `{lo,hi}` otherwise; `usize::MAX` stands for "unbounded".
match (lo, hi) {
(0, 1) => buf.push('?'),
(0, usize::MAX) => buf.push('*'),
(1, usize::MAX) => buf.push('+'),
(lo, hi) => {
buf.push('{');
push_usize(buf, lo);
if lo != hi {
buf.push(',');
if hi != usize::MAX {
push_usize(buf, hi);
}
}
buf.push('}');
}
}
if !greedy {
buf.push('?');
}
if precedence > 2 {
buf.push(')');
}
}
// Delegate text is already regex syntax and is copied without quoting.
Expr::Delegate {
ref inner, casei, ..
} => {
if casei {
buf.push_str("(?i:");
}
buf.push_str(inner);
if casei {
buf.push(')');
}
}
// Definitions produce no output.
Expr::DefineGroup { .. } => {
}
_ => panic!("attempting to format hard expr {:?}", self),
}
}
}
// Returns the byte index of the start of the code point that ends just
// before `ix`. `ix` must be greater than 0 and at most `s.len()`; the
// function panics otherwise.
fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize {
    let bytes = s.as_bytes();
    ix -= 1;
    // Walk back over UTF-8 continuation bytes (0x80..=0xBF), which are the
    // bytes below -0x40 when read as `i8`.
    while (bytes[ix] as i8) < -0x40 {
        ix -= 1;
    }
    ix
}
// Returns the length in bytes of the UTF-8 sequence that starts with the
// lead byte `b`. Bytes in 0x80..=0xBF (continuation bytes) are reported as 2.
fn codepoint_len(b: u8) -> usize {
    match b {
        0x00..=0x7f => 1,
        0x80..=0xdf => 2,
        0xe0..=0xef => 3,
        0xf0..=0xff => 4,
    }
}
// Returns the byte index of the code point following the one that starts
// at `i`. When `i` is at or past the end of `text`, returns `i + 1`.
pub(crate) fn next_utf8(text: &str, i: usize) -> usize {
    text.as_bytes()
        .get(i)
        .map_or(i + 1, |&lead| i + codepoint_len(lead))
}
// Re-exports of crate internals (analysis, compilation, optimization, parser
// flags and VM entry points). Hidden from the documentation.
// NOTE(review): presumably intended for the crate's own tooling and tests and
// not as a stable public API — confirm before relying on it.
#[doc(hidden)]
pub mod internal {
pub use crate::analyze::{analyze, can_compile_as_anchored, AnalyzeContext, Info};
pub use crate::compile::{compile, CompileOptions};
pub use crate::optimize::optimize;
pub use crate::parse_flags::{
FLAG_CASEI, FLAG_CRLF, FLAG_DOTNL, FLAG_IGNORE_NUMBERED_GROUPS_WHEN_NAMED_GROUPS_EXIST,
FLAG_IGNORE_SPACE, FLAG_MULTI, FLAG_ONIGURUMA_MODE, FLAG_UNICODE,
};
pub use crate::vm::{run_default, run_trace, Insn, Prog};
}
#[cfg(test)]
mod tests {
    use alloc::borrow::Cow;
    use alloc::boxed::Box;
    use alloc::string::{String, ToString};
    use alloc::sync::Arc;
    use alloc::vec::Vec;
    use alloc::{format, vec};
    use crate::parse::{make_group, make_literal};
    use crate::{Absent, Expr, Regex, RegexImpl};

    // Renders an expression into a fresh `String` using `Expr::to_str`,
    // passing `0` as its second argument.
    fn to_str(e: Expr) -> String {
        let mut rendered = String::new();
        e.to_str(&mut rendered, 0);
        rendered
    }

    // An alternation nested inside a concatenation is wrapped in `(?:...)`.
    #[test]
    fn to_str_concat_alt() {
        let a_or_b = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let sequence = Expr::Concat(vec![a_or_b, make_literal("c")]);
        assert_eq!(to_str(sequence), "(?:a|b)c");
    }

    // A repeated concatenation is wrapped in `(?:...)` before the quantifier.
    #[test]
    fn to_str_rep_concat() {
        let ab = Expr::Concat(vec![make_literal("a"), make_literal("b")]);
        let repeated = Expr::Repeat {
            child: Box::new(ab),
            lo: 2,
            hi: 3,
            greedy: true,
        };
        assert_eq!(to_str(repeated), "(?:ab){2,3}");
    }

    // A group around an alternation needs no extra non-capturing wrapper.
    #[test]
    fn to_str_group_alt() {
        let a_or_b = Expr::Alt(vec![make_literal("a"), make_literal("b")]);
        let grouped = Expr::Group(Arc::new(a_or_b));
        assert_eq!(to_str(grouped), "(a|b)");
    }

    // `as_str` and the `Debug` output both give back the original pattern.
    #[test]
    fn as_str_debug() {
        let pattern = r"(a+)b\1";
        let compiled = Regex::new(pattern).unwrap();
        assert_eq!(pattern, compiled.as_str());
        assert_eq!(pattern, format!("{:?}", compiled));
    }

    // The `Display` output gives back the original pattern.
    #[test]
    fn display() {
        let pattern = r"(a+)b\1";
        let compiled = Regex::new(pattern).unwrap();
        assert_eq!(pattern, compiled.to_string());
    }

    // A `Regex` can be obtained through `str::parse`.
    #[test]
    fn from_str() {
        let pattern = r"(a+)b\1";
        let compiled: Regex = pattern.parse().unwrap();
        assert_eq!(compiled.as_str(), pattern);
    }

    // Each (lo, hi, greedy) combination renders to its expected quantifier.
    #[test]
    fn to_str_repeat() {
        let render = |lo: usize, hi: usize, greedy: bool| -> String {
            to_str(Expr::Repeat {
                child: Box::new(make_literal("a")),
                lo,
                hi,
                greedy,
            })
        };
        let unbounded = usize::MAX;
        let cases: [(usize, usize, bool, &str); 12] = [
            (2, 2, true, "a{2}"),
            (2, 2, false, "a{2}?"),
            (2, 3, true, "a{2,3}"),
            (2, 3, false, "a{2,3}?"),
            (2, unbounded, true, "a{2,}"),
            (2, unbounded, false, "a{2,}?"),
            (0, 1, true, "a?"),
            (0, 1, false, "a??"),
            (0, unbounded, true, "a*"),
            (0, unbounded, false, "a*?"),
            (1, unbounded, true, "a+"),
            (1, unbounded, false, "a+?"),
        ];
        for &(lo, hi, greedy, expected) in cases.iter() {
            assert_eq!(render(lo, hi, greedy), expected);
        }
    }

    // Input without special characters comes back as a borrowed `Cow`;
    // special characters get a backslash, non-ASCII text is left alone.
    #[test]
    fn escape() {
        if let Cow::Borrowed(unchanged) = crate::escape("@foo") {
            assert_eq!(unchanged, "@foo");
        } else {
            panic!("Value should be borrowed.");
        }
        let ascii = crate::escape("fo*o").into_owned();
        assert_eq!(ascii, "fo\\*o");
        let non_ascii = crate::escape("fø*ø").into_owned();
        assert_eq!(non_ascii, "fø\\*ø");
    }

    // A trailing positive lookahead still yields the `Wrap` variant, with
    // `explicit_capture_group_0` set.
    #[test]
    fn trailing_positive_lookahead_wrap_capture_group_fixup() {
        let pattern = r"a+(?=c)";
        let compiled: Regex = pattern.parse().unwrap();
        let wrapped_with_explicit_group_0 = matches!(
            compiled.inner,
            RegexImpl::Wrap {
                explicit_capture_group_0: true,
                ..
            }
        );
        assert!(
            wrapped_with_explicit_group_0,
            "trailing positive lookahead for an otherwise easy pattern should avoid going through the VM"
        );
        assert_eq!(pattern, compiled.as_str());
        assert_eq!(pattern, format!("{:?}", compiled));
    }

    // A plain pattern yields the `Wrap` variant with an implicit group 0.
    #[test]
    fn easy_regex() {
        let pattern = r"(a+)b";
        let compiled: Regex = pattern.parse().unwrap();
        let wrapped_with_implicit_group_0 = matches!(
            compiled.inner,
            RegexImpl::Wrap {
                explicit_capture_group_0: false,
                ..
            }
        );
        assert!(
            wrapped_with_implicit_group_0,
            "easy pattern should avoid going through the VM, and capture group 0 should be implicit"
        );
        assert_eq!(pattern, compiled.as_str());
        assert_eq!(pattern, format!("{:?}", compiled));
    }

    // A pattern with an atomic group yields the `Fancy` variant.
    #[test]
    fn hard_regex() {
        let pattern = r"(a+)(?>c)";
        let compiled: Regex = pattern.parse().unwrap();
        let is_fancy = matches!(compiled.inner, RegexImpl::Fancy { .. });
        assert!(is_fancy, "hard regex should be compiled into a VM");
        assert_eq!(pattern, compiled.as_str());
        assert_eq!(pattern, format!("{:?}", compiled));
    }

    // Every expression listed here must report itself as a leaf node.
    #[test]
    fn test_is_leaf_node_leaf_nodes() {
        let leaves = vec![
            Expr::Empty,
            Expr::Any {
                newline: false,
                crlf: false,
            },
            Expr::Any {
                newline: true,
                crlf: false,
            },
            Expr::Assertion(crate::Assertion::StartText),
            Expr::Literal {
                val: "test".to_string(),
                casei: false,
            },
            Expr::Delegate {
                inner: "[0-9]".to_string(),
                casei: false,
            },
            Expr::Backref {
                group: 1,
                casei: false,
            },
            Expr::BackrefWithRelativeRecursionLevel {
                group: 1,
                relative_level: -1,
                casei: false,
            },
            Expr::KeepOut,
            Expr::ContinueFromPreviousMatchEnd,
            Expr::BackrefExistsCondition {
                group: 1,
                relative_recursion_level: None,
            },
            Expr::BacktrackingControlVerb(crate::BacktrackingControlVerb::Fail),
            Expr::SubroutineCall(1),
            Expr::Absent(Absent::Clear),
        ];
        for leaf in leaves {
            assert!(leaf.is_leaf_node());
        }
    }

    // Every expression listed here wraps at least one child expression and
    // must not report itself as a leaf node.
    #[test]
    fn test_is_leaf_node_non_leaf_nodes() {
        let any_char_star = Expr::Repeat {
            child: Box::new(Expr::Any {
                newline: true,
                crlf: false,
            }),
            lo: 0,
            hi: usize::MAX,
            greedy: true,
        };
        let non_leaves = vec![
            Expr::Concat(vec![make_literal("a")]),
            Expr::Alt(vec![make_literal("a"), make_literal("b")]),
            make_group(make_literal("a")),
            Expr::LookAround(Box::new(make_literal("a")), crate::LookAround::LookAhead),
            Expr::Repeat {
                child: Box::new(make_literal("a")),
                lo: 0,
                hi: 1,
                greedy: true,
            },
            Expr::AtomicGroup(Box::new(make_literal("a"))),
            Expr::Conditional {
                condition: Box::new(Expr::BackrefExistsCondition {
                    group: 1,
                    relative_recursion_level: None,
                }),
                true_branch: Box::new(make_literal("a")),
                false_branch: Box::new(Expr::Empty),
            },
            Expr::Absent(Absent::Repeater(Box::new(make_literal("a")))),
            Expr::Absent(Absent::Expression {
                absent: Box::new(make_literal("/*")),
                exp: Box::new(any_char_star),
            }),
            Expr::Absent(Absent::Stopper(Box::new(make_literal("/*")))),
        ];
        for node in non_leaves {
            assert!(!node.is_leaf_node());
        }
    }

    // Expressions without children yield nothing from `children_iter`.
    #[test]
    fn test_children_iter_empty() {
        let empty = Expr::Empty;
        assert!(empty.children_iter().next().is_none());

        let literal = make_literal("test");
        assert!(literal.children_iter().next().is_none());
    }

    // A group and a repeat each yield exactly one child.
    #[test]
    fn test_children_iter_single() {
        let leaf = make_literal("a");

        let grouped = make_group(leaf.clone());
        let kids = grouped.children_iter().collect::<Vec<_>>();
        assert_eq!(kids.len(), 1);

        let optional = Expr::Repeat {
            child: Box::new(leaf),
            lo: 0,
            hi: 1,
            greedy: true,
        };
        let kids = optional.children_iter().collect::<Vec<_>>();
        assert_eq!(kids.len(), 1);
    }

    // `Concat` and `Alt` yield one child per element of their vector.
    #[test]
    fn test_children_iter_vec() {
        let parts = vec![make_literal("a"), make_literal("b"), make_literal("c")];

        let sequence = Expr::Concat(parts.clone());
        let kids = sequence.children_iter().collect::<Vec<_>>();
        assert_eq!(kids.len(), 3);

        let choice = Expr::Alt(parts);
        let kids = choice.children_iter().collect::<Vec<_>>();
        assert_eq!(kids.len(), 3);
    }

    // A conditional yields three children; an `Absent::Expression` yields two.
    #[test]
    fn test_children_iter_triple() {
        let conditional = Expr::Conditional {
            condition: Box::new(Expr::BackrefExistsCondition {
                group: 1,
                relative_recursion_level: None,
            }),
            true_branch: Box::new(make_literal("a")),
            false_branch: Box::new(make_literal("b")),
        };
        let kids = conditional.children_iter().collect::<Vec<_>>();
        assert_eq!(kids.len(), 3);

        let any_char_star = Expr::Repeat {
            child: Box::new(Expr::Any {
                newline: true,
                crlf: false,
            }),
            lo: 0,
            hi: usize::MAX,
            greedy: true,
        };
        let absent_expression = Expr::Absent(Absent::Expression {
            absent: Box::new(make_literal("/*")),
            exp: Box::new(any_char_star),
        });
        let kids = absent_expression.children_iter().collect::<Vec<_>>();
        assert_eq!(kids.len(), 2);
    }
}