use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::iter::Peekable;
use std::str::Chars;
#[derive(Debug, Clone)]
pub(crate) struct Regex {
ast: Expr,
capture_count: usize,
named_captures: HashMap<String, Vec<usize>>,
named_capture_order: Vec<String>,
ignore_case: bool,
multi_line: bool,
dot_matches_new_line: bool,
unicode_mode: bool,
}
impl Regex {
pub(crate) fn new(pattern: &str) -> Result<Self, RegexError> {
RegexBuilder::new(pattern).build()
}
pub(crate) fn is_match(&self, input: &str) -> Result<bool, RegexError> {
Ok(self.find(input)?.is_some())
}
pub(crate) fn captures(&self, input: &str) -> Result<Option<Captures>, RegexError> {
self.captures_from_pos(input, 0)
}
pub(crate) fn captures_from_pos(
&self,
input: &str,
start: usize,
) -> Result<Option<Captures>, RegexError> {
let prepared = PreparedInput::new(input);
if start > prepared.text_len_bytes() {
return Ok(None);
}
let Some(start_char) = prepared.byte_to_char_index(start) else {
return Err(RegexError::new(
"search start is not a valid UTF-8 character boundary",
));
};
Ok(self
.find_spans_from_char(&prepared, start_char)
.map(|spans| self.spans_to_captures(&prepared, spans)))
}
pub(crate) fn find(&self, input: &str) -> Result<Option<Match>, RegexError> {
let captures = self.captures(input)?;
Ok(captures.and_then(|c| c.get(0).cloned()))
}
fn find_spans_from_char(
&self,
prepared: &PreparedInput<'_>,
start_char: usize,
) -> Option<Vec<Option<(usize, usize)>>> {
let opts = MatchOptions {
ignore_case: self.ignore_case,
multi_line: self.multi_line,
dot_matches_new_line: self.dot_matches_new_line,
unicode_mode: self.unicode_mode,
};
let max_start = prepared.text_len_chars();
for candidate_start in start_char..=max_start {
let mut caps = vec![None; self.capture_count + 1];
let initial = MatchState {
pos: candidate_start,
captures: caps.split_off(0),
};
let states = self.ast.match_states(prepared, &initial, &opts);
if states.is_empty() {
continue;
}
let mut first = states[0].clone();
first.captures[0] = Some((candidate_start, first.pos));
return Some(first.captures);
}
None
}
fn spans_to_captures(
&self,
prepared: &PreparedInput<'_>,
spans: Vec<Option<(usize, usize)>>,
) -> Captures {
let mut groups = Vec::with_capacity(spans.len());
for span in spans {
if let Some((start_char, end_char)) = span {
let start = prepared.char_to_byte_index(start_char);
let end = prepared.char_to_byte_index(end_char);
let text = prepared.slice_bytes_to_string(start, end);
groups.push(Some(Match { start, end, text }));
} else {
groups.push(None);
}
}
Captures {
groups,
named_captures: self.named_captures.clone(),
named_capture_order: self.named_capture_order.clone(),
}
}
}
#[derive(Debug, Clone)]
pub(crate) struct RegexBuilder {
pattern: String,
case_insensitive: bool,
multi_line: bool,
dot_matches_new_line: bool,
unicode_mode: bool,
unicode_sets_mode: bool,
}
impl RegexBuilder {
pub(crate) fn new(pattern: &str) -> Self {
Self {
pattern: pattern.to_string(),
case_insensitive: false,
multi_line: false,
dot_matches_new_line: false,
unicode_mode: false,
unicode_sets_mode: false,
}
}
pub(crate) fn case_insensitive(&mut self, enabled: bool) -> &mut Self {
self.case_insensitive = enabled;
self
}
pub(crate) fn multi_line(&mut self, enabled: bool) -> &mut Self {
self.multi_line = enabled;
self
}
pub(crate) fn dot_matches_new_line(&mut self, enabled: bool) -> &mut Self {
self.dot_matches_new_line = enabled;
self
}
pub(crate) fn unicode_mode(&mut self, enabled: bool) -> &mut Self {
self.unicode_mode = enabled;
self
}
pub(crate) fn unicode_sets_mode(&mut self, enabled: bool) -> &mut Self {
self.unicode_sets_mode = enabled;
self
}
pub(crate) fn build(&self) -> Result<Regex, RegexError> {
let mut parser = Parser::new(&self.pattern, self.unicode_mode, self.unicode_sets_mode);
let ast = parser.parse()?;
Ok(Regex {
ast,
capture_count: parser.capture_count,
named_captures: parser.named_captures,
named_capture_order: parser.named_capture_order,
ignore_case: self.case_insensitive,
multi_line: self.multi_line,
dot_matches_new_line: self.dot_matches_new_line,
unicode_mode: self.unicode_mode,
})
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct Captures {
groups: Vec<Option<Match>>,
named_captures: HashMap<String, Vec<usize>>,
named_capture_order: Vec<String>,
}
impl Captures {
pub(crate) fn len(&self) -> usize {
self.groups.len()
}
pub(crate) fn get(&self, index: usize) -> Option<&Match> {
self.groups.get(index).and_then(Option::as_ref)
}
pub(crate) fn get_named(&self, name: &str) -> Option<&Match> {
let indices = self.named_captures.get(name)?;
for index in indices {
if let Some(found) = self.get(*index) {
return Some(found);
}
}
None
}
pub(crate) fn has_named_groups(&self) -> bool {
!self.named_captures.is_empty()
}
pub(crate) fn named_group_names(&self) -> Vec<String> {
self.named_capture_order.clone()
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct Match {
start: usize,
end: usize,
text: String,
}
impl Match {
pub(crate) fn as_str(&self) -> &str {
self.text.as_str()
}
pub(crate) fn start(&self) -> usize {
self.start
}
pub(crate) fn end(&self) -> usize {
self.end
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct RegexError {
message: String,
}
impl RegexError {
fn new(message: impl Into<String>) -> Self {
Self {
message: message.into(),
}
}
}
impl fmt::Display for RegexError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}", self.message)
}
}
impl std::error::Error for RegexError {}
pub(crate) fn escape(value: &str) -> Cow<'_, str> {
let mut out = String::with_capacity(value.len());
let mut changed = false;
for (idx, ch) in value.chars().enumerate() {
if idx == 0 && ch.is_ascii_alphanumeric() {
append_hex_escape(&mut out, ch as u32, 2);
changed = true;
continue;
}
match ch {
'\n' => {
out.push_str("\\n");
changed = true;
}
'\r' => {
out.push_str("\\r");
changed = true;
}
'\t' => {
out.push_str("\\t");
changed = true;
}
'\u{000B}' => {
out.push_str("\\v");
changed = true;
}
'\u{000C}' => {
out.push_str("\\f");
changed = true;
}
'\u{0020}' => {
out.push_str("\\x20");
changed = true;
}
'\u{00A0}' => {
out.push_str("\\xa0");
changed = true;
}
_ if is_regex_meta(ch) => {
out.push('\\');
out.push(ch);
changed = true;
}
_ if is_regexp_escape_hex_punctuator(ch) => {
append_hex_escape(&mut out, ch as u32, 2);
changed = true;
}
_ if is_regexp_escape_unicode_whitespace(ch) => {
append_hex_escape(&mut out, ch as u32, 4);
changed = true;
}
_ => out.push(ch),
}
}
if changed {
Cow::Owned(out)
} else {
Cow::Borrowed(value)
}
}
#[derive(Debug, Clone)]
struct PreparedInput<'a> {
src: &'a str,
chars: Vec<char>,
char_to_byte: Vec<usize>,
}
impl<'a> PreparedInput<'a> {
fn new(src: &'a str) -> Self {
let mut chars = Vec::new();
let mut char_to_byte = Vec::new();
for (idx, ch) in src.char_indices() {
chars.push(ch);
char_to_byte.push(idx);
}
char_to_byte.push(src.len());
Self {
src,
chars,
char_to_byte,
}
}
fn text_len_chars(&self) -> usize {
self.chars.len()
}
fn text_len_bytes(&self) -> usize {
self.src.len()
}
fn char_at(&self, idx: usize) -> Option<char> {
self.chars.get(idx).copied()
}
fn char_to_byte_index(&self, idx: usize) -> usize {
self.char_to_byte
.get(idx)
.copied()
.unwrap_or(self.src.len())
}
fn byte_to_char_index(&self, byte_idx: usize) -> Option<usize> {
self.char_to_byte.binary_search(&byte_idx).ok()
}
fn slice_bytes_to_string(&self, start: usize, end: usize) -> String {
self.src.get(start..end).unwrap_or_default().to_string()
}
fn is_word_boundary(&self, pos: usize, ignore_case: bool, unicode_mode: bool) -> bool {
let prev = pos
.checked_sub(1)
.and_then(|idx| self.char_at(idx))
.is_some_and(|ch| is_word_char_regex(ch, ignore_case, unicode_mode));
let next = self
.char_at(pos)
.is_some_and(|ch| is_word_char_regex(ch, ignore_case, unicode_mode));
prev != next
}
fn unicode_char_at(&self, pos: usize) -> Option<(char, usize)> {
let first = self.char_at(pos)?;
let Some(high) = deinternalize_surrogate_marker(first) else {
return Some((first, 1));
};
if !(0xD800..=0xDBFF).contains(&high) {
return Some((first, 1));
}
let Some(second) = self.char_at(pos + 1) else {
return Some((first, 1));
};
let Some(low) = deinternalize_surrogate_marker(second) else {
return Some((first, 1));
};
if !(0xDC00..=0xDFFF).contains(&low) {
return Some((first, 1));
}
let cp = 0x1_0000 + (((high as u32 - 0xD800) << 10) | (low as u32 - 0xDC00));
let scalar = char::from_u32(cp).expect("valid surrogate pair must decode to scalar");
Some((scalar, 2))
}
fn unicode_char_before(&self, end: usize) -> Option<(char, usize)> {
let last_pos = end.checked_sub(1)?;
let last = self.char_at(last_pos)?;
let Some(low) = deinternalize_surrogate_marker(last) else {
return Some((last, 1));
};
if !(0xDC00..=0xDFFF).contains(&low) {
return Some((last, 1));
}
let Some(high_pos) = end.checked_sub(2) else {
return Some((last, 1));
};
let Some(high_marker) = self.char_at(high_pos) else {
return Some((last, 1));
};
let Some(high) = deinternalize_surrogate_marker(high_marker) else {
return Some((last, 1));
};
if !(0xD800..=0xDBFF).contains(&high) {
return Some((last, 1));
}
let cp = 0x1_0000 + (((high as u32 - 0xD800) << 10) | (low as u32 - 0xDC00));
let scalar = char::from_u32(cp).expect("valid surrogate pair must decode to scalar");
Some((scalar, 2))
}
}
#[derive(Debug, Clone)]
struct MatchState {
pos: usize,
captures: Vec<Option<(usize, usize)>>,
}
#[derive(Debug, Clone, Copy)]
struct MatchOptions {
ignore_case: bool,
multi_line: bool,
dot_matches_new_line: bool,
unicode_mode: bool,
}
#[derive(Debug, Clone)]
enum Expr {
Empty,
Sequence(Vec<Expr>),
Alternation(Vec<Expr>),
Literal(char),
Dot,
StartAnchor,
EndAnchor,
WordBoundary(bool),
CharClass(CharClass),
UnicodeStringProperty(UnicodeStringProperty),
ClassSetOperation {
operands: Vec<CharClass>,
operators: Vec<ClassSetOperator>,
negated: bool,
},
Group {
index: Option<usize>,
expr: Box<Expr>,
},
FlagModifierGroup {
expr: Box<Expr>,
ignore_case: Option<bool>,
multi_line: Option<bool>,
dot_matches_new_line: Option<bool>,
},
LookAhead {
positive: bool,
expr: Box<Expr>,
},
LookBehind {
positive: bool,
expr: Box<Expr>,
},
NamedBackReference(String),
DecimalEscape(String),
BackReference(usize),
BackReferenceAny(Vec<usize>),
Quantifier {
expr: Box<Expr>,
min: usize,
max: Option<usize>,
greedy: bool,
},
}
impl Expr {
fn match_states(
&self,
prepared: &PreparedInput<'_>,
state: &MatchState,
opts: &MatchOptions,
) -> Vec<MatchState> {
match self {
Self::Empty => vec![state.clone()],
Self::Sequence(parts) => {
let mut states = vec![state.clone()];
for part in parts {
let mut next = Vec::new();
for candidate in &states {
next.extend(part.match_states(prepared, candidate, opts));
}
if next.is_empty() {
return Vec::new();
}
states = next;
}
states
}
Self::Alternation(branches) => {
let mut out = Vec::new();
for branch in branches {
out.extend(branch.match_states(prepared, state, opts));
}
out
}
Self::Literal(expected) => {
if opts.unicode_mode {
let Some((actual, width)) = prepared.unicode_char_at(state.pos) else {
return Vec::new();
};
if chars_equal(actual, *expected, opts.ignore_case, opts.unicode_mode) {
let mut next = state.clone();
next.pos += width;
vec![next]
} else {
Vec::new()
}
} else {
let Some(actual) = prepared.char_at(state.pos) else {
return Vec::new();
};
if chars_equal(actual, *expected, opts.ignore_case, opts.unicode_mode) {
let mut next = state.clone();
next.pos += 1;
vec![next]
} else {
Vec::new()
}
}
}
Self::Dot => {
if opts.unicode_mode {
let Some((actual, width)) = prepared.unicode_char_at(state.pos) else {
return Vec::new();
};
if opts.dot_matches_new_line || !is_line_terminator(actual) {
let mut next = state.clone();
next.pos += width;
vec![next]
} else {
Vec::new()
}
} else {
let Some(actual) = prepared.char_at(state.pos) else {
return Vec::new();
};
if opts.dot_matches_new_line || !is_line_terminator(actual) {
let mut next = state.clone();
next.pos += 1;
vec![next]
} else {
Vec::new()
}
}
}
Self::StartAnchor => {
let at_start = state.pos == 0;
let at_line_start = opts.multi_line
&& state
.pos
.checked_sub(1)
.and_then(|idx| prepared.char_at(idx))
.is_some_and(is_line_terminator);
if at_start || at_line_start {
vec![state.clone()]
} else {
Vec::new()
}
}
Self::EndAnchor => {
let at_end = state.pos == prepared.text_len_chars();
let at_line_end =
opts.multi_line && prepared.char_at(state.pos).is_some_and(is_line_terminator);
if at_end || at_line_end {
vec![state.clone()]
} else {
Vec::new()
}
}
Self::WordBoundary(expected) => {
if prepared.is_word_boundary(state.pos, opts.ignore_case, opts.unicode_mode)
== *expected
{
vec![state.clone()]
} else {
Vec::new()
}
}
Self::CharClass(class) => {
let ends =
class.match_ends(prepared, state.pos, opts.ignore_case, opts.unicode_mode);
if ends.is_empty() {
Vec::new()
} else {
let mut out = Vec::with_capacity(ends.len());
for end in ends {
let mut next = state.clone();
next.pos = end;
out.push(next);
}
out
}
}
Self::UnicodeStringProperty(property) => {
let Some(next_pos) = property.match_at(prepared, state.pos) else {
return Vec::new();
};
let mut next = state.clone();
next.pos = next_pos;
vec![next]
}
Self::ClassSetOperation {
operands,
operators,
negated,
} => {
let ends = class_set_operation_match_ends(
prepared,
state.pos,
operands,
operators,
*negated,
opts.ignore_case,
opts.unicode_mode,
);
if ends.is_empty() {
return Vec::new();
}
let mut out = Vec::with_capacity(ends.len());
for end in ends {
let mut next = state.clone();
next.pos = end;
out.push(next);
}
out
}
Self::Group { index, expr } => {
let start = state.pos;
let states = expr.match_states(prepared, state, opts);
let mut out = Vec::with_capacity(states.len());
for mut candidate in states {
if let Some(idx) = index {
if let Some(slot) = candidate.captures.get_mut(*idx) {
*slot = Some((start, candidate.pos));
}
}
out.push(candidate);
}
out
}
Self::FlagModifierGroup {
expr,
ignore_case,
multi_line,
dot_matches_new_line,
} => {
let mut local_opts = *opts;
if let Some(value) = ignore_case {
local_opts.ignore_case = *value;
}
if let Some(value) = multi_line {
local_opts.multi_line = *value;
}
if let Some(value) = dot_matches_new_line {
local_opts.dot_matches_new_line = *value;
}
expr.match_states(prepared, state, &local_opts)
}
Self::LookAhead { positive, expr } => {
let inner = expr.match_states(prepared, state, opts);
if *positive {
let Some(mut matched) = inner.into_iter().next() else {
return Vec::new();
};
matched.pos = state.pos;
vec![matched]
} else if inner.is_empty() {
vec![state.clone()]
} else {
Vec::new()
}
}
Self::LookBehind { positive, expr } => {
let inner = expr.match_states_backward(prepared, state, opts);
if *positive {
let Some(mut matched) = inner.into_iter().next() else {
return Vec::new();
};
matched.pos = state.pos;
vec![matched]
} else if inner.is_empty() {
vec![state.clone()]
} else {
Vec::new()
}
}
Self::BackReference(index) => {
let Some(group) = state.captures.get(*index).copied().flatten() else {
return vec![state.clone()];
};
let len = group.1.saturating_sub(group.0);
if state.pos.saturating_add(len) > prepared.text_len_chars() {
return Vec::new();
}
for offset in 0..len {
let Some(left) = prepared.char_at(group.0 + offset) else {
return Vec::new();
};
let Some(right) = prepared.char_at(state.pos + offset) else {
return Vec::new();
};
if !chars_equal(left, right, opts.ignore_case, opts.unicode_mode) {
return Vec::new();
}
}
let mut next = state.clone();
next.pos += len;
vec![next]
}
Self::BackReferenceAny(indices) => {
let Some(group) = indices
.iter()
.filter_map(|idx| state.captures.get(*idx).copied().flatten())
.next()
else {
return vec![state.clone()];
};
let len = group.1.saturating_sub(group.0);
if state.pos.saturating_add(len) > prepared.text_len_chars() {
return Vec::new();
}
for offset in 0..len {
let Some(left) = prepared.char_at(group.0 + offset) else {
return Vec::new();
};
let Some(right) = prepared.char_at(state.pos + offset) else {
return Vec::new();
};
if !chars_equal(left, right, opts.ignore_case, opts.unicode_mode) {
return Vec::new();
}
}
let mut next = state.clone();
next.pos += len;
vec![next]
}
Self::NamedBackReference(_) => Vec::new(),
Self::DecimalEscape(_) => Vec::new(),
Self::Quantifier {
expr,
min,
max,
greedy,
} => {
let capture_slots = capture_slots_in_expr(expr);
let mut base = state.clone();
clear_capture_slots(&mut base.captures, &capture_slots);
let mut levels = vec![vec![base]];
let limit = max.unwrap_or(usize::MAX);
while levels.len() - 1 < limit {
let prev = levels.last().expect("levels always has one element");
let mut next = Vec::new();
let mut saw_progress = false;
for candidate in prev {
let mut seeded = candidate.clone();
clear_capture_slots(&mut seeded.captures, &capture_slots);
for matched in expr.match_states(prepared, &seeded, opts) {
if matched.pos != candidate.pos {
saw_progress = true;
}
next.push(matched);
}
}
if next.is_empty() {
break;
}
if !saw_progress {
let repeated = next.clone();
levels.push(next);
let target = match max {
Some(max) if *max == *min => *min,
Some(max) => min.saturating_add(1).min(*max),
None => min.saturating_add(1),
};
while levels.len() - 1 < target {
levels.push(repeated.clone());
}
break;
}
levels.push(next);
}
let available_max = levels.len().saturating_sub(1);
if *min > available_max {
return Vec::new();
}
let mut out = Vec::new();
if *greedy {
for count in (*min..=available_max).rev() {
out.extend(levels[count].iter().cloned());
}
} else {
for count in *min..=available_max {
out.extend(levels[count].iter().cloned());
}
}
out
}
}
}
fn match_states_backward(
&self,
prepared: &PreparedInput<'_>,
state: &MatchState,
opts: &MatchOptions,
) -> Vec<MatchState> {
match self {
Self::Empty => vec![state.clone()],
Self::Sequence(parts) => {
let mut states = vec![state.clone()];
for part in parts.iter().rev() {
let mut next = Vec::new();
for candidate in &states {
next.extend(part.match_states_backward(prepared, candidate, opts));
}
if next.is_empty() {
return Vec::new();
}
states = next;
}
states
}
Self::Alternation(branches) => {
let mut out = Vec::new();
for branch in branches {
out.extend(branch.match_states_backward(prepared, state, opts));
}
out
}
Self::Literal(expected) => {
if opts.unicode_mode {
let Some((actual, width)) = prepared.unicode_char_before(state.pos) else {
return Vec::new();
};
if chars_equal(actual, *expected, opts.ignore_case, opts.unicode_mode) {
let Some(prev_pos) = state.pos.checked_sub(width) else {
return Vec::new();
};
let mut next = state.clone();
next.pos = prev_pos;
vec![next]
} else {
Vec::new()
}
} else {
let Some(prev_pos) = state.pos.checked_sub(1) else {
return Vec::new();
};
let Some(actual) = prepared.char_at(prev_pos) else {
return Vec::new();
};
if chars_equal(actual, *expected, opts.ignore_case, opts.unicode_mode) {
let mut next = state.clone();
next.pos = prev_pos;
vec![next]
} else {
Vec::new()
}
}
}
Self::Dot => {
if opts.unicode_mode {
let Some((actual, width)) = prepared.unicode_char_before(state.pos) else {
return Vec::new();
};
if opts.dot_matches_new_line || !is_line_terminator(actual) {
let Some(prev_pos) = state.pos.checked_sub(width) else {
return Vec::new();
};
let mut next = state.clone();
next.pos = prev_pos;
vec![next]
} else {
Vec::new()
}
} else {
let Some(prev_pos) = state.pos.checked_sub(1) else {
return Vec::new();
};
let Some(actual) = prepared.char_at(prev_pos) else {
return Vec::new();
};
if opts.dot_matches_new_line || !is_line_terminator(actual) {
let mut next = state.clone();
next.pos = prev_pos;
vec![next]
} else {
Vec::new()
}
}
}
Self::StartAnchor => {
let at_start = state.pos == 0;
let at_line_start = opts.multi_line
&& state
.pos
.checked_sub(1)
.and_then(|idx| prepared.char_at(idx))
.is_some_and(is_line_terminator);
if at_start || at_line_start {
vec![state.clone()]
} else {
Vec::new()
}
}
Self::EndAnchor => {
let at_end = state.pos == prepared.text_len_chars();
let at_line_end =
opts.multi_line && prepared.char_at(state.pos).is_some_and(is_line_terminator);
if at_end || at_line_end {
vec![state.clone()]
} else {
Vec::new()
}
}
Self::WordBoundary(expected) => {
if prepared.is_word_boundary(state.pos, opts.ignore_case, opts.unicode_mode)
== *expected
{
vec![state.clone()]
} else {
Vec::new()
}
}
Self::CharClass(class) => {
let starts =
class.match_starts(prepared, state.pos, opts.ignore_case, opts.unicode_mode);
if starts.is_empty() {
Vec::new()
} else {
let mut out = Vec::with_capacity(starts.len());
for start in starts {
let mut next = state.clone();
next.pos = start;
out.push(next);
}
out
}
}
Self::UnicodeStringProperty(property) => {
let starts = unicode_string_property_match_starts(property, prepared, state.pos);
if starts.is_empty() {
Vec::new()
} else {
let mut out = Vec::with_capacity(starts.len());
for start in starts {
let mut next = state.clone();
next.pos = start;
out.push(next);
}
out
}
}
Self::ClassSetOperation {
operands,
operators,
negated,
} => {
let starts = class_set_operation_match_starts(
prepared,
state.pos,
operands,
operators,
*negated,
opts.ignore_case,
opts.unicode_mode,
);
if starts.is_empty() {
Vec::new()
} else {
let mut out = Vec::with_capacity(starts.len());
for start in starts {
let mut next = state.clone();
next.pos = start;
out.push(next);
}
out
}
}
Self::Group { index, expr } => {
let end = state.pos;
let states = expr.match_states_backward(prepared, state, opts);
let mut out = Vec::with_capacity(states.len());
for mut candidate in states {
if let Some(idx) = index {
if let Some(slot) = candidate.captures.get_mut(*idx) {
*slot = Some((candidate.pos, end));
}
}
out.push(candidate);
}
out
}
Self::FlagModifierGroup {
expr,
ignore_case,
multi_line,
dot_matches_new_line,
} => {
let mut local_opts = *opts;
if let Some(value) = ignore_case {
local_opts.ignore_case = *value;
}
if let Some(value) = multi_line {
local_opts.multi_line = *value;
}
if let Some(value) = dot_matches_new_line {
local_opts.dot_matches_new_line = *value;
}
expr.match_states_backward(prepared, state, &local_opts)
}
Self::LookAhead { positive, expr } => {
let inner = expr.match_states(prepared, state, opts);
if *positive {
let Some(mut matched) = inner.into_iter().next() else {
return Vec::new();
};
matched.pos = state.pos;
vec![matched]
} else if inner.is_empty() {
vec![state.clone()]
} else {
Vec::new()
}
}
Self::LookBehind { positive, expr } => {
let inner = expr.match_states_backward(prepared, state, opts);
if *positive {
let Some(mut matched) = inner.into_iter().next() else {
return Vec::new();
};
matched.pos = state.pos;
vec![matched]
} else if inner.is_empty() {
vec![state.clone()]
} else {
Vec::new()
}
}
Self::BackReference(index) => {
let Some(group) = state.captures.get(*index).copied().flatten() else {
return vec![state.clone()];
};
let len = group.1.saturating_sub(group.0);
if state.pos < len {
return Vec::new();
}
let start = state.pos - len;
for offset in 0..len {
let Some(left) = prepared.char_at(group.0 + offset) else {
return Vec::new();
};
let Some(right) = prepared.char_at(start + offset) else {
return Vec::new();
};
if !chars_equal(left, right, opts.ignore_case, opts.unicode_mode) {
return Vec::new();
}
}
let mut next = state.clone();
next.pos = start;
vec![next]
}
Self::BackReferenceAny(indices) => {
let Some(group) = indices
.iter()
.filter_map(|idx| state.captures.get(*idx).copied().flatten())
.next()
else {
return vec![state.clone()];
};
let len = group.1.saturating_sub(group.0);
if state.pos < len {
return Vec::new();
}
let start = state.pos - len;
for offset in 0..len {
let Some(left) = prepared.char_at(group.0 + offset) else {
return Vec::new();
};
let Some(right) = prepared.char_at(start + offset) else {
return Vec::new();
};
if !chars_equal(left, right, opts.ignore_case, opts.unicode_mode) {
return Vec::new();
}
}
let mut next = state.clone();
next.pos = start;
vec![next]
}
Self::NamedBackReference(_) => Vec::new(),
Self::DecimalEscape(_) => Vec::new(),
Self::Quantifier {
expr,
min,
max,
greedy,
} => {
let capture_slots = capture_slots_in_expr(expr);
let mut base = state.clone();
clear_capture_slots(&mut base.captures, &capture_slots);
let mut levels = vec![vec![base]];
let limit = max.unwrap_or(usize::MAX);
while levels.len() - 1 < limit {
let prev = levels.last().expect("levels always has one element");
let mut next = Vec::new();
let mut saw_progress = false;
for candidate in prev {
let mut seeded = candidate.clone();
clear_capture_slots(&mut seeded.captures, &capture_slots);
for matched in expr.match_states_backward(prepared, &seeded, opts) {
if matched.pos != candidate.pos {
saw_progress = true;
}
next.push(matched);
}
}
if next.is_empty() {
break;
}
if !saw_progress {
let repeated = next.clone();
levels.push(next);
let target = match max {
Some(max) if *max == *min => *min,
Some(max) => min.saturating_add(1).min(*max),
None => min.saturating_add(1),
};
while levels.len() - 1 < target {
levels.push(repeated.clone());
}
break;
}
levels.push(next);
}
let available_max = levels.len().saturating_sub(1);
if *min > available_max {
return Vec::new();
}
let mut out = Vec::new();
if *greedy {
for count in (*min..=available_max).rev() {
out.extend(levels[count].iter().cloned());
}
} else {
for count in *min..=available_max {
out.extend(levels[count].iter().cloned());
}
}
out
}
}
}
}
#[derive(Debug, Clone)]
struct CharClass {
negated: bool,
items: Vec<ClassItem>,
}
impl CharClass {
fn matches(&self, ch: char, ignore_case: bool, unicode_mode: bool) -> bool {
let matched = self
.items
.iter()
.any(|item| item.matches(ch, ignore_case, unicode_mode));
if self.negated { !matched } else { matched }
}
fn match_ends(
&self,
prepared: &PreparedInput<'_>,
pos: usize,
ignore_case: bool,
unicode_mode: bool,
) -> Vec<usize> {
if self.negated {
let (actual, width) = if unicode_mode {
let Some((actual, width)) = prepared.unicode_char_at(pos) else {
return Vec::new();
};
(actual, width)
} else {
let Some(actual) = prepared.char_at(pos) else {
return Vec::new();
};
(actual, 1)
};
let matched = self
.items
.iter()
.any(|item| item.matches(actual, ignore_case, unicode_mode));
if matched {
Vec::new()
} else {
vec![pos + width]
}
} else {
let mut out = Vec::new();
for item in &self.items {
out.extend(item.match_ends(prepared, pos, ignore_case, unicode_mode));
}
dedupe_sorted_desc(out)
}
}
fn match_starts(
&self,
prepared: &PreparedInput<'_>,
end: usize,
ignore_case: bool,
unicode_mode: bool,
) -> Vec<usize> {
if self.negated {
let (start, actual) = if unicode_mode {
let Some((actual, width)) = prepared.unicode_char_before(end) else {
return Vec::new();
};
let Some(start) = end.checked_sub(width) else {
return Vec::new();
};
(start, actual)
} else {
let Some(start) = end.checked_sub(1) else {
return Vec::new();
};
let Some(actual) = prepared.char_at(start) else {
return Vec::new();
};
(start, actual)
};
let matched = self
.items
.iter()
.any(|item| item.matches(actual, ignore_case, unicode_mode));
if matched { Vec::new() } else { vec![start] }
} else {
let mut out = Vec::new();
for item in &self.items {
out.extend(item.match_starts(prepared, end, ignore_case, unicode_mode));
}
dedupe_sorted_asc(out)
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ClassSetOperator {
Intersection,
Difference,
}
#[derive(Debug, Clone)]
enum ClassItem {
Char(char),
Range(char, char),
NestedExpression(Box<Expr>),
StringDisjunction(Vec<String>),
Digit,
NotDigit,
Word,
NotWord,
Space,
NotSpace,
UnicodeProperty {
property: UnicodeProperty,
negated: bool,
},
UnicodeStringProperty(UnicodeStringProperty),
}
impl ClassItem {
fn matches(&self, ch: char, ignore_case: bool, unicode_mode: bool) -> bool {
match self {
Self::Char(expected) => chars_equal(ch, *expected, ignore_case, unicode_mode),
Self::Range(start, end) => char_in_range(ch, *start, *end, ignore_case, unicode_mode),
Self::NestedExpression(expr) => {
expr_matches_single_char(expr, ch, ignore_case, unicode_mode)
}
Self::StringDisjunction(alternatives) => alternatives.iter().any(|value| {
single_char_string(value)
.is_some_and(|expected| chars_equal(ch, expected, ignore_case, unicode_mode))
}),
Self::Digit => ch.is_ascii_digit(),
Self::NotDigit => !ch.is_ascii_digit(),
Self::Word => is_word_char_regex(ch, ignore_case, unicode_mode),
Self::NotWord => !is_word_char_regex(ch, ignore_case, unicode_mode),
Self::Space => is_space_char(ch),
Self::NotSpace => !is_space_char(ch),
Self::UnicodeProperty { property, negated } => {
let matched = property.matches(ch);
if *negated { !matched } else { matched }
}
Self::UnicodeStringProperty(_) => false,
}
}
fn match_ends(
&self,
prepared: &PreparedInput<'_>,
pos: usize,
ignore_case: bool,
unicode_mode: bool,
) -> Vec<usize> {
if let Self::NestedExpression(expr) = self {
return expr_match_ends(expr, prepared, pos, ignore_case, unicode_mode);
}
if let Self::StringDisjunction(alternatives) = self {
return alternatives
.iter()
.filter_map(|candidate| {
match_literal_string_at(prepared, pos, candidate, ignore_case, unicode_mode)
})
.collect::<Vec<_>>();
}
if let Self::UnicodeStringProperty(property) = self {
return property
.match_at(prepared, pos)
.into_iter()
.collect::<Vec<_>>();
}
let Some((actual, width)) = (if unicode_mode {
prepared.unicode_char_at(pos)
} else {
prepared.char_at(pos).map(|ch| (ch, 1))
}) else {
return Vec::new();
};
let matched = match self {
Self::Char(expected) => chars_equal(actual, *expected, ignore_case, unicode_mode),
Self::Range(start, end) => {
char_in_range(actual, *start, *end, ignore_case, unicode_mode)
}
Self::Digit => actual.is_ascii_digit(),
Self::NotDigit => !actual.is_ascii_digit(),
Self::Word => is_word_char_regex(actual, ignore_case, unicode_mode),
Self::NotWord => !is_word_char_regex(actual, ignore_case, unicode_mode),
Self::Space => is_space_char(actual),
Self::NotSpace => !is_space_char(actual),
Self::UnicodeProperty { property, negated } => {
let matched = property.matches(actual);
if *negated { !matched } else { matched }
}
Self::NestedExpression(_)
| Self::StringDisjunction(_)
| Self::UnicodeStringProperty(_) => unreachable!("handled above"),
};
if matched {
vec![pos + width]
} else {
Vec::new()
}
}
fn match_starts(
&self,
prepared: &PreparedInput<'_>,
end: usize,
ignore_case: bool,
unicode_mode: bool,
) -> Vec<usize> {
let mut out = Vec::new();
for start in 0..=end {
if self
.match_ends(prepared, start, ignore_case, unicode_mode)
.iter()
.any(|candidate_end| *candidate_end == end)
{
out.push(start);
}
}
dedupe_sorted_asc(out)
}
}
#[derive(Debug, Clone, Copy)]
enum UnicodeProperty {
Letter,
Number,
DecimalNumber,
UppercaseLetter,
LowercaseLetter,
Ascii,
Any,
ScriptLatin,
ScriptGreek,
}
impl UnicodeProperty {
fn matches(&self, ch: char) -> bool {
match self {
Self::Letter => ch.is_alphabetic(),
Self::Number => ch.is_numeric(),
Self::DecimalNumber => char_in_ranges(ch, UNICODE_DECIMAL_NUMBER_RANGES),
Self::UppercaseLetter => ch.is_uppercase(),
Self::LowercaseLetter => ch.is_lowercase(),
Self::Ascii => ch.is_ascii(),
Self::Any => true,
Self::ScriptLatin => char_in_ranges(ch, UNICODE_SCRIPT_LATIN_RANGES),
Self::ScriptGreek => char_in_ranges(ch, UNICODE_SCRIPT_GREEK_RANGES),
}
}
}
#[derive(Debug, Clone, Copy)]
enum UnicodeStringProperty {
RgiEmoji,
}
impl UnicodeStringProperty {
fn match_at(&self, prepared: &PreparedInput<'_>, pos: usize) -> Option<usize> {
match self {
Self::RgiEmoji => match_rgi_emoji_at(prepared, pos),
}
}
}
#[derive(Debug, Clone, Copy)]
enum UnicodePropertySpec {
CodePoint(UnicodeProperty),
String(UnicodeStringProperty),
}
struct Parser {
chars: Vec<char>,
i: usize,
unicode_mode: bool,
unicode_sets_mode: bool,
parse_named_backreference_syntax: bool,
capture_count: usize,
named_captures: HashMap<String, Vec<usize>>,
named_capture_order: Vec<String>,
capture_names: HashMap<usize, String>,
}
impl Parser {
fn new(pattern: &str, unicode_mode: bool, unicode_sets_mode: bool) -> Self {
Self {
chars: pattern.chars().collect(),
i: 0,
unicode_mode,
unicode_sets_mode,
parse_named_backreference_syntax: true,
capture_count: 0,
named_captures: HashMap::new(),
named_capture_order: Vec::new(),
capture_names: HashMap::new(),
}
}
fn parse(&mut self) -> Result<Expr, RegexError> {
let mut expr = self.parse_alternation()?;
if self.i < self.chars.len() {
return Err(RegexError::new("invalid regular expression syntax"));
}
self.validate_named_capture_duplicates(&expr)?;
self.resolve_post_parse_escapes(&mut expr)?;
self.resolve_decimal_escapes(&mut expr)?;
Ok(expr)
}
fn parse_alternation(&mut self) -> Result<Expr, RegexError> {
let mut branches = vec![self.parse_sequence()?];
while self.peek() == Some('|') {
self.i += 1;
branches.push(self.parse_sequence()?);
}
if branches.len() == 1 {
Ok(branches.pop().unwrap_or(Expr::Empty))
} else {
Ok(Expr::Alternation(branches))
}
}
fn parse_sequence(&mut self) -> Result<Expr, RegexError> {
let mut items = Vec::new();
while let Some(ch) = self.peek() {
if ch == ')' || ch == '|' {
break;
}
let atom = self.parse_atom()?;
let quantified = self.parse_quantifier(atom)?;
items.push(quantified);
}
if items.is_empty() {
Ok(Expr::Empty)
} else if items.len() == 1 {
Ok(items.pop().unwrap_or(Expr::Empty))
} else {
Ok(Expr::Sequence(items))
}
}
fn parse_atom(&mut self) -> Result<Expr, RegexError> {
let Some(ch) = self.next() else {
return Err(RegexError::new("unexpected end of regular expression"));
};
match ch {
'^' => Ok(Expr::StartAnchor),
'$' => Ok(Expr::EndAnchor),
'.' => Ok(Expr::Dot),
'(' => self.parse_group(),
'[' => self.parse_char_class(),
'*' | '+' | '?' => Err(RegexError::new("nothing to repeat")),
'{' => {
if self.unicode_mode {
Err(RegexError::new("invalid quantifier syntax"))
} else if self.looks_like_braced_quantifier(self.i - 1) {
Err(RegexError::new("nothing to repeat"))
} else {
Ok(Expr::Literal('{'))
}
}
'}' => {
if self.unicode_mode {
Err(RegexError::new("invalid quantifier syntax"))
} else {
Ok(Expr::Literal('}'))
}
}
'\\' => self.parse_escape(false),
_ => Ok(Expr::Literal(ch)),
}
}
fn parse_group(&mut self) -> Result<Expr, RegexError> {
if self.peek() == Some('?') {
self.i += 1;
let Some(ch) = self.next() else {
return Err(RegexError::new("unsupported group syntax"));
};
match ch {
':' => {
let expr = self.parse_alternation()?;
self.expect(')')?;
Ok(Expr::Group {
index: None,
expr: Box::new(expr),
})
}
'=' => {
let expr = self.parse_alternation()?;
self.expect(')')?;
Ok(Expr::LookAhead {
positive: true,
expr: Box::new(expr),
})
}
'!' => {
let expr = self.parse_alternation()?;
self.expect(')')?;
Ok(Expr::LookAhead {
positive: false,
expr: Box::new(expr),
})
}
'<' => {
if self.peek() == Some('=') {
self.i += 1;
let expr = self.parse_alternation()?;
self.expect(')')?;
Ok(Expr::LookBehind {
positive: true,
expr: Box::new(expr),
})
} else if self.peek() == Some('!') {
self.i += 1;
let expr = self.parse_alternation()?;
self.expect(')')?;
Ok(Expr::LookBehind {
positive: false,
expr: Box::new(expr),
})
} else {
let name = self.parse_group_name()?;
self.capture_count += 1;
let index = self.capture_count;
if !self.named_captures.contains_key(&name) {
self.named_capture_order.push(name.clone());
}
self.named_captures
.entry(name.clone())
.or_default()
.push(index);
self.capture_names.insert(index, name);
let expr = self.parse_alternation()?;
self.expect(')')?;
Ok(Expr::Group {
index: Some(index),
expr: Box::new(expr),
})
}
}
'i' | 'm' | 's' | '-' => self.parse_flag_modifier_group(ch),
_ => Err(RegexError::new("unsupported group syntax")),
}
} else {
self.capture_count += 1;
let index = self.capture_count;
let expr = self.parse_alternation()?;
self.expect(')')?;
Ok(Expr::Group {
index: Some(index),
expr: Box::new(expr),
})
}
}
fn parse_flag_modifier_group(&mut self, first: char) -> Result<Expr, RegexError> {
let mut enabling = HashSet::new();
let mut disabling = HashSet::new();
let mut in_disabling = false;
let mut current = first;
loop {
match current {
'i' | 'm' | 's' => {
if in_disabling {
if !disabling.insert(current) || enabling.contains(¤t) {
return Err(RegexError::new("duplicate flag in modifier group"));
}
} else if !enabling.insert(current) || disabling.contains(¤t) {
return Err(RegexError::new("duplicate flag in modifier group"));
}
}
'-' => {
if in_disabling {
return Err(RegexError::new("invalid modifier group flags"));
}
in_disabling = true;
}
':' => break,
_ => return Err(RegexError::new("unsupported group syntax")),
}
let Some(next) = self.next() else {
return Err(RegexError::new("unsupported group syntax"));
};
current = next;
}
if enabling.is_empty() && disabling.is_empty() {
return Err(RegexError::new("invalid modifier group flags"));
}
let expr = self.parse_alternation()?;
self.expect(')')?;
Ok(Expr::FlagModifierGroup {
expr: Box::new(expr),
ignore_case: flag_modifier_value('i', &enabling, &disabling),
multi_line: flag_modifier_value('m', &enabling, &disabling),
dot_matches_new_line: flag_modifier_value('s', &enabling, &disabling),
})
}
fn parse_quantifier(&mut self, atom: Expr) -> Result<Expr, RegexError> {
let Some(ch) = self.peek() else {
return Ok(atom);
};
let quant = match ch {
'*' => {
self.i += 1;
Some((0usize, None))
}
'+' => {
self.i += 1;
Some((1usize, None))
}
'?' => {
self.i += 1;
Some((0usize, Some(1usize)))
}
'{' => {
let checkpoint = self.i;
match self.parse_braced_quantifier() {
Ok(quant) => Some(quant),
Err(err) if !self.unicode_mode && err.message != "invalid quantifier range" => {
self.i = checkpoint;
None
}
Err(err) => return Err(err),
}
}
_ => None,
};
let Some((min, max)) = quant else {
return Ok(atom);
};
if !is_quantifier_target_supported(&atom) {
return Err(RegexError::new("nothing to repeat"));
}
if self.unicode_mode && matches!(atom, Expr::LookAhead { .. }) {
return Err(RegexError::new("invalid quantifier"));
}
let greedy = if self.peek() == Some('?') {
self.i += 1;
false
} else {
true
};
Ok(Expr::Quantifier {
expr: Box::new(atom),
min,
max,
greedy,
})
}
fn parse_braced_quantifier(&mut self) -> Result<(usize, Option<usize>), RegexError> {
self.expect('{')?;
let min = self.parse_usize()?;
match self.peek() {
Some('}') => {
self.i += 1;
Ok((min, Some(min)))
}
Some(',') => {
self.i += 1;
if self.peek() == Some('}') {
self.i += 1;
Ok((min, None))
} else {
let max = self.parse_usize()?;
if max < min {
return Err(RegexError::new("invalid quantifier range"));
}
self.expect('}')?;
Ok((min, Some(max)))
}
}
_ => Err(RegexError::new("invalid quantifier syntax")),
}
}
fn parse_char_class(&mut self) -> Result<Expr, RegexError> {
let mut negated = false;
if self.peek() == Some('^') {
negated = true;
self.i += 1;
}
let mut operands: Vec<Vec<ClassItem>> = vec![Vec::new()];
let mut operators: Vec<ClassSetOperator> = Vec::new();
while let Some(ch) = self.peek() {
if ch == ']' {
self.i += 1;
if operators.is_empty() {
let items = operands
.pop()
.expect("character class always starts with one operand");
return self.build_class_expr_from_items(negated, items);
}
if operands.len() != operators.len() + 1 || operands.iter().any(Vec::is_empty) {
return Err(RegexError::new("invalid set operation"));
}
let has_unicode_string = operands
.iter()
.flatten()
.any(class_item_contains_unicode_string_property);
let has_non_scalar_string = operands
.iter()
.flatten()
.any(class_item_contains_non_scalar_string);
if negated && (has_unicode_string || has_non_scalar_string) {
return Err(RegexError::new("unsupported unicode set syntax"));
}
if operators.windows(2).any(|pair| pair[0] != pair[1]) {
return Err(RegexError::new("invalid set operation"));
}
if operands.iter().any(|operand| {
operand.len() != 1
|| matches!(operand.first(), Some(ClassItem::Range(_, _)) | None)
}) {
return Err(RegexError::new("invalid set operation"));
}
let operand_classes = operands
.into_iter()
.map(|items| CharClass {
negated: false,
items,
})
.collect::<Vec<_>>();
return Ok(Expr::ClassSetOperation {
operands: operand_classes,
operators,
negated,
});
}
if self.unicode_sets_mode {
if self.peek() == Some('&') && self.peek_next() == Some('&') {
if operands.last().map_or(true, Vec::is_empty) {
return Err(RegexError::new("invalid set operation"));
}
self.i += 2;
operators.push(ClassSetOperator::Intersection);
operands.push(Vec::new());
continue;
}
if self.peek() == Some('-') && self.peek_next() == Some('-') {
if operands.last().map_or(true, Vec::is_empty) {
return Err(RegexError::new("invalid set operation"));
}
self.i += 2;
operators.push(ClassSetOperator::Difference);
operands.push(Vec::new());
continue;
}
}
let item = self.parse_class_item()?;
if self.peek() == Some('-')
&& !(self.unicode_sets_mode && self.peek_next() == Some('-'))
{
let backup = self.i;
self.i += 1;
if self.peek() == Some(']') {
self.i = backup;
operands
.last_mut()
.expect("character class always has current operand")
.push(item);
continue;
}
let next_item = self.parse_class_item()?;
match (item, next_item) {
(ClassItem::Char(start), ClassItem::Char(end)) => {
if (start as u32) > (end as u32) {
return Err(RegexError::new("invalid character class range"));
}
operands
.last_mut()
.expect("character class always has current operand")
.push(ClassItem::Range(start, end));
}
_ => {
return Err(RegexError::new(
"character class range bounds must be literal characters",
));
}
}
} else {
operands
.last_mut()
.expect("character class always has current operand")
.push(item);
}
}
Err(RegexError::new("unterminated character class"))
}
fn parse_class_item(&mut self) -> Result<ClassItem, RegexError> {
let Some(ch) = self.next() else {
return Err(RegexError::new("unterminated character class"));
};
if ch != '\\' {
if self.unicode_sets_mode && ch == '[' {
let nested = self.parse_char_class()?;
return Ok(ClassItem::NestedExpression(Box::new(nested)));
}
return Ok(ClassItem::Char(ch));
}
let Some(escaped) = self.next() else {
return Err(RegexError::new("unterminated escape sequence"));
};
match escaped {
'd' => Ok(ClassItem::Digit),
'D' => Ok(ClassItem::NotDigit),
'w' => Ok(ClassItem::Word),
'W' => Ok(ClassItem::NotWord),
's' => Ok(ClassItem::Space),
'S' => Ok(ClassItem::NotSpace),
'c' => {
if self.peek().is_some_and(|next| next.is_ascii_alphabetic()) {
let next = self.next().expect("peek guaranteed a character");
Ok(ClassItem::Char(control_escape_char(next)))
} else if self
.peek()
.is_some_and(is_legacy_class_control_escape_suffix)
{
if self.unicode_mode {
return Err(RegexError::new("invalid escape sequence"));
}
let next = self.next().expect("peek guaranteed a character");
Ok(ClassItem::Char(control_escape_char_extended(next)))
} else if self.unicode_mode {
Err(RegexError::new("invalid escape sequence"))
} else {
self.i = self.i.saturating_sub(1);
Ok(ClassItem::Char('\\'))
}
}
'n' => Ok(ClassItem::Char('\n')),
'r' => Ok(ClassItem::Char('\r')),
't' => Ok(ClassItem::Char('\t')),
'v' => Ok(ClassItem::Char('\u{000B}')),
'f' => Ok(ClassItem::Char('\u{000C}')),
'b' => Ok(ClassItem::Char('\u{0008}')),
'0' => {
if self.peek().is_some_and(|next| next.is_ascii_digit()) {
if self.unicode_mode {
return Err(RegexError::new("invalid numeric backreference"));
}
if self.peek().is_some_and(|next| matches!(next, '0'..='7')) {
Ok(ClassItem::Char(self.parse_legacy_octal_escape_char('0')))
} else {
Ok(ClassItem::Char('\0'))
}
} else {
Ok(ClassItem::Char('\0'))
}
}
'1'..='9' => {
if self.unicode_mode {
return Err(RegexError::new("invalid numeric backreference"));
}
if escaped == '8' || escaped == '9' {
Ok(ClassItem::Char(escaped))
} else {
Ok(ClassItem::Char(
self.parse_legacy_octal_escape_char(escaped),
))
}
}
'x' => {
if self.peek_is_n_hex_digits(2) {
Ok(ClassItem::Char(self.parse_hex_char(2)?))
} else if self.unicode_mode {
Err(RegexError::new("invalid hexadecimal escape sequence"))
} else {
Ok(ClassItem::Char('x'))
}
}
'u' => {
if self.unicode_mode {
Ok(ClassItem::Char(self.parse_unicode_escape()?))
} else if self.peek_is_n_hex_digits(4) {
Ok(ClassItem::Char(self.parse_unicode_escape_fixed4()?))
} else {
Ok(ClassItem::Char('u'))
}
}
'p' | 'P' => {
if !self.unicode_mode {
Ok(ClassItem::Char(escaped))
} else {
match self.parse_unicode_property_escape()? {
UnicodePropertySpec::CodePoint(property) => {
Ok(ClassItem::UnicodeProperty {
property,
negated: escaped == 'P',
})
}
UnicodePropertySpec::String(property) => {
if escaped == 'P' {
return Err(RegexError::new("invalid unicode property escape"));
}
Ok(ClassItem::UnicodeStringProperty(property))
}
}
}
}
'q' => {
if !self.unicode_sets_mode {
return Err(RegexError::new("invalid escape sequence"));
}
Ok(ClassItem::StringDisjunction(
self.parse_class_string_disjunction()?,
))
}
other => {
if self.unicode_mode {
if is_unicode_identity_escape_in_class(other) {
Ok(ClassItem::Char(other))
} else {
Err(RegexError::new("invalid escape sequence"))
}
} else {
Ok(ClassItem::Char(other))
}
}
}
}
fn parse_class_string_disjunction(&mut self) -> Result<Vec<String>, RegexError> {
if self.next() != Some('{') {
return Err(RegexError::new("invalid escape sequence"));
}
let mut alternatives = vec![String::new()];
loop {
let Some(ch) = self.peek() else {
return Err(RegexError::new("unterminated class string disjunction"));
};
if ch == '}' {
self.i += 1;
return Ok(alternatives);
}
if ch == '|' {
self.i += 1;
alternatives.push(String::new());
continue;
}
let value = if ch == '\\' {
self.i += 1;
self.parse_class_string_escape()?
} else {
self.i += 1;
ch
};
alternatives
.last_mut()
.expect("class string disjunction always has one alternative")
.push(value);
}
}
fn parse_class_string_escape(&mut self) -> Result<char, RegexError> {
let Some(escaped) = self.next() else {
return Err(RegexError::new("unterminated escape sequence"));
};
match escaped {
'c' => {
if self.peek().is_some_and(|next| next.is_ascii_alphabetic()) {
let next = self.next().expect("peek guaranteed a character");
Ok(control_escape_char(next))
} else {
Err(RegexError::new("invalid escape sequence"))
}
}
'n' => Ok('\n'),
'r' => Ok('\r'),
't' => Ok('\t'),
'v' => Ok('\u{000B}'),
'f' => Ok('\u{000C}'),
'b' => Ok('\u{0008}'),
'0' => {
if self.peek().is_some_and(|next| next.is_ascii_digit()) {
Err(RegexError::new("invalid decimal escape"))
} else {
Ok('\0')
}
}
'1'..='9' => Err(RegexError::new("invalid decimal escape")),
'x' => {
if self.peek_is_n_hex_digits(2) {
self.parse_hex_char(2)
} else {
Err(RegexError::new("invalid hexadecimal escape sequence"))
}
}
'u' => self.parse_unicode_escape(),
other => {
if is_class_string_identity_escape(other) {
Ok(other)
} else {
Err(RegexError::new("invalid escape sequence"))
}
}
}
}
fn parse_escape(&mut self, _in_class: bool) -> Result<Expr, RegexError> {
let Some(ch) = self.next() else {
return Err(RegexError::new("unterminated escape sequence"));
};
match ch {
'd' => Ok(Expr::CharClass(CharClass {
negated: false,
items: vec![ClassItem::Digit],
})),
'D' => Ok(Expr::CharClass(CharClass {
negated: false,
items: vec![ClassItem::NotDigit],
})),
'w' => Ok(Expr::CharClass(CharClass {
negated: false,
items: vec![ClassItem::Word],
})),
'W' => Ok(Expr::CharClass(CharClass {
negated: false,
items: vec![ClassItem::NotWord],
})),
's' => Ok(Expr::CharClass(CharClass {
negated: false,
items: vec![ClassItem::Space],
})),
'S' => Ok(Expr::CharClass(CharClass {
negated: false,
items: vec![ClassItem::NotSpace],
})),
'b' => Ok(Expr::WordBoundary(true)),
'B' => Ok(Expr::WordBoundary(false)),
'n' => Ok(Expr::Literal('\n')),
'r' => Ok(Expr::Literal('\r')),
't' => Ok(Expr::Literal('\t')),
'v' => Ok(Expr::Literal('\u{000B}')),
'f' => Ok(Expr::Literal('\u{000C}')),
'c' => {
if self.peek().is_some_and(|next| next.is_ascii_alphabetic()) {
let next = self.next().expect("peek guaranteed a character");
Ok(Expr::Literal(control_escape_char(next)))
} else if self.unicode_mode {
Err(RegexError::new("invalid escape sequence"))
} else {
self.i = self.i.saturating_sub(1);
Ok(Expr::Literal('\\'))
}
}
'0' => {
if self.peek().is_some_and(|next| next.is_ascii_digit()) {
if self.unicode_mode {
return Err(RegexError::new("invalid numeric backreference"));
}
if self.peek().is_some_and(|next| matches!(next, '0'..='7')) {
Ok(Expr::Literal(self.parse_legacy_octal_escape_char('0')))
} else {
Ok(Expr::Literal('\0'))
}
} else {
Ok(Expr::Literal('\0'))
}
}
'x' => {
if self.peek_is_n_hex_digits(2) {
Ok(Expr::Literal(self.parse_hex_char(2)?))
} else if self.unicode_mode {
Err(RegexError::new("invalid hexadecimal escape sequence"))
} else {
Ok(Expr::Literal('x'))
}
}
'u' => {
if self.unicode_mode {
Ok(Expr::Literal(self.parse_unicode_escape()?))
} else if self.peek_is_n_hex_digits(4) {
Ok(Expr::Literal(self.parse_unicode_escape_fixed4()?))
} else {
Ok(Expr::Literal('u'))
}
}
'p' | 'P' => {
if !self.unicode_mode {
Ok(Expr::Literal(ch))
} else {
match self.parse_unicode_property_escape()? {
UnicodePropertySpec::CodePoint(property) => {
Ok(Expr::CharClass(CharClass {
negated: false,
items: vec![ClassItem::UnicodeProperty {
property,
negated: ch == 'P',
}],
}))
}
UnicodePropertySpec::String(property) => {
if ch == 'P' {
return Err(RegexError::new("invalid unicode property escape"));
}
Ok(Expr::UnicodeStringProperty(property))
}
}
}
}
'k' => {
if !self.parse_named_backreference_syntax {
return Ok(Expr::Literal('k'));
}
if self.peek() != Some('<') {
if self.unicode_mode {
return Err(RegexError::new("invalid named backreference syntax"));
}
return Ok(Expr::Literal('k'));
}
self.i += 1; let name = match self.parse_raw_group_name_until_gt() {
Some(name) => name,
None => {
if self.unicode_mode {
return Err(RegexError::new("invalid named backreference syntax"));
}
return Ok(Expr::Sequence(vec![Expr::Literal('k'), Expr::Literal('<')]));
}
};
Ok(Expr::NamedBackReference(name))
}
'1'..='9' => self.parse_numeric_backreference(ch),
other => {
if self.unicode_mode {
if is_unicode_identity_escape_outside_class(other) {
Ok(Expr::Literal(other))
} else {
Err(RegexError::new("invalid escape sequence"))
}
} else {
Ok(Expr::Literal(other))
}
}
}
}
fn parse_group_name(&mut self) -> Result<String, RegexError> {
let Some(raw) = self.parse_raw_group_name_until_gt() else {
return Err(RegexError::new("invalid capture group name"));
};
normalize_group_name(&raw).ok_or_else(|| RegexError::new("invalid capture group name"))
}
fn parse_raw_group_name_until_gt(&mut self) -> Option<String> {
let start = self.i;
let mut paren_depth = 0usize;
let mut in_class = false;
let mut escaped = false;
while let Some(ch) = self.peek() {
self.i += 1;
if escaped {
escaped = false;
continue;
}
if in_class {
match ch {
'\\' => escaped = true,
']' => in_class = false,
_ => {}
}
continue;
}
match ch {
'\\' => escaped = true,
'[' => in_class = true,
'(' => paren_depth = paren_depth.saturating_add(1),
')' => paren_depth = paren_depth.saturating_sub(1),
'>' if paren_depth == 0 => {
let name = self.chars[start..self.i - 1].iter().collect::<String>();
return Some(name);
}
_ => {}
}
}
None
}
fn parse_numeric_backreference(&mut self, first_digit: char) -> Result<Expr, RegexError> {
let mut raw = String::new();
raw.push(first_digit);
while self.peek().is_some_and(|ch| ch.is_ascii_digit()) {
raw.push(self.next().expect("peek guaranteed a digit"));
}
Ok(Expr::DecimalEscape(raw))
}
fn parse_unicode_escape(&mut self) -> Result<char, RegexError> {
if self.peek() == Some('{') {
self.i += 1;
let start = self.i;
while self.peek().is_some_and(|ch| ch.is_ascii_hexdigit()) {
self.i += 1;
}
if self.peek() != Some('}') || self.i == start {
return Err(RegexError::new("invalid Unicode escape sequence"));
}
let raw: String = self.chars[start..self.i].iter().collect();
self.i += 1;
let value = u32::from_str_radix(&raw, 16)
.map_err(|_| RegexError::new("invalid Unicode escape sequence"))?;
if value > 0x10FFFF {
return Err(RegexError::new("invalid Unicode escape sequence"));
}
unicode_escape_char_from_u32(value)
} else {
self.parse_unicode_escape_fixed4()
}
}
fn parse_unicode_escape_fixed4(&mut self) -> Result<char, RegexError> {
let start = self.i;
let end = start.saturating_add(4);
if end > self.chars.len() {
return Err(RegexError::new("invalid Unicode escape sequence"));
}
if !self.chars[start..end]
.iter()
.all(|ch| ch.is_ascii_hexdigit())
{
return Err(RegexError::new("invalid Unicode escape sequence"));
}
let raw: String = self.chars[start..end].iter().collect();
self.i = end;
let value = u32::from_str_radix(&raw, 16)
.map_err(|_| RegexError::new("invalid Unicode escape sequence"))?;
unicode_escape_char_from_u32(value)
}
fn parse_unicode_property_escape(&mut self) -> Result<UnicodePropertySpec, RegexError> {
if self.next() != Some('{') {
return Err(RegexError::new("invalid unicode property escape"));
}
let start = self.i;
while let Some(ch) = self.peek() {
if ch == '}' {
break;
}
self.i += 1;
}
if self.peek() != Some('}') || self.i == start {
return Err(RegexError::new("invalid unicode property escape"));
}
let raw: String = self.chars[start..self.i].iter().collect();
self.i += 1; parse_unicode_property_name(&raw, self.unicode_sets_mode)
.ok_or_else(|| RegexError::new("invalid unicode property escape"))
}
fn parse_hex_char(&mut self, digits: usize) -> Result<char, RegexError> {
let start = self.i;
let end = start.saturating_add(digits);
if end > self.chars.len() {
return Err(RegexError::new("invalid hexadecimal escape sequence"));
}
if !self.chars[start..end]
.iter()
.all(|ch| ch.is_ascii_hexdigit())
{
return Err(RegexError::new("invalid hexadecimal escape sequence"));
}
let raw: String = self.chars[start..end].iter().collect();
self.i = end;
let value = u32::from_str_radix(&raw, 16)
.map_err(|_| RegexError::new("invalid hexadecimal escape sequence"))?;
char::from_u32(value).ok_or_else(|| RegexError::new("invalid escape value"))
}
fn parse_usize(&mut self) -> Result<usize, RegexError> {
let start = self.i;
while self.peek().is_some_and(|ch| ch.is_ascii_digit()) {
self.i += 1;
}
if self.i == start {
return Err(RegexError::new("expected number in quantifier"));
}
let raw: String = self.chars[start..self.i].iter().collect();
raw.parse::<usize>()
.map_err(|_| RegexError::new("invalid numeric value in quantifier"))
}
fn peek_is_n_hex_digits(&self, digits: usize) -> bool {
let start = self.i;
let end = start.saturating_add(digits);
end <= self.chars.len()
&& self.chars[start..end]
.iter()
.all(|ch| ch.is_ascii_hexdigit())
}
fn build_class_expr_from_items(
&self,
negated: bool,
items: Vec<ClassItem>,
) -> Result<Expr, RegexError> {
if negated
&& (items
.iter()
.any(class_item_contains_unicode_string_property)
|| items.iter().any(class_item_contains_non_scalar_string))
{
return Err(RegexError::new("unsupported unicode set syntax"));
}
Ok(Expr::CharClass(CharClass { negated, items }))
}
fn expect(&mut self, ch: char) -> Result<(), RegexError> {
if self.next() == Some(ch) {
Ok(())
} else {
Err(RegexError::new("unterminated group or invalid syntax"))
}
}
fn validate_named_capture_duplicates(&self, expr: &Expr) -> Result<(), RegexError> {
self.named_capture_paths(expr).map(|_| ())
}
fn named_capture_paths(&self, expr: &Expr) -> Result<Vec<HashSet<String>>, RegexError> {
match expr {
Expr::Empty
| Expr::Literal(_)
| Expr::Dot
| Expr::StartAnchor
| Expr::EndAnchor
| Expr::WordBoundary(_)
| Expr::CharClass(_)
| Expr::UnicodeStringProperty(_)
| Expr::ClassSetOperation { .. }
| Expr::NamedBackReference(_)
| Expr::DecimalEscape(_)
| Expr::BackReference(_)
| Expr::BackReferenceAny(_) => Ok(vec![HashSet::new()]),
Expr::Sequence(parts) => {
let mut paths = vec![HashSet::new()];
for part in parts {
let part_paths = self.named_capture_paths(part)?;
let mut next = Vec::new();
for base in &paths {
for extension in &part_paths {
if let Some(duplicate) =
base.iter().find(|name| extension.contains(*name)).cloned()
{
return Err(RegexError::new(format!(
"duplicate capture group name: {duplicate}"
)));
}
let mut merged = base.clone();
merged.extend(extension.iter().cloned());
next.push(merged);
}
}
paths = dedupe_name_sets(next);
}
Ok(paths)
}
Expr::Alternation(branches) => {
let mut out = Vec::new();
for branch in branches {
out.extend(self.named_capture_paths(branch)?);
}
Ok(dedupe_name_sets(out))
}
Expr::Group { index, expr } => {
let mut paths = self.named_capture_paths(expr)?;
if let Some(name) = index.and_then(|idx| self.capture_names.get(&idx)) {
for path in &mut paths {
if path.contains(name) {
return Err(RegexError::new(format!(
"duplicate capture group name: {name}"
)));
}
path.insert(name.clone());
}
}
Ok(paths)
}
Expr::FlagModifierGroup { expr, .. }
| Expr::LookAhead { expr, .. }
| Expr::LookBehind { expr, .. }
| Expr::Quantifier { expr, .. } => self.named_capture_paths(expr),
}
}
fn resolve_post_parse_escapes(&mut self, expr: &mut Expr) -> Result<(), RegexError> {
match expr {
Expr::NamedBackReference(name) => {
let has_named_captures = !self.named_captures.is_empty();
if !self.unicode_mode && !has_named_captures {
*expr = self.parse_legacy_named_backreference_expr(name)?;
return Ok(());
}
let Some(normalized_name) = normalize_group_name(name) else {
return Err(RegexError::new("invalid capture group name"));
};
let Some(indices) = self.named_captures.get(&normalized_name).cloned() else {
return Err(RegexError::new(format!(
"unknown named backreference: {normalized_name}"
)));
};
if indices.len() == 1 {
*expr = Expr::BackReference(indices[0]);
} else {
*expr = Expr::BackReferenceAny(indices);
}
Ok(())
}
Expr::DecimalEscape(_) => Ok(()),
Expr::Sequence(parts) | Expr::Alternation(parts) => {
for part in parts {
self.resolve_post_parse_escapes(part)?;
}
Ok(())
}
Expr::Group { expr, .. }
| Expr::FlagModifierGroup { expr, .. }
| Expr::LookAhead { expr, .. }
| Expr::LookBehind { expr, .. }
| Expr::Quantifier { expr, .. } => self.resolve_post_parse_escapes(expr),
_ => Ok(()),
}
}
fn resolve_decimal_escapes(&self, expr: &mut Expr) -> Result<(), RegexError> {
match expr {
Expr::DecimalEscape(raw) => {
*expr = self.resolve_decimal_escape(raw)?;
Ok(())
}
Expr::Sequence(parts) | Expr::Alternation(parts) => {
for part in parts {
self.resolve_decimal_escapes(part)?;
}
Ok(())
}
Expr::Group { expr, .. }
| Expr::FlagModifierGroup { expr, .. }
| Expr::LookAhead { expr, .. }
| Expr::LookBehind { expr, .. }
| Expr::Quantifier { expr, .. } => self.resolve_decimal_escapes(expr),
_ => Ok(()),
}
}
fn resolve_decimal_escape(&self, raw: &str) -> Result<Expr, RegexError> {
let index = raw.parse::<usize>().ok();
if self.unicode_mode {
let Some(index) = index else {
return Err(RegexError::new("invalid numeric backreference"));
};
if index == 0 || index > self.capture_count {
return Err(RegexError::new("invalid numeric backreference"));
}
return Ok(Expr::BackReference(index));
}
if let Some(index) = index {
if index > 0 && index <= self.capture_count {
return Ok(Expr::BackReference(index));
}
}
Ok(Self::legacy_decimal_escape_expr(raw))
}
fn legacy_decimal_escape_expr(raw: &str) -> Expr {
let chars = raw.chars().collect::<Vec<_>>();
let Some(first) = chars.first().copied() else {
return Expr::Empty;
};
if first == '8' || first == '9' {
return Self::literal_sequence_from_chars(&chars);
}
let mut octal = String::new();
octal.push(first);
let mut used = 1usize;
while used < chars.len() && used < 3 && matches!(chars[used], '0'..='7') {
octal.push(chars[used]);
used += 1;
}
let value = u32::from_str_radix(&octal, 8).expect("legacy octal digits are valid");
let first_expr = char::from_u32(value)
.map(Expr::Literal)
.unwrap_or(Expr::Literal('\0'));
if used >= chars.len() {
first_expr
} else {
let mut parts = vec![first_expr];
parts.extend(chars[used..].iter().copied().map(Expr::Literal));
Expr::Sequence(parts)
}
}
fn literal_sequence_from_chars(chars: &[char]) -> Expr {
if chars.len() == 1 {
Expr::Literal(chars[0])
} else {
Expr::Sequence(chars.iter().copied().map(Expr::Literal).collect::<Vec<_>>())
}
}
fn parse_legacy_octal_escape_char(&mut self, first: char) -> char {
let mut raw = String::new();
raw.push(first);
for _ in 0..2 {
if self.peek().is_some_and(|ch| matches!(ch, '0'..='7')) {
raw.push(self.next().expect("peek guaranteed a digit"));
} else {
break;
}
}
let value = u32::from_str_radix(&raw, 8).expect("legacy octal digits are valid");
char::from_u32(value).unwrap_or('\0')
}
fn literal_named_backreference_expr(name: &str) -> Expr {
let expanded = legacy_named_backreference_name_chars(name);
let mut chars = Vec::with_capacity(expanded.len() + 3);
chars.push('k');
chars.push('<');
chars.extend(expanded);
chars.push('>');
Self::literal_sequence_from_chars(&chars)
}
fn parse_legacy_named_backreference_expr(&mut self, name: &str) -> Result<Expr, RegexError> {
let mut pattern = String::with_capacity(name.len() + 3);
pattern.push('k');
pattern.push('<');
pattern.push_str(name);
pattern.push('>');
let mut parser = Parser::new(&pattern, false, false);
parser.parse_named_backreference_syntax = false;
match parser.parse() {
Ok(mut expr) => {
if !parser.named_captures.is_empty() {
return Err(RegexError::new("invalid capture group name"));
}
if parser.capture_count > 0 {
let offset = self.capture_count;
shift_expr_capture_indices(&mut expr, offset);
self.capture_count += parser.capture_count;
}
Ok(expr)
}
Err(_) => Ok(Self::literal_named_backreference_expr(name)),
}
}
fn looks_like_braced_quantifier(&self, brace_index: usize) -> bool {
if self.chars.get(brace_index) != Some(&'{') {
return false;
}
let mut idx = brace_index + 1;
let mut saw_digit = false;
while self.chars.get(idx).is_some_and(|ch| ch.is_ascii_digit()) {
saw_digit = true;
idx += 1;
}
if !saw_digit {
return false;
}
match self.chars.get(idx).copied() {
Some('}') => true,
Some(',') => {
idx += 1;
while self.chars.get(idx).is_some_and(|ch| ch.is_ascii_digit()) {
idx += 1;
}
self.chars.get(idx) == Some(&'}')
}
_ => false,
}
}
fn peek(&self) -> Option<char> {
self.chars.get(self.i).copied()
}
fn peek_next(&self) -> Option<char> {
self.chars.get(self.i + 1).copied()
}
fn next(&mut self) -> Option<char> {
let ch = self.peek()?;
self.i += 1;
Some(ch)
}
}
fn chars_equal(left: char, right: char, ignore_case: bool, unicode_mode: bool) -> bool {
if !ignore_case {
return left == right;
}
let left = canonicalize_case_insensitive(left, unicode_mode);
let right = canonicalize_case_insensitive(right, unicode_mode);
left == right
}
fn canonicalize_case_insensitive(ch: char, unicode_mode: bool) -> char {
if unicode_mode {
canonicalize_unicode_char(ch)
} else {
canonicalize_non_unicode_char(ch)
}
}
fn canonicalize_non_unicode_char(ch: char) -> char {
let mut upper = ch.to_uppercase();
let Some(first) = upper.next() else {
return ch;
};
if upper.next().is_some() {
return ch;
}
if (ch as u32) >= 0x80 && (first as u32) < 0x80 {
return ch;
}
first
}
fn canonicalize_unicode_char(ch: char) -> char {
match ch {
'\u{017F}' => 's', '\u{212A}' => 'k', '\u{03C2}' => '\u{03C3}', _ => {
let mut lower = ch.to_lowercase();
let Some(first) = lower.next() else {
return ch;
};
if lower.next().is_some() { ch } else { first }
}
}
}
fn class_set_operation_matches_char(
operands: &[CharClass],
operators: &[ClassSetOperator],
negated: bool,
ch: char,
ignore_case: bool,
unicode_mode: bool,
) -> bool {
let Some(first) = operands.first() else {
return false;
};
let mut matched = first.matches(ch, ignore_case, unicode_mode);
for (op, operand) in operators.iter().zip(operands.iter().skip(1)) {
let right = operand.matches(ch, ignore_case, unicode_mode);
match op {
ClassSetOperator::Intersection => matched = matched && right,
ClassSetOperator::Difference => matched = matched && !right,
}
}
if negated { !matched } else { matched }
}
fn class_set_operation_match_ends(
prepared: &PreparedInput<'_>,
pos: usize,
operands: &[CharClass],
operators: &[ClassSetOperator],
negated: bool,
ignore_case: bool,
unicode_mode: bool,
) -> Vec<usize> {
let Some(first) = operands.first() else {
return Vec::new();
};
let mut current = first
.match_ends(prepared, pos, ignore_case, unicode_mode)
.into_iter()
.collect::<HashSet<_>>();
for (op, operand) in operators.iter().zip(operands.iter().skip(1)) {
let rhs = operand
.match_ends(prepared, pos, ignore_case, unicode_mode)
.into_iter()
.collect::<HashSet<_>>();
match op {
ClassSetOperator::Intersection => current.retain(|end| rhs.contains(end)),
ClassSetOperator::Difference => current.retain(|end| !rhs.contains(end)),
}
}
if negated {
let scalar_end = if unicode_mode {
let Some((_, width)) = prepared.unicode_char_at(pos) else {
return Vec::new();
};
pos + width
} else {
let Some(_) = prepared.char_at(pos) else {
return Vec::new();
};
pos + 1
};
if current.contains(&scalar_end) {
Vec::new()
} else {
vec![scalar_end]
}
} else {
dedupe_sorted_desc(current.into_iter().collect::<Vec<_>>())
}
}
fn class_set_operation_match_starts(
prepared: &PreparedInput<'_>,
end: usize,
operands: &[CharClass],
operators: &[ClassSetOperator],
negated: bool,
ignore_case: bool,
unicode_mode: bool,
) -> Vec<usize> {
let mut out = Vec::new();
for start in 0..=end {
if class_set_operation_match_ends(
prepared,
start,
operands,
operators,
negated,
ignore_case,
unicode_mode,
)
.iter()
.any(|candidate_end| *candidate_end == end)
{
out.push(start);
}
}
dedupe_sorted_asc(out)
}
fn expr_match_ends(
expr: &Expr,
prepared: &PreparedInput<'_>,
pos: usize,
ignore_case: bool,
unicode_mode: bool,
) -> Vec<usize> {
match expr {
Expr::Empty => vec![pos],
Expr::Literal(expected) => {
if unicode_mode {
prepared
.unicode_char_at(pos)
.filter(|(actual, _)| {
chars_equal(*actual, *expected, ignore_case, unicode_mode)
})
.map(|(_, width)| pos + width)
.into_iter()
.collect::<Vec<_>>()
} else {
prepared
.char_at(pos)
.filter(|actual| chars_equal(*actual, *expected, ignore_case, unicode_mode))
.map(|_| pos + 1)
.into_iter()
.collect::<Vec<_>>()
}
}
Expr::Sequence(parts) => {
let mut cursors = vec![pos];
for part in parts {
let mut next = Vec::new();
for cursor in &cursors {
next.extend(expr_match_ends(
part,
prepared,
*cursor,
ignore_case,
unicode_mode,
));
}
if next.is_empty() {
return Vec::new();
}
cursors = dedupe_sorted_desc(next);
}
cursors
}
Expr::Alternation(branches) => {
let mut out = Vec::new();
for branch in branches {
out.extend(expr_match_ends(
branch,
prepared,
pos,
ignore_case,
unicode_mode,
));
}
dedupe_sorted_desc(out)
}
Expr::CharClass(class) => class.match_ends(prepared, pos, ignore_case, unicode_mode),
Expr::ClassSetOperation {
operands,
operators,
negated,
} => class_set_operation_match_ends(
prepared,
pos,
operands,
operators,
*negated,
ignore_case,
unicode_mode,
),
Expr::UnicodeStringProperty(property) => property
.match_at(prepared, pos)
.into_iter()
.collect::<Vec<_>>(),
_ => Vec::new(),
}
}
fn expr_matches_single_char(expr: &Expr, ch: char, ignore_case: bool, unicode_mode: bool) -> bool {
match expr {
Expr::CharClass(class) => class.matches(ch, ignore_case, unicode_mode),
Expr::ClassSetOperation {
operands,
operators,
negated,
} => class_set_operation_matches_char(
operands,
operators,
*negated,
ch,
ignore_case,
unicode_mode,
),
Expr::Alternation(branches) => branches
.iter()
.any(|branch| expr_matches_single_char(branch, ch, ignore_case, unicode_mode)),
Expr::Literal(expected) => chars_equal(ch, *expected, ignore_case, unicode_mode),
Expr::UnicodeStringProperty(_) => false,
_ => false,
}
}
fn single_char_string(value: &str) -> Option<char> {
let mut chars = value.chars();
let first = chars.next()?;
if chars.next().is_none() {
Some(first)
} else {
None
}
}
fn unicode_string_property_match_starts(
property: &UnicodeStringProperty,
prepared: &PreparedInput<'_>,
end: usize,
) -> Vec<usize> {
let mut out = Vec::new();
for start in 0..=end {
if property.match_at(prepared, start) == Some(end) {
out.push(start);
}
}
dedupe_sorted_asc(out)
}
fn match_literal_string_at(
prepared: &PreparedInput<'_>,
pos: usize,
candidate: &str,
ignore_case: bool,
unicode_mode: bool,
) -> Option<usize> {
let mut index = pos;
for expected in candidate.chars() {
let (actual, width) = if unicode_mode {
prepared.unicode_char_at(index)?
} else {
(prepared.char_at(index)?, 1)
};
if !chars_equal(actual, expected, ignore_case, unicode_mode) {
return None;
}
index += width;
}
Some(index)
}
fn dedupe_sorted_desc(mut values: Vec<usize>) -> Vec<usize> {
values.sort_unstable();
values.dedup();
values.reverse();
values
}
fn dedupe_sorted_asc(mut values: Vec<usize>) -> Vec<usize> {
values.sort_unstable();
values.dedup();
values
}
fn dedupe_name_sets(values: Vec<HashSet<String>>) -> Vec<HashSet<String>> {
let mut out: Vec<HashSet<String>> = Vec::new();
for value in values {
if !out.iter().any(|existing| existing == &value) {
out.push(value);
}
}
out
}
fn capture_slots_in_expr(expr: &Expr) -> Vec<usize> {
let mut out = Vec::new();
collect_capture_slots(expr, &mut out);
out.sort_unstable();
out.dedup();
out
}
fn collect_capture_slots(expr: &Expr, out: &mut Vec<usize>) {
match expr {
Expr::Group { index, expr } => {
if let Some(index) = index {
out.push(*index);
}
collect_capture_slots(expr, out);
}
Expr::Sequence(parts) | Expr::Alternation(parts) => {
for part in parts {
collect_capture_slots(part, out);
}
}
Expr::FlagModifierGroup { expr, .. }
| Expr::LookAhead { expr, .. }
| Expr::LookBehind { expr, .. }
| Expr::Quantifier { expr, .. } => collect_capture_slots(expr, out),
_ => {}
}
}
fn clear_capture_slots(captures: &mut [Option<(usize, usize)>], slots: &[usize]) {
for slot in slots {
if let Some(capture) = captures.get_mut(*slot) {
*capture = None;
}
}
}
fn legacy_named_backreference_name_chars(name: &str) -> Vec<char> {
let chars = name.chars().collect::<Vec<_>>();
let mut out = Vec::new();
let mut i = 0usize;
while i < chars.len() {
let ch = chars[i];
i += 1;
if ch != '\\' {
out.push(ch);
continue;
}
if i >= chars.len() {
break;
}
let escaped = chars[i];
i += 1;
match escaped {
'c' => {
if i < chars.len() && chars[i].is_ascii_alphabetic() {
out.push(control_escape_char(chars[i]));
i += 1;
} else {
out.push('c');
}
}
'n' => out.push('\n'),
'r' => out.push('\r'),
't' => out.push('\t'),
'v' => out.push('\u{000B}'),
'f' => out.push('\u{000C}'),
'0' => {
if i < chars.len() && chars[i].is_ascii_digit() {
if matches!(chars[i], '0'..='7') {
let mut raw = String::from("0");
let mut consumed = 0usize;
while i < chars.len() && consumed < 2 && matches!(chars[i], '0'..='7') {
raw.push(chars[i]);
i += 1;
consumed += 1;
}
let value =
u32::from_str_radix(&raw, 8).expect("legacy octal digits are valid");
out.push(char::from_u32(value).unwrap_or('\0'));
} else {
out.push('\0');
}
} else {
out.push('\0');
}
}
'1'..='9' => {
let mut raw = String::new();
raw.push(escaped);
while i < chars.len() && chars[i].is_ascii_digit() {
raw.push(chars[i]);
i += 1;
}
out.extend(legacy_decimal_escape_to_chars(&raw));
}
'x' => {
if i + 1 < chars.len()
&& chars[i].is_ascii_hexdigit()
&& chars[i + 1].is_ascii_hexdigit()
{
let raw = [chars[i], chars[i + 1]].iter().collect::<String>();
i += 2;
let value =
u32::from_str_radix(&raw, 16).expect("hexadecimal escape digits are valid");
out.push(char::from_u32(value).unwrap_or('\0'));
} else {
out.push('x');
}
}
'u' => {
if i + 3 < chars.len() && chars[i..i + 4].iter().all(|ch| ch.is_ascii_hexdigit()) {
let raw = chars[i..i + 4].iter().collect::<String>();
i += 4;
let value =
u32::from_str_radix(&raw, 16).expect("unicode escape digits are valid");
out.push(char::from_u32(value).unwrap_or('\0'));
} else {
out.push('u');
}
}
other => out.push(other),
}
}
out
}
fn legacy_decimal_escape_to_chars(raw: &str) -> Vec<char> {
let chars = raw.chars().collect::<Vec<_>>();
let Some(first) = chars.first().copied() else {
return Vec::new();
};
if first == '8' || first == '9' {
return chars;
}
let mut octal = String::new();
octal.push(first);
let mut used = 1usize;
while used < chars.len() && used < 3 && matches!(chars[used], '0'..='7') {
octal.push(chars[used]);
used += 1;
}
let value = u32::from_str_radix(&octal, 8).expect("legacy octal digits are valid");
let mut out = vec![char::from_u32(value).unwrap_or('\0')];
out.extend(chars[used..].iter().copied());
out
}
fn shift_expr_capture_indices(expr: &mut Expr, offset: usize) {
if offset == 0 {
return;
}
match expr {
Expr::Group { index, expr } => {
if let Some(index) = index {
*index += offset;
}
shift_expr_capture_indices(expr, offset);
}
Expr::BackReference(index) => {
*index += offset;
}
Expr::BackReferenceAny(indices) => {
for index in indices {
*index += offset;
}
}
Expr::Sequence(parts) | Expr::Alternation(parts) => {
for part in parts {
shift_expr_capture_indices(part, offset);
}
}
Expr::FlagModifierGroup { expr, .. }
| Expr::LookAhead { expr, .. }
| Expr::LookBehind { expr, .. }
| Expr::Quantifier { expr, .. } => {
shift_expr_capture_indices(expr, offset);
}
_ => {}
}
}
fn class_item_contains_non_scalar_string(item: &ClassItem) -> bool {
match item {
ClassItem::UnicodeStringProperty(_) => false,
ClassItem::StringDisjunction(alternatives) => alternatives
.iter()
.any(|alternative| single_char_string(alternative).is_none()),
ClassItem::NestedExpression(expr) => expr_contains_non_scalar_string(expr),
_ => false,
}
}
fn expr_contains_non_scalar_string(expr: &Expr) -> bool {
match expr {
Expr::Empty => true,
Expr::Literal(_) => false,
Expr::UnicodeStringProperty(_) => true,
Expr::CharClass(class) => class
.items
.iter()
.any(class_item_contains_non_scalar_string),
Expr::ClassSetOperation { operands, .. } => operands.iter().any(|operand| {
operand
.items
.iter()
.any(class_item_contains_non_scalar_string)
}),
Expr::Sequence(parts) => {
if parts.len() != 1 {
true
} else {
parts.iter().any(expr_contains_non_scalar_string)
}
}
Expr::Alternation(branches) => branches.iter().any(expr_contains_non_scalar_string),
Expr::Group { expr, .. }
| Expr::FlagModifierGroup { expr, .. }
| Expr::LookAhead { expr, .. }
| Expr::LookBehind { expr, .. }
| Expr::Quantifier { expr, .. } => expr_contains_non_scalar_string(expr),
_ => false,
}
}
fn class_item_contains_unicode_string_property(item: &ClassItem) -> bool {
match item {
ClassItem::UnicodeStringProperty(_) => true,
ClassItem::NestedExpression(expr) => expr_contains_unicode_string_property(expr),
_ => false,
}
}
fn expr_contains_unicode_string_property(expr: &Expr) -> bool {
match expr {
Expr::UnicodeStringProperty(_) => true,
Expr::CharClass(class) => class
.items
.iter()
.any(class_item_contains_unicode_string_property),
Expr::ClassSetOperation { operands, .. } => operands.iter().any(|operand| {
operand
.items
.iter()
.any(class_item_contains_unicode_string_property)
}),
Expr::Alternation(branches) | Expr::Sequence(branches) => {
branches.iter().any(expr_contains_unicode_string_property)
}
Expr::Group { expr, .. }
| Expr::FlagModifierGroup { expr, .. }
| Expr::LookAhead { expr, .. }
| Expr::LookBehind { expr, .. }
| Expr::Quantifier { expr, .. } => expr_contains_unicode_string_property(expr),
_ => false,
}
}
fn normalize_group_name(raw: &str) -> Option<String> {
let mut chars = raw.chars().peekable();
let mut out = String::new();
while chars.peek().is_some() {
let ch = next_group_name_char(&mut chars)?;
if out.is_empty() {
if !is_group_name_start(ch) {
return None;
}
} else if !is_group_name_continue(ch) {
return None;
}
out.push(ch);
}
if out.is_empty() { None } else { Some(out) }
}
fn next_group_name_char(chars: &mut Peekable<Chars<'_>>) -> Option<char> {
let ch = chars.next()?;
if ch != '\\' {
return Some(ch);
}
parse_group_name_unicode_escape(chars)
}
fn parse_group_name_unicode_escape(chars: &mut Peekable<Chars<'_>>) -> Option<char> {
if chars.next()? != 'u' {
return None;
}
if chars.peek() == Some(&'{') {
chars.next();
let mut raw = String::new();
while let Some(&ch) = chars.peek() {
if ch == '}' {
break;
}
if !ch.is_ascii_hexdigit() {
return None;
}
raw.push(ch);
chars.next();
}
if raw.is_empty() || chars.next() != Some('}') {
return None;
}
let value = u32::from_str_radix(&raw, 16).ok()?;
char::from_u32(value)
} else {
let mut raw = String::new();
for _ in 0..4 {
let ch = chars.next()?;
if !ch.is_ascii_hexdigit() {
return None;
}
raw.push(ch);
}
let value = u32::from_str_radix(&raw, 16).ok()?;
char::from_u32(value)
}
}
fn is_group_name_start(ch: char) -> bool {
matches!(ch, '$' | '_') || ch.is_ascii_alphabetic() || (!ch.is_ascii() && ch.is_alphabetic())
}
fn is_group_name_continue(ch: char) -> bool {
is_group_name_start(ch)
|| ch.is_ascii_digit()
|| matches!(ch, '\u{200C}' | '\u{200D}')
|| is_combining_mark(ch)
}
fn is_combining_mark(ch: char) -> bool {
char_in_ranges(
ch,
&[
(0x0300, 0x036F),
(0x1AB0, 0x1AFF),
(0x1DC0, 0x1DFF),
(0x20D0, 0x20FF),
(0xFE20, 0xFE2F),
],
)
}
fn is_class_string_identity_escape(ch: char) -> bool {
ch.is_ascii_punctuation() && ch != '_'
}
fn flag_modifier_value(
flag: char,
enabling: &HashSet<char>,
disabling: &HashSet<char>,
) -> Option<bool> {
if enabling.contains(&flag) {
Some(true)
} else if disabling.contains(&flag) {
Some(false)
} else {
None
}
}
fn parse_unicode_property_name(raw: &str, unicode_sets_mode: bool) -> Option<UnicodePropertySpec> {
if let Some((name, value)) = raw.split_once('=') {
return parse_unicode_property_pair(name, value);
}
match raw {
"L" | "Letter" => Some(UnicodePropertySpec::CodePoint(UnicodeProperty::Letter)),
"N" | "Number" => Some(UnicodePropertySpec::CodePoint(UnicodeProperty::Number)),
"Nd" | "Decimal_Number" => Some(UnicodePropertySpec::CodePoint(
UnicodeProperty::DecimalNumber,
)),
"Lu" | "Uppercase_Letter" => Some(UnicodePropertySpec::CodePoint(
UnicodeProperty::UppercaseLetter,
)),
"Ll" | "Lowercase_Letter" => Some(UnicodePropertySpec::CodePoint(
UnicodeProperty::LowercaseLetter,
)),
"ASCII" => Some(UnicodePropertySpec::CodePoint(UnicodeProperty::Ascii)),
"Any" => Some(UnicodePropertySpec::CodePoint(UnicodeProperty::Any)),
"RGI_Emoji" if unicode_sets_mode => {
Some(UnicodePropertySpec::String(UnicodeStringProperty::RgiEmoji))
}
_ => None,
}
}
fn parse_unicode_property_pair(name: &str, value: &str) -> Option<UnicodePropertySpec> {
match name {
"gc" | "General_Category" => match value {
"L" | "Letter" => Some(UnicodePropertySpec::CodePoint(UnicodeProperty::Letter)),
"N" | "Number" => Some(UnicodePropertySpec::CodePoint(UnicodeProperty::Number)),
"Nd" | "Decimal_Number" => Some(UnicodePropertySpec::CodePoint(
UnicodeProperty::DecimalNumber,
)),
"Lu" | "Uppercase_Letter" => Some(UnicodePropertySpec::CodePoint(
UnicodeProperty::UppercaseLetter,
)),
"Ll" | "Lowercase_Letter" => Some(UnicodePropertySpec::CodePoint(
UnicodeProperty::LowercaseLetter,
)),
_ => None,
},
"sc" | "Script" | "scx" | "Script_Extensions" => match value {
"Latin" | "Latn" => Some(UnicodePropertySpec::CodePoint(UnicodeProperty::ScriptLatin)),
"Greek" | "Grek" => Some(UnicodePropertySpec::CodePoint(UnicodeProperty::ScriptGreek)),
_ => None,
},
_ => None,
}
}
fn match_rgi_emoji_at(prepared: &PreparedInput<'_>, pos: usize) -> Option<usize> {
let (ch, ch_width) = prepared.unicode_char_at(pos)?;
if is_keycap_base(ch) {
let mut idx = pos + ch_width;
if let Some((variation_selector, width)) = prepared.unicode_char_at(idx) {
if variation_selector == '\u{FE0F}' {
idx += width;
}
}
if let Some((enclosing_keycap, width)) = prepared.unicode_char_at(idx) {
if enclosing_keycap == '\u{20E3}' {
return Some(idx + width);
}
}
}
if is_regional_indicator(ch) {
let next_pos = pos + ch_width;
if let Some((next, next_width)) = prepared.unicode_char_at(next_pos) {
if is_regional_indicator(next) {
return Some(next_pos + next_width);
}
}
}
let mut idx = match_emoji_atom(prepared, pos)?;
while let Some((joiner, joiner_width)) = prepared.unicode_char_at(idx) {
if joiner != '\u{200D}' {
break;
}
let next = match_emoji_atom(prepared, idx + joiner_width)?;
idx = next;
}
Some(idx)
}
fn match_emoji_atom(prepared: &PreparedInput<'_>, pos: usize) -> Option<usize> {
let (ch, ch_width) = prepared.unicode_char_at(pos)?;
if !is_emoji_base(ch) {
return None;
}
let mut idx = pos + ch_width;
if let Some((variation_selector, width)) = prepared.unicode_char_at(idx) {
if variation_selector == '\u{FE0F}' {
idx += width;
}
}
if let Some((modifier, width)) = prepared.unicode_char_at(idx) {
if (0x1F3FB..=0x1F3FF).contains(&(modifier as u32)) {
idx += width;
}
}
Some(idx)
}
fn is_keycap_base(ch: char) -> bool {
ch.is_ascii_digit() || matches!(ch, '#' | '*')
}
fn is_regional_indicator(ch: char) -> bool {
(0x1F1E6..=0x1F1FF).contains(&(ch as u32))
}
fn is_emoji_base(ch: char) -> bool {
let code = ch as u32;
matches!(
code,
0x1F300..=0x1FAFF
| 0x2600..=0x27BF
| 0x2300..=0x23FF
| 0x2194..=0x21AA
| 0x00A9
| 0x00AE
| 0x203C
| 0x2049
| 0x2122
| 0x2139
| 0x2934
| 0x2935
| 0x3030
| 0x303D
| 0x3297
| 0x3299
)
}
fn control_escape_char(letter: char) -> char {
let upper = letter.to_ascii_uppercase() as u8;
char::from(upper - b'@')
}
fn is_legacy_class_control_escape_suffix(ch: char) -> bool {
ch.is_ascii_digit() || ch == '_'
}
fn control_escape_char_extended(ch: char) -> char {
if ch.is_ascii_alphabetic() {
return control_escape_char(ch);
}
let value = (ch as u8) & 0x1F;
char::from(value)
}
fn is_unicode_identity_escape_outside_class(ch: char) -> bool {
matches!(
ch,
'$' | '(' | ')' | '*' | '+' | '.' | '/' | '?' | '[' | '\\' | ']' | '^' | '{' | '|' | '}'
)
}
fn is_unicode_identity_escape_in_class(ch: char) -> bool {
matches!(
ch,
'$' | '('
| ')'
| '*'
| '+'
| '-'
| '.'
| '/'
| '?'
| '['
| '\\'
| ']'
| '^'
| '{'
| '|'
| '}'
)
}
fn char_in_ranges(ch: char, ranges: &[(u32, u32)]) -> bool {
let code = ch as u32;
ranges
.iter()
.any(|&(start, end)| start <= code && code <= end)
}
fn is_quantifier_target_supported(expr: &Expr) -> bool {
!matches!(
expr,
Expr::StartAnchor | Expr::EndAnchor | Expr::WordBoundary(_) | Expr::LookBehind { .. }
)
}
fn char_in_range(ch: char, start: char, end: char, ignore_case: bool, unicode_mode: bool) -> bool {
let (start_code, end_code, code) = if ignore_case {
if unicode_mode {
(
canonicalize_unicode_char(start) as u32,
canonicalize_unicode_char(end) as u32,
canonicalize_unicode_char(ch) as u32,
)
} else if start.is_ascii() && end.is_ascii() && ch.is_ascii() {
(
(start as u8).to_ascii_lowercase() as u32,
(end as u8).to_ascii_lowercase() as u32,
(ch as u8).to_ascii_lowercase() as u32,
)
} else {
(start as u32, end as u32, ch as u32)
}
} else {
(start as u32, end as u32, ch as u32)
};
if start_code <= end_code {
(start_code..=end_code).contains(&code)
} else {
(end_code..=start_code).contains(&code)
}
}
fn is_word_char(ch: char) -> bool {
ch.is_ascii_alphanumeric() || ch == '_'
}
fn is_word_char_regex(ch: char, ignore_case: bool, unicode_mode: bool) -> bool {
if is_word_char(ch) {
return true;
}
if !(ignore_case && unicode_mode) {
return false;
}
is_word_char(canonicalize_unicode_char(ch))
}
fn is_space_char(ch: char) -> bool {
matches!(
ch,
'\u{0009}'
| '\u{000A}'
| '\u{000B}'
| '\u{000C}'
| '\u{000D}'
| '\u{0020}'
| '\u{00A0}'
| '\u{1680}'
| '\u{2000}'
..='\u{200A}'
| '\u{2028}'
| '\u{2029}'
| '\u{202F}'
| '\u{205F}'
| '\u{3000}'
| '\u{FEFF}'
)
}
fn is_line_terminator(ch: char) -> bool {
matches!(ch, '\n' | '\r' | '\u{2028}' | '\u{2029}')
}
fn is_regex_meta(ch: char) -> bool {
matches!(
ch,
'\\' | '.' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '{' | '}' | '|' | '^' | '$' | '/'
)
}
const INTERNAL_SURROGATE_BASE: u32 = 0xE000;
const INTERNAL_SURROGATE_LAST: u32 = INTERNAL_SURROGATE_BASE + 0x07FF;
pub(crate) fn internalize_utf16_code_unit(unit: u16) -> char {
if (0xD800..=0xDFFF).contains(&unit) {
let mapped = INTERNAL_SURROGATE_BASE + (unit as u32 - 0xD800);
return char::from_u32(mapped).expect("internal surrogate marker must be valid scalar");
}
char::from_u32(unit as u32).expect("BMP non-surrogate code unit must be valid scalar")
}
pub(crate) fn deinternalize_surrogate_marker(ch: char) -> Option<u16> {
let code = ch as u32;
if (INTERNAL_SURROGATE_BASE..=INTERNAL_SURROGATE_LAST).contains(&code) {
Some((0xD800 + code - INTERNAL_SURROGATE_BASE) as u16)
} else {
None
}
}
fn unicode_escape_char_from_u32(value: u32) -> Result<char, RegexError> {
if (0xD800..=0xDFFF).contains(&value) {
return Ok(internalize_utf16_code_unit(value as u16));
}
char::from_u32(value).ok_or_else(|| RegexError::new("invalid Unicode escape sequence"))
}
fn append_hex_escape(out: &mut String, value: u32, width: usize) {
match width {
2 => out.push_str(&format!("\\x{value:02x}")),
4 => out.push_str(&format!("\\u{value:04x}")),
_ => out.push_str(&format!("\\u{{{value:x}}}")),
}
}
fn is_regexp_escape_hex_punctuator(ch: char) -> bool {
matches!(
ch,
'!' | '"'
| '#'
| '%'
| '&'
| '\''
| ','
| '-'
| ':'
| ';'
| '<'
| '='
| '>'
| '@'
| '`'
| '~'
)
}
fn is_regexp_escape_unicode_whitespace(ch: char) -> bool {
matches!(
ch,
'\u{1680}' | '\u{2000}'
..='\u{200A}'
| '\u{2028}'
| '\u{2029}'
| '\u{202F}'
| '\u{205F}'
| '\u{3000}'
| '\u{FEFF}'
)
}
const UNICODE_DECIMAL_NUMBER_RANGES: &[(u32, u32)] = &[
(0x30, 0x39),
(0x660, 0x669),
(0x6F0, 0x6F9),
(0x7C0, 0x7C9),
(0x966, 0x96F),
(0x9E6, 0x9EF),
(0xA66, 0xA6F),
(0xAE6, 0xAEF),
(0xB66, 0xB6F),
(0xBE6, 0xBEF),
(0xC66, 0xC6F),
(0xCE6, 0xCEF),
(0xD66, 0xD6F),
(0xDE6, 0xDEF),
(0xE50, 0xE59),
(0xED0, 0xED9),
(0xF20, 0xF29),
(0x1040, 0x1049),
(0x1090, 0x1099),
(0x17E0, 0x17E9),
(0x1810, 0x1819),
(0x1946, 0x194F),
(0x19D0, 0x19D9),
(0x1A80, 0x1A89),
(0x1A90, 0x1A99),
(0x1B50, 0x1B59),
(0x1BB0, 0x1BB9),
(0x1C40, 0x1C49),
(0x1C50, 0x1C59),
(0xA620, 0xA629),
(0xA8D0, 0xA8D9),
(0xA900, 0xA909),
(0xA9D0, 0xA9D9),
(0xA9F0, 0xA9F9),
(0xAA50, 0xAA59),
(0xABF0, 0xABF9),
(0xFF10, 0xFF19),
(0x104A0, 0x104A9),
(0x10D30, 0x10D39),
(0x10D40, 0x10D49),
(0x11066, 0x1106F),
(0x110F0, 0x110F9),
(0x11136, 0x1113F),
(0x111D0, 0x111D9),
(0x112F0, 0x112F9),
(0x11450, 0x11459),
(0x114D0, 0x114D9),
(0x11650, 0x11659),
(0x116C0, 0x116C9),
(0x116D0, 0x116E3),
(0x11730, 0x11739),
(0x118E0, 0x118E9),
(0x11950, 0x11959),
(0x11BF0, 0x11BF9),
(0x11C50, 0x11C59),
(0x11D50, 0x11D59),
(0x11DA0, 0x11DA9),
(0x11DE0, 0x11DE9),
(0x11F50, 0x11F59),
(0x16130, 0x16139),
(0x16A60, 0x16A69),
(0x16AC0, 0x16AC9),
(0x16B50, 0x16B59),
(0x16D70, 0x16D79),
(0x1CCF0, 0x1CCF9),
(0x1D7CE, 0x1D7FF),
(0x1E140, 0x1E149),
(0x1E2F0, 0x1E2F9),
(0x1E4F0, 0x1E4F9),
(0x1E5F1, 0x1E5FA),
(0x1E950, 0x1E959),
(0x1FBF0, 0x1FBF9),
];
const UNICODE_SCRIPT_LATIN_RANGES: &[(u32, u32)] = &[
(0x41, 0x5A),
(0x61, 0x7A),
(0xAA, 0xAA),
(0xBA, 0xBA),
(0xC0, 0xD6),
(0xD8, 0xF6),
(0xF8, 0x2B8),
(0x2E0, 0x2E4),
(0x1D00, 0x1D25),
(0x1D2C, 0x1D5C),
(0x1D62, 0x1D65),
(0x1D6B, 0x1D77),
(0x1D79, 0x1DBE),
(0x1E00, 0x1EFF),
(0x2071, 0x2071),
(0x207F, 0x207F),
(0x2090, 0x209C),
(0x212A, 0x212B),
(0x2132, 0x2132),
(0x214E, 0x214E),
(0x2160, 0x2188),
(0x2C60, 0x2C7F),
(0xA722, 0xA787),
(0xA78B, 0xA7DC),
(0xA7F1, 0xA7FF),
(0xAB30, 0xAB5A),
(0xAB5C, 0xAB64),
(0xAB66, 0xAB69),
(0xFB00, 0xFB06),
(0xFF21, 0xFF3A),
(0xFF41, 0xFF5A),
(0x10780, 0x10785),
(0x10787, 0x107B0),
(0x107B2, 0x107BA),
(0x1DF00, 0x1DF1E),
(0x1DF25, 0x1DF2A),
];
const UNICODE_SCRIPT_GREEK_RANGES: &[(u32, u32)] = &[
(0x370, 0x373),
(0x375, 0x377),
(0x37A, 0x37D),
(0x37F, 0x37F),
(0x384, 0x384),
(0x386, 0x386),
(0x388, 0x38A),
(0x38C, 0x38C),
(0x38E, 0x3A1),
(0x3A3, 0x3E1),
(0x3F0, 0x3FF),
(0x1D26, 0x1D2A),
(0x1D5D, 0x1D61),
(0x1D66, 0x1D6A),
(0x1DBF, 0x1DBF),
(0x1F00, 0x1F15),
(0x1F18, 0x1F1D),
(0x1F20, 0x1F45),
(0x1F48, 0x1F4D),
(0x1F50, 0x1F57),
(0x1F59, 0x1F59),
(0x1F5B, 0x1F5B),
(0x1F5D, 0x1F5D),
(0x1F5F, 0x1F7D),
(0x1F80, 0x1FB4),
(0x1FB6, 0x1FC4),
(0x1FC6, 0x1FD3),
(0x1FD6, 0x1FDB),
(0x1FDD, 0x1FEF),
(0x1FF2, 0x1FF4),
(0x1FF6, 0x1FFE),
(0x2126, 0x2126),
(0xAB65, 0xAB65),
(0x10140, 0x1018E),
(0x101A0, 0x101A0),
(0x1D200, 0x1D245),
];