use crate::unicode_categories::{get_block_ranges, get_category_ranges};
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RunePair {
pub lo: char,
pub hi: char,
}
pub type RuneRange = Vec<RunePair>;
pub const REGEXP_QUANTIFIER_MAX: i32 = 100;
const ESCAPE: char = '~';
pub const RUNE_MAX: char = '\u{10FFFF}';
const SURROGATE_START_CP: u32 = 0xD800;
const SURROGATE_END_CP: u32 = 0xDFFF;
#[derive(Debug, Clone)]
pub struct QuantifiedAtom {
pub is_dot: bool,
pub runes: RuneRange,
pub quant_min: i32,
pub quant_max: i32,
pub subtree: Option<RegexpRoot>,
pub cache_key: Option<String>,
pub lookaround: Option<LookaroundType>,
pub ascii_negated_bytes: Option<Vec<u8>>,
pub is_word_boundary: Option<bool>,
}
impl Default for QuantifiedAtom {
fn default() -> Self {
Self {
is_dot: false,
runes: Vec::new(),
quant_min: 1,
quant_max: 1,
subtree: None,
cache_key: None,
lookaround: None,
ascii_negated_bytes: None,
is_word_boundary: None,
}
}
}
impl QuantifiedAtom {
#[inline]
pub fn is_singleton(&self) -> bool {
self.quant_min == 1 && self.quant_max == 1
}
#[inline]
pub fn is_qm(&self) -> bool {
self.quant_min == 0 && self.quant_max == 1
}
#[inline]
pub fn is_plus(&self) -> bool {
self.quant_min == 1 && self.quant_max == REGEXP_QUANTIFIER_MAX
}
#[inline]
pub fn is_star(&self) -> bool {
self.quant_min == 0 && self.quant_max == REGEXP_QUANTIFIER_MAX
}
}
pub type RegexpBranch = Vec<QuantifiedAtom>;
pub type RegexpRoot = Vec<RegexpBranch>;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LookaroundType {
PositiveLookahead,
NegativeLookahead,
PositiveLookbehind,
NegativeLookbehind,
}
impl LookaroundType {
pub fn is_negative(&self) -> bool {
matches!(self, Self::NegativeLookahead | Self::NegativeLookbehind)
}
pub fn is_lookbehind(&self) -> bool {
matches!(self, Self::PositiveLookbehind | Self::NegativeLookbehind)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegexpFeature {
Dot,
Star,
Plus,
QuestionMark,
Range,
ParenGroup,
NonCapturingGroup,
LazyQuantifier,
Property,
Class,
NegatedClass,
OrBar,
Lookaround,
WordBoundary,
}
const IMPLEMENTED_FEATURES: &[RegexpFeature] = &[
RegexpFeature::Dot,
RegexpFeature::Class,
RegexpFeature::NegatedClass,
RegexpFeature::OrBar,
RegexpFeature::ParenGroup,
RegexpFeature::NonCapturingGroup,
RegexpFeature::LazyQuantifier,
RegexpFeature::QuestionMark,
RegexpFeature::Plus,
RegexpFeature::Star,
RegexpFeature::Range,
RegexpFeature::Property,
RegexpFeature::Lookaround,
RegexpFeature::WordBoundary,
];
struct RegexpParse {
bytes: Vec<u8>,
index: usize,
last_index: usize,
nesting: Vec<RegexpRoot>,
found_features: Vec<RegexpFeature>,
tree: RegexpRoot,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RegexpError {
pub message: String,
pub offset: usize,
}
impl std::fmt::Display for RegexpError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{} at offset {}", self.message, self.offset)
}
}
impl std::error::Error for RegexpError {}
impl RegexpParse {
fn new(re: &str) -> Self {
Self {
bytes: re.as_bytes().to_vec(),
index: 0,
last_index: 0,
nesting: Vec::new(),
found_features: Vec::new(),
tree: Vec::new(),
}
}
fn nest(&mut self) {
self.nesting.push(std::mem::take(&mut self.tree));
}
fn unnest(&mut self) -> RegexpRoot {
let subtree = std::mem::take(&mut self.tree);
self.tree = self.nesting.pop().unwrap_or_default();
subtree
}
fn is_nested(&self) -> bool {
!self.nesting.is_empty()
}
fn next_rune(&mut self) -> Result<char, RegexpError> {
if self.index >= self.bytes.len() {
return Err(RegexpError {
message: "end of string".into(),
offset: self.index,
});
}
self.last_index = self.index;
let remaining = &self.bytes[self.index..];
let s = std::str::from_utf8(remaining).map_err(|_| RegexpError {
message: "UTF-8 encoding error".into(),
offset: self.index,
})?;
let c = s.chars().next().ok_or_else(|| RegexpError {
message: "empty string".into(),
offset: self.index,
})?;
self.index += c.len_utf8();
Ok(c)
}
fn require(&mut self, wanted: char) -> Result<(), RegexpError> {
let got = self.next_rune()?;
if got != wanted {
return Err(RegexpError {
message: format!("expected '{}', got '{}'", wanted, got),
offset: self.last_index,
});
}
Ok(())
}
fn bypass_optional(&mut self, c: char) -> Result<bool, RegexpError> {
let next = self.next_rune()?;
if next != c {
self.backup1(next);
}
Ok(next == c)
}
fn backup1(&mut self, one_rune: char) {
self.index -= one_rune.len_utf8();
}
fn is_empty(&self) -> bool {
self.index >= self.bytes.len()
}
fn record_feature(&mut self, feature: RegexpFeature) {
if !self.found_features.contains(&feature) {
self.found_features.push(feature);
}
}
fn found_unimplemented(&self) -> Vec<RegexpFeature> {
self.found_features
.iter()
.filter(|f| !IMPLEMENTED_FEATURES.contains(f))
.copied()
.collect()
}
}
fn has_nested_lookaround(tree: &RegexpRoot) -> bool {
for branch in tree {
for atom in branch {
if atom.lookaround.is_some() {
if let Some(subtree) = &atom.subtree {
if tree_has_lookaround(subtree) {
return true;
}
}
}
}
}
false
}
fn tree_has_lookaround(tree: &RegexpRoot) -> bool {
for branch in tree {
for atom in branch {
if atom.lookaround.is_some() {
return true;
}
if let Some(subtree) = &atom.subtree {
if tree_has_lookaround(subtree) {
return true;
}
}
}
}
false
}
fn validate_lookarounds(tree: &RegexpRoot) -> Result<(), String> {
if has_nested_lookaround(tree) {
return Err("nested lookaround not supported: `(?=...(?=...)...)`".into());
}
for branch in tree {
for atom in branch {
if let Some(la_type) = &atom.lookaround {
if la_type.is_lookbehind() {
if let Some(subtree) = &atom.subtree {
if has_variable_length_pattern(subtree) {
return Err(
"variable-length lookbehind not yet supported: `(?<=a+)`; \
use fixed-length like `(?<=aaa)`"
.into(),
);
}
}
}
}
}
}
Ok(())
}
fn has_variable_length_pattern(tree: &RegexpRoot) -> bool {
if tree.len() > 1 {
let first_len = branch_fixed_length(&tree[0]);
for branch in tree.iter().skip(1) {
let branch_len = branch_fixed_length(branch);
if first_len != branch_len {
return true; }
}
return first_len.is_none();
}
if tree.is_empty() {
return false;
}
branch_fixed_length(&tree[0]).is_none()
}
fn branch_fixed_length(branch: &RegexpBranch) -> Option<usize> {
let mut total = 0usize;
for atom in branch {
if atom.quant_min != atom.quant_max {
return None;
}
if atom.is_star() || atom.is_plus() {
return None;
}
let atom_len = if atom.is_dot {
1
} else if !atom.runes.is_empty() {
1
} else if let Some(subtree) = &atom.subtree {
if subtree.len() == 1 {
branch_fixed_length(&subtree[0])?
} else {
let first_len = branch_fixed_length(&subtree[0])?;
for b in subtree.iter().skip(1) {
if branch_fixed_length(b) != Some(first_len) {
return None;
}
}
first_len
}
} else {
0 };
total += atom_len * (atom.quant_min as usize);
}
Some(total)
}
pub fn collect_lookarounds(tree: &RegexpRoot) -> Vec<(usize, usize, LookaroundType, RegexpRoot)> {
let mut result = Vec::new();
for (branch_idx, branch) in tree.iter().enumerate() {
for (atom_idx, atom) in branch.iter().enumerate() {
if let Some(la_type) = atom.lookaround {
if let Some(subtree) = &atom.subtree {
result.push((branch_idx, atom_idx, la_type, subtree.clone()));
}
}
}
}
result
}
pub fn has_top_level_lookaround(tree: &RegexpRoot) -> bool {
for branch in tree {
for atom in branch {
if atom.lookaround.is_some() {
return true;
}
}
}
false
}
pub fn has_word_boundary(tree: &RegexpRoot) -> bool {
for branch in tree {
for atom in branch {
if atom.is_word_boundary.is_some() {
return true;
}
}
}
false
}
fn word_char_runes() -> RuneRange {
vec![
RunePair { lo: 'a', hi: 'z' },
RunePair { lo: 'A', hi: 'Z' },
RunePair { lo: '0', hi: '9' },
RunePair { lo: '_', hi: '_' },
]
}
fn non_word_char_runes() -> RuneRange {
invert_rune_range(word_char_runes())
}
fn intersect_rune_ranges(a: &RuneRange, b: &RuneRange) -> RuneRange {
let mut result = Vec::new();
for ap in a {
for bp in b {
let lo = ap.lo.max(bp.lo);
let hi = ap.hi.min(bp.hi);
if lo <= hi {
result.push(RunePair { lo, hi });
}
}
}
simplify_rune_range(result)
}
enum ConstrainedAtom {
Single(QuantifiedAtom),
Split(QuantifiedAtom, QuantifiedAtom),
SplitOrAbsent(QuantifiedAtom, QuantifiedAtom),
}
fn constrain_atom_at_boundary(
atom: &QuantifiedAtom,
class: &RuneRange,
is_last_char: bool,
) -> Option<ConstrainedAtom> {
if atom.subtree.is_some() {
return None;
}
let base_runes = if atom.is_dot {
vec![RunePair {
lo: '\0',
hi: RUNE_MAX,
}]
} else {
atom.runes.clone()
};
let intersected = intersect_rune_ranges(&base_runes, class);
if intersected.is_empty() {
return None;
}
let cache_key = if intersected == simplify_rune_range(word_char_runes()) {
Some("wb_w".to_string())
} else if intersected == simplify_rune_range(non_word_char_runes()) {
Some("wb_W".to_string())
} else {
None
};
if atom.is_singleton() {
return Some(ConstrainedAtom::Single(QuantifiedAtom {
runes: intersected,
is_dot: false,
quant_min: 1,
quant_max: 1,
cache_key,
..Default::default()
}));
}
let constrained_single = QuantifiedAtom {
runes: intersected,
is_dot: false,
quant_min: 1,
quant_max: 1,
cache_key,
..Default::default()
};
let _ = is_last_char; let (new_min, new_max) = (
(atom.quant_min - 1).max(0),
if atom.quant_max == REGEXP_QUANTIFIER_MAX {
REGEXP_QUANTIFIER_MAX
} else {
atom.quant_max - 1
},
);
if new_max == 0 {
return Some(ConstrainedAtom::Single(constrained_single));
}
let base = QuantifiedAtom {
is_dot: atom.is_dot,
runes: atom.runes.clone(),
quant_min: new_min,
quant_max: new_max,
cache_key: atom.cache_key.clone(),
ascii_negated_bytes: atom.ascii_negated_bytes.clone(),
..Default::default()
};
if atom.quant_min == 0 {
Some(ConstrainedAtom::SplitOrAbsent(base, constrained_single))
} else {
Some(ConstrainedAtom::Split(base, constrained_single))
}
}
fn expand_constrained(ca: &ConstrainedAtom, base_first: bool) -> Vec<Vec<QuantifiedAtom>> {
match ca {
ConstrainedAtom::Single(a) => vec![vec![a.clone()]],
ConstrainedAtom::Split(base, single) | ConstrainedAtom::SplitOrAbsent(base, single) => {
if base_first {
vec![vec![base.clone(), single.clone()]]
} else {
vec![vec![single.clone(), base.clone()]]
}
}
}
}
fn expand_wb_at_start(suffix: &[QuantifiedAtom], is_boundary: bool, out: &mut Vec<RegexpBranch>) {
let wc = word_char_runes();
let nwc = non_word_char_runes();
let required_class = if is_boundary { &wc } else { &nwc };
let Some(constrained) = constrain_atom_at_boundary(&suffix[0], required_class, false) else {
return;
};
for atoms in expand_constrained(&constrained, false) {
let mut branch = atoms;
branch.extend_from_slice(&suffix[1..]);
out.push(branch);
}
if matches!(constrained, ConstrainedAtom::SplitOrAbsent(..)) && suffix.len() > 1 {
if let Some(c2) = constrain_atom_at_boundary(&suffix[1], required_class, false) {
for atoms in expand_constrained(&c2, false) {
let mut branch = atoms;
branch.extend_from_slice(&suffix[2..]);
out.push(branch);
}
}
}
}
fn expand_wb_at_end(prefix: &[QuantifiedAtom], is_boundary: bool, out: &mut Vec<RegexpBranch>) {
let wc = word_char_runes();
let nwc = non_word_char_runes();
let required_class = if is_boundary { &wc } else { &nwc };
let last_idx = prefix.len() - 1;
let Some(constrained) = constrain_atom_at_boundary(&prefix[last_idx], required_class, true)
else {
return;
};
for atoms in expand_constrained(&constrained, true) {
let mut branch = prefix[..last_idx].to_vec();
branch.extend(atoms);
out.push(branch);
}
if matches!(constrained, ConstrainedAtom::SplitOrAbsent(..)) && last_idx > 0 {
let prev = last_idx - 1;
if let Some(c2) = constrain_atom_at_boundary(&prefix[prev], required_class, true) {
for atoms in expand_constrained(&c2, true) {
let mut branch = prefix[..prev].to_vec();
branch.extend(atoms);
out.push(branch);
}
}
}
}
fn expand_wb_in_middle(
prefix: &[QuantifiedAtom],
suffix: &[QuantifiedAtom],
is_boundary: bool,
out: &mut Vec<RegexpBranch>,
) {
let wc = word_char_runes();
let nwc = non_word_char_runes();
let last_idx = prefix.len() - 1;
let class_pairs: [(&RuneRange, &RuneRange); 2] = if is_boundary {
[(&wc, &nwc), (&nwc, &wc)]
} else {
[(&wc, &wc), (&nwc, &nwc)]
};
for (last_class, first_class) in &class_pairs {
let cl = constrain_atom_at_boundary(&prefix[last_idx], last_class, true);
let cf = constrain_atom_at_boundary(&suffix[0], first_class, false);
let (Some(ref cl), Some(ref cf)) = (&cl, &cf) else {
continue;
};
for pe in &expand_constrained(cl, true) {
for se in &expand_constrained(cf, false) {
let mut branch = prefix[..last_idx].to_vec();
branch.extend(pe.clone());
branch.extend(se.clone());
branch.extend_from_slice(&suffix[1..]);
out.push(branch);
}
}
if matches!(cl, ConstrainedAtom::SplitOrAbsent(..)) {
let edge_class = if is_boundary { &wc } else { &nwc };
if let Some(c2) = constrain_atom_at_boundary(&suffix[0], edge_class, false) {
for se in expand_constrained(&c2, false) {
let mut branch = prefix[..last_idx].to_vec();
branch.extend(se);
branch.extend_from_slice(&suffix[1..]);
out.push(branch);
}
}
}
if matches!(cf, ConstrainedAtom::SplitOrAbsent(..)) {
let edge_class = if is_boundary { &wc } else { &nwc };
if let Some(c2) = constrain_atom_at_boundary(&prefix[last_idx], edge_class, true) {
for pe in expand_constrained(&c2, true) {
let mut branch = prefix[..last_idx].to_vec();
branch.extend(pe);
branch.extend_from_slice(&suffix[1..]);
out.push(branch);
}
}
}
}
}
pub fn expand_word_boundaries(tree: &RegexpRoot) -> Result<RegexpRoot, String> {
let mut result_branches: Vec<RegexpBranch> = Vec::new();
for branch in tree {
let wb_count = branch
.iter()
.filter(|a| a.is_word_boundary.is_some())
.count();
if wb_count == 0 {
result_branches.push(branch.clone());
continue;
}
if wb_count > 4 {
return Err("too many word boundaries in pattern (max 4)".into());
}
let mut alternatives = vec![branch.clone()];
loop {
let mut new_alternatives = Vec::new();
let mut found_wb = false;
for alt in &alternatives {
let wb_pos = alt.iter().position(|a| a.is_word_boundary.is_some());
let Some(pos) = wb_pos else {
new_alternatives.push(alt.clone());
continue;
};
found_wb = true;
let is_boundary = alt[pos].is_word_boundary.unwrap();
let prefix = &alt[..pos];
let suffix = &alt[pos + 1..];
match (prefix.is_empty(), suffix.is_empty()) {
(true, true) => {
if !is_boundary {
new_alternatives.push(Vec::new());
}
}
(true, false) => expand_wb_at_start(suffix, is_boundary, &mut new_alternatives),
(false, true) => expand_wb_at_end(prefix, is_boundary, &mut new_alternatives),
(false, false) => {
expand_wb_in_middle(prefix, suffix, is_boundary, &mut new_alternatives)
}
}
}
alternatives = new_alternatives;
if !found_wb {
break;
}
}
result_branches.extend(alternatives);
}
Ok(result_branches)
}
pub fn parse_regexp(re: &str) -> Result<RegexpRoot, RegexpError> {
let mut parse = RegexpParse::new(re);
read_branches(&mut parse)?;
if parse.is_nested() {
return Err(RegexpError {
message: "unclosed '('".into(),
offset: parse.index,
});
}
let tree = std::mem::take(&mut parse.tree);
if parse.found_features.contains(&RegexpFeature::Lookaround) {
if let Err(msg) = validate_lookarounds(&tree) {
return Err(RegexpError {
message: msg,
offset: 0,
});
}
}
let unimplemented = parse.found_unimplemented();
if !unimplemented.is_empty() {
return Err(RegexpError {
message: format!("unimplemented features: {:?}", unimplemented),
offset: 0,
});
}
Ok(tree)
}
fn read_branches(parse: &mut RegexpParse) -> Result<(), RegexpError> {
while !parse.is_empty() {
let branch = read_branch(parse)?;
parse.tree.push(branch);
if parse.is_empty() {
return Ok(());
}
let b = parse.next_rune()?;
if b == '|' {
parse.record_feature(RegexpFeature::OrBar);
continue;
} else if b == ')' {
parse.backup1(b);
return Ok(());
}
}
Ok(())
}
fn read_branch(parse: &mut RegexpParse) -> Result<RegexpBranch, RegexpError> {
let mut branch = Vec::new();
loop {
match read_piece(parse) {
Ok(piece) => branch.push(piece),
Err(e) if e.message == "stuck" => break,
Err(e) if e.message == "end of string" => break,
Err(e) => return Err(e),
}
}
Ok(branch)
}
fn read_piece(parse: &mut RegexpParse) -> Result<QuantifiedAtom, RegexpError> {
let mut qa = read_atom(parse)?;
read_quantifier(parse, &mut qa)?;
Ok(qa)
}
fn is_normal_char(c: char) -> bool {
let code = c as u32;
if code <= 0x27 || c == ',' || c == '-' || (0x2F..=0x3E).contains(&code) {
return true;
}
if (0x40..=0x5A).contains(&code) {
return true;
}
if code == 0x5c {
return true;
}
if (0x5E..=0x7A).contains(&code) {
return true;
}
if (0x7F..=0xD7FF).contains(&code) {
return true;
}
if (0xE000..=0x10FFFF).contains(&code) {
return true;
}
false
}
fn check_single_char_escape(c: char) -> Option<char> {
let code = c as u32;
if (0x28..=0x2B).contains(&code) {
return Some(c);
}
if c == '-' || c == '.' || c == '?' || (0x5B..=0x5E).contains(&code) {
return Some(c);
}
if c == 'n' {
return Some('\n');
}
if c == 'r' {
return Some('\r');
}
if c == 't' {
return Some('\t');
}
if (0x7B..=0x7D).contains(&code) {
return Some(c);
}
if c == ESCAPE {
return Some(ESCAPE);
}
None
}
fn xml_name_start_char() -> RuneRange {
vec![
RunePair { lo: ':', hi: ':' }, RunePair { lo: 'A', hi: 'Z' }, RunePair { lo: '_', hi: '_' }, RunePair { lo: 'a', hi: 'z' }, RunePair {
lo: '\u{C0}',
hi: '\u{D6}',
},
RunePair {
lo: '\u{D8}',
hi: '\u{F6}',
},
RunePair {
lo: '\u{F8}',
hi: '\u{2FF}',
},
RunePair {
lo: '\u{370}',
hi: '\u{37D}',
},
RunePair {
lo: '\u{37F}',
hi: '\u{1FFF}',
},
RunePair {
lo: '\u{200C}',
hi: '\u{200D}',
},
RunePair {
lo: '\u{2070}',
hi: '\u{218F}',
},
RunePair {
lo: '\u{2C00}',
hi: '\u{2FEF}',
},
RunePair {
lo: '\u{3001}',
hi: '\u{D7FF}',
},
RunePair {
lo: '\u{F900}',
hi: '\u{FDCF}',
},
RunePair {
lo: '\u{FDF0}',
hi: '\u{FFFD}',
},
RunePair {
lo: '\u{10000}',
hi: '\u{EFFFF}',
},
]
}
fn xml_name_char() -> RuneRange {
let mut rr = xml_name_start_char();
rr.extend([
RunePair { lo: '-', hi: '-' }, RunePair { lo: '.', hi: '.' }, RunePair { lo: '0', hi: '9' }, RunePair {
lo: '\u{B7}',
hi: '\u{B7}',
},
RunePair {
lo: '\u{300}',
hi: '\u{36F}',
},
RunePair {
lo: '\u{203F}',
hi: '\u{2040}',
},
]);
rr
}
fn check_multi_char_escape(c: char) -> Option<(RuneRange, Option<String>)> {
match c {
'd' => Some((vec![RunePair { lo: '0', hi: '9' }], None)),
'D' => Some((invert_rune_range(vec![RunePair { lo: '0', hi: '9' }]), None)),
'w' => Some((word_char_runes(), None)),
'W' => Some((invert_rune_range(word_char_runes()), None)),
's' => Some((
vec![
RunePair { lo: ' ', hi: ' ' },
RunePair { lo: '\t', hi: '\t' },
RunePair { lo: '\n', hi: '\n' },
RunePair { lo: '\r', hi: '\r' },
],
None,
)),
'S' => Some((
invert_rune_range(vec![
RunePair { lo: ' ', hi: ' ' },
RunePair { lo: '\t', hi: '\t' },
RunePair { lo: '\n', hi: '\n' },
RunePair { lo: '\r', hi: '\r' },
]),
None,
)),
'i' => Some((xml_name_start_char(), Some("i".to_string()))),
'I' => Some((
invert_rune_range(xml_name_start_char()),
Some("-i".to_string()),
)),
'c' => Some((xml_name_char(), Some("c".to_string()))),
'C' => Some((invert_rune_range(xml_name_char()), Some("-c".to_string()))),
_ => None,
}
}
fn read_atom(parse: &mut RegexpParse) -> Result<QuantifiedAtom, RegexpError> {
let b = parse.next_rune()?;
match b {
c if is_normal_char(c) => Ok(QuantifiedAtom {
runes: vec![RunePair { lo: c, hi: c }],
quant_min: 1,
quant_max: 1,
..Default::default()
}),
'.' => {
parse.record_feature(RegexpFeature::Dot);
Ok(QuantifiedAtom {
is_dot: true,
quant_min: 1,
quant_max: 1,
..Default::default()
})
}
'(' => {
parse.nest();
let mut lookaround_type: Option<LookaroundType> = None;
match parse.next_rune() {
Ok('?') => {
match parse.next_rune() {
Ok(':') => {
parse.record_feature(RegexpFeature::NonCapturingGroup);
}
Ok('=') => {
parse.record_feature(RegexpFeature::Lookaround);
lookaround_type = Some(LookaroundType::PositiveLookahead);
}
Ok('!') => {
parse.record_feature(RegexpFeature::Lookaround);
lookaround_type = Some(LookaroundType::NegativeLookahead);
}
Ok('<') => {
match parse.next_rune() {
Ok('=') => {
parse.record_feature(RegexpFeature::Lookaround);
lookaround_type = Some(LookaroundType::PositiveLookbehind);
}
Ok('!') => {
parse.record_feature(RegexpFeature::Lookaround);
lookaround_type = Some(LookaroundType::NegativeLookbehind);
}
Ok(c) => {
return Err(RegexpError {
message: format!(
"named capturing groups (?<{}...) not supported",
c
),
offset: parse.last_index,
});
}
Err(_) => {
return Err(RegexpError {
message: "unexpected end after (?<".into(),
offset: parse.last_index,
});
}
}
}
Ok(c) => {
return Err(RegexpError {
message: format!("unsupported group extension (?{}...)", c),
offset: parse.last_index,
});
}
Err(_) => {
return Err(RegexpError {
message: "unexpected end after (?".into(),
offset: parse.last_index,
});
}
}
}
Ok(c) => {
parse.record_feature(RegexpFeature::ParenGroup);
parse.backup1(c);
}
Err(_) => {
parse.record_feature(RegexpFeature::ParenGroup);
}
}
read_branches(parse)?;
parse.require(')')?;
let subtree = parse.unnest();
Ok(QuantifiedAtom {
subtree: Some(subtree),
quant_min: 1,
quant_max: 1,
lookaround: lookaround_type,
..Default::default()
})
}
')' => {
if parse.is_nested() {
parse.backup1(b);
Err(RegexpError {
message: "stuck".into(),
offset: parse.last_index,
})
} else {
Err(RegexpError {
message: "unbalanced ')'".into(),
offset: parse.last_index,
})
}
}
'[' => {
parse.record_feature(RegexpFeature::Class);
let (rr, ascii_negated_bytes) = read_char_class_expr(parse)?;
Ok(QuantifiedAtom {
runes: rr,
quant_min: 1,
quant_max: 1,
ascii_negated_bytes,
..Default::default()
})
}
']' => Err(RegexpError {
message: "invalid ']'".into(),
offset: parse.last_index,
}),
c if c == ESCAPE => {
let next = parse.next_rune().map_err(|_| RegexpError {
message: format!("'{}' at end of regular expression", ESCAPE),
offset: parse.last_index,
})?;
if let Some(escaped) = check_single_char_escape(next) {
return Ok(QuantifiedAtom {
runes: vec![RunePair {
lo: escaped,
hi: escaped,
}],
quant_min: 1,
quant_max: 1,
..Default::default()
});
}
if let Some((runes, cache_key)) = check_multi_char_escape(next) {
return Ok(QuantifiedAtom {
runes,
quant_min: 1,
quant_max: 1,
cache_key,
..Default::default()
});
}
if next == 'p' || next == 'P' {
parse.record_feature(RegexpFeature::Property);
let (mut runes, cache_key) = read_category(parse)?;
let cache_key = if next == 'P' {
runes = invert_rune_range(runes);
cache_key.map(|k| format!("-{}", k))
} else {
cache_key
};
return Ok(QuantifiedAtom {
runes,
quant_min: 1,
quant_max: 1,
cache_key,
..Default::default()
});
}
if next == 'b' || next == 'B' {
parse.record_feature(RegexpFeature::WordBoundary);
let is_boundary = next == 'b';
return Ok(QuantifiedAtom {
is_word_boundary: Some(is_boundary),
quant_min: 1,
quant_max: 1,
..Default::default()
});
}
if let Some(digit) = next.to_digit(10) {
if (1..=9).contains(&digit) {
return Err(RegexpError {
message: format!("backreferences (~{}) are not supported", digit),
offset: parse.last_index,
});
}
}
Err(RegexpError {
message: format!("invalid character '{}' after '{}'", next, ESCAPE),
offset: parse.last_index,
})
}
'?' | '+' | '*' | '{' => Err(RegexpError {
message: format!("invalid character '{}' (quantifier without atom)", b),
offset: parse.last_index,
}),
'|' => {
parse.backup1(b);
Err(RegexpError {
message: "stuck".into(),
offset: parse.last_index,
})
}
_ => {
parse.backup1(b);
Err(RegexpError {
message: "stuck".into(),
offset: parse.last_index,
})
}
}
}
fn read_char_class_expr(
parse: &mut RegexpParse,
) -> Result<(RuneRange, Option<Vec<u8>>), RegexpError> {
read_char_class_expr_depth(parse, 0)
}
const MAX_CLASS_SUBTRACTION_DEPTH: usize = 8;
fn read_char_class_expr_depth(
parse: &mut RegexpParse,
depth: usize,
) -> Result<(RuneRange, Option<Vec<u8>>), RegexpError> {
if depth > MAX_CLASS_SUBTRACTION_DEPTH {
return Err(RegexpError {
message: "character class subtraction nested too deeply".into(),
offset: parse.index,
});
}
if parse.is_empty() {
return Err(RegexpError {
message: "unclosed character class".into(),
offset: parse.index,
});
}
let is_negated = parse.bypass_optional('^')?;
if is_negated {
parse.record_feature(RegexpFeature::NegatedClass);
}
let mut rr = read_cce1s(parse)?;
if parse.bypass_optional('-') == Ok(true) {
let next = parse.next_rune().map_err(|_| RegexpError {
message: "unclosed character class".into(),
offset: parse.index,
})?;
if next == '[' {
let (subtract_rr, _) = read_char_class_expr_depth(parse, depth + 1)?;
rr = subtract_rune_range(rr, subtract_rr);
} else {
parse.backup1(next);
rr.push(RunePair { lo: '-', hi: '-' });
}
}
parse.require(']')?;
let ascii_negated_bytes = if is_negated {
detect_ascii_negated_bytes(&rr)
} else {
None
};
if is_negated {
rr = invert_rune_range(rr);
}
Ok((rr, ascii_negated_bytes))
}
fn detect_ascii_negated_bytes(rr: &RuneRange) -> Option<Vec<u8>> {
let mut bytes = Vec::new();
for pair in rr {
if pair.lo as u32 >= 128 || pair.hi as u32 >= 128 {
return None; }
for c in (pair.lo as u8)..=(pair.hi as u8) {
bytes.push(c);
if bytes.len() > 3 {
return None; }
}
}
if bytes.is_empty() || bytes.len() > 3 {
return None;
}
Some(bytes)
}
fn read_cce1s(parse: &mut RegexpParse) -> Result<RuneRange, RegexpError> {
let mut rr = Vec::new();
let mut first = true;
loop {
let cce1 = read_cce1(parse, first)?;
rr.extend(cce1);
first = false;
match parse.next_rune() {
Ok(r) => {
parse.backup1(r);
if r == '-' || r == ']' {
return Ok(simplify_rune_range(rr));
}
}
Err(_) => {
return Err(RegexpError {
message: "unclosed character class".into(),
offset: parse.index,
});
}
}
}
}
fn is_cc_char(r: char) -> bool {
let code = r as u32;
if code <= 0x2c || (0x2e..=0x5A).contains(&code) {
return true;
}
if (0x5e..=0xd7ff).contains(&code) {
return true;
}
if (0xe000..=0x10ffff).contains(&code) {
return true;
}
if r == '\\' {
return true;
}
false
}
fn read_cce1(parse: &mut RegexpParse, first: bool) -> Result<RuneRange, RegexpError> {
let r = parse.next_rune().map_err(|_| RegexpError {
message: "unclosed character class".into(),
offset: parse.index,
})?;
let lo = if first && r == '-' {
return Ok(vec![RunePair { lo: '-', hi: '-' }]);
} else if r == ESCAPE {
let next = parse.next_rune().map_err(|_| RegexpError {
message: "unclosed character class".into(),
offset: parse.index,
})?;
if next == 'p' || next == 'P' {
parse.record_feature(RegexpFeature::Property);
let (mut runes, _cache_key) = read_category(parse)?;
if next == 'P' {
runes = invert_rune_range(runes);
}
return Ok(runes);
}
if let Some((runes, _cache_key)) = check_multi_char_escape(next) {
return Ok(runes);
}
check_single_char_escape(next).ok_or_else(|| RegexpError {
message: format!(
"invalid character '{}' after {} in character class",
next, ESCAPE
),
offset: parse.last_index,
})?
} else {
if !is_cc_char(r) {
return Err(RegexpError {
message: format!("invalid character '{}' in character class", r),
offset: parse.last_index,
});
}
r
};
let next = parse.next_rune().map_err(|_| RegexpError {
message: "unclosed character class".into(),
offset: parse.index,
})?;
if next != '-' {
parse.backup1(next);
return Ok(vec![RunePair { lo, hi: lo }]);
}
let range_end = parse.next_rune().map_err(|_| RegexpError {
message: "unclosed character class".into(),
offset: parse.index,
})?;
if range_end == ']' {
parse.backup1(range_end);
return Ok(vec![RunePair { lo, hi: lo }, RunePair { lo: '-', hi: '-' }]);
}
if range_end == '[' {
parse.backup1(range_end);
parse.backup1('-');
return Ok(vec![RunePair { lo, hi: lo }]);
}
let hi = if range_end == ESCAPE {
let escaped = parse.next_rune().map_err(|_| RegexpError {
message: "unclosed character class".into(),
offset: parse.index,
})?;
check_single_char_escape(escaped).ok_or_else(|| RegexpError {
message: format!("invalid char '{}' after - in character class", escaped),
offset: parse.last_index,
})?
} else {
if !is_cc_char(range_end) {
return Err(RegexpError {
message: format!("invalid char '{}' in range", range_end),
offset: parse.last_index,
});
}
range_end
};
if lo > hi {
return Err(RegexpError {
message: format!("invalid range {}-{}", lo, hi),
offset: parse.last_index,
});
}
Ok(vec![RunePair { lo, hi }])
}
pub(crate) fn subtract_rune_range(base: RuneRange, subtract: RuneRange) -> RuneRange {
let base = simplify_rune_range(base);
let subtract = simplify_rune_range(subtract);
let mut result = Vec::new();
let mut sub_idx = 0;
for pair in &base {
let mut lo = pair.lo as u32;
let hi = pair.hi as u32;
while sub_idx < subtract.len() && (subtract[sub_idx].hi as u32) < lo {
sub_idx += 1;
}
let mut si = sub_idx;
while si < subtract.len() && (subtract[si].lo as u32) <= hi {
let sub_lo = subtract[si].lo as u32;
let sub_hi = subtract[si].hi as u32;
if lo < sub_lo {
if let (Some(r_lo), Some(r_hi)) = (char::from_u32(lo), char::from_u32(sub_lo - 1)) {
result.push(RunePair { lo: r_lo, hi: r_hi });
}
}
lo = sub_hi + 1;
si += 1;
}
if lo <= hi {
if let (Some(r_lo), Some(r_hi)) = (char::from_u32(lo), char::from_u32(hi)) {
result.push(RunePair { lo: r_lo, hi: r_hi });
}
}
}
result
}
pub(crate) fn simplify_rune_range(mut rranges: RuneRange) -> RuneRange {
if rranges.is_empty() {
return rranges;
}
rranges.sort_by_key(|rp| rp.lo);
let mut out = Vec::new();
let mut current = rranges[0];
for next in rranges.iter().skip(1).copied() {
if next.lo as u32 > current.hi as u32 + 1 {
out.push(current);
current = next;
continue;
}
if next.hi <= current.hi {
continue;
}
current.hi = next.hi;
}
out.push(current);
out
}
fn add_gap_range(inverted: &mut Vec<RunePair>, start: u32, end: u32) {
if start > end {
return;
}
if start < SURROGATE_START_CP && end >= SURROGATE_START_CP {
if let (Some(lo), Some(hi)) = (
char::from_u32(start),
char::from_u32(SURROGATE_START_CP - 1),
) {
inverted.push(RunePair { lo, hi });
}
if end > SURROGATE_END_CP {
if let (Some(lo), Some(hi)) =
(char::from_u32(SURROGATE_END_CP + 1), char::from_u32(end))
{
inverted.push(RunePair { lo, hi });
}
}
} else if (SURROGATE_START_CP..=SURROGATE_END_CP).contains(&start) {
if end > SURROGATE_END_CP {
if let (Some(lo), Some(hi)) =
(char::from_u32(SURROGATE_END_CP + 1), char::from_u32(end))
{
inverted.push(RunePair { lo, hi });
}
}
} else {
if let (Some(lo), Some(hi)) = (char::from_u32(start), char::from_u32(end)) {
inverted.push(RunePair { lo, hi });
}
}
}
pub(crate) fn invert_rune_range(mut rr: RuneRange) -> RuneRange {
rr.sort_by_key(|rp| rp.lo);
let mut merged: Vec<RunePair> = Vec::new();
for pair in rr {
if let Some(last) = merged.last_mut() {
if pair.lo as u32 <= last.hi as u32 + 1 {
if pair.hi > last.hi {
last.hi = pair.hi;
}
continue;
}
}
merged.push(pair);
}
let mut inverted = Vec::new();
let mut point: u32 = 0;
for pair in &merged {
let lo = pair.lo as u32;
if lo > point {
add_gap_range(&mut inverted, point, lo - 1);
}
point = pair.hi as u32 + 1;
}
if point <= RUNE_MAX as u32 {
add_gap_range(&mut inverted, point, RUNE_MAX as u32);
}
inverted
}
fn read_category(parse: &mut RegexpParse) -> Result<(RuneRange, Option<String>), RegexpError> {
parse.require('{')?;
let mut name = String::new();
loop {
let c = parse.next_rune()?;
if c == '}' {
break;
}
name.push(c);
}
if name.is_empty() {
return Err(RegexpError {
message: "empty category name".into(),
offset: parse.last_index,
});
}
if name.starts_with("Is") {
if let Some(ranges) = get_block_ranges(&name) {
return Ok((ranges, None));
}
return Err(RegexpError {
message: format!("unknown Unicode block ~p{{{}}}", name),
offset: parse.last_index,
});
}
let mut chars = name.chars();
let initial = match chars.next() {
Some(c) => c,
None => {
return Err(RegexpError {
message: "empty category name".into(),
offset: parse.last_index,
})
}
};
let valid_initials = ['L', 'M', 'N', 'P', 'Z', 'S', 'C'];
if !valid_initials.contains(&initial) {
return Err(RegexpError {
message: format!("unknown category {}", initial),
offset: parse.last_index,
});
}
let detail = chars.next();
if let Some(d) = detail {
let valid_details = match initial {
'L' => "ultmo",
'M' => "nce",
'N' => "dlo",
'P' => "cdseifo",
'Z' => "slp",
'S' => "mcko",
'C' => "cfon",
_ => "",
};
if !valid_details.contains(d) {
return Err(RegexpError {
message: format!("unknown category {}p{{{}{}", ESCAPE, initial, d),
offset: parse.last_index,
});
}
if chars.next().is_some() {
return Err(RegexpError {
message: format!("invalid category name ~p{{{}}}", name),
offset: parse.last_index,
});
}
}
let cache_key = detail.map_or_else(|| initial.to_string(), |d| format!("{}{}", initial, d));
if let Some(ranges) = get_category_ranges(initial, detail) {
Ok((ranges, Some(cache_key)))
} else {
Err(RegexpError {
message: format!("unknown category ~p{{{}}}", name),
offset: parse.last_index,
})
}
}
fn read_quantifier(parse: &mut RegexpParse, qa: &mut QuantifiedAtom) -> Result<(), RegexpError> {
let b = match parse.next_rune() {
Ok(c) => c,
Err(_) => {
qa.quant_min = 1;
qa.quant_max = 1;
return Ok(());
}
};
let mut is_quantifier = true;
match b {
'*' => {
parse.record_feature(RegexpFeature::Star);
qa.quant_min = 0;
qa.quant_max = REGEXP_QUANTIFIER_MAX;
}
'+' => {
parse.record_feature(RegexpFeature::Plus);
qa.quant_min = 1;
qa.quant_max = REGEXP_QUANTIFIER_MAX;
}
'?' => {
parse.record_feature(RegexpFeature::QuestionMark);
qa.quant_min = 0;
qa.quant_max = 1;
}
'{' => {
parse.record_feature(RegexpFeature::Range);
read_range_quantifier(parse, qa)?;
}
_ => {
qa.quant_min = 1;
qa.quant_max = 1;
parse.backup1(b);
is_quantifier = false;
}
}
if is_quantifier {
match parse.next_rune() {
Ok('?') => {
parse.record_feature(RegexpFeature::LazyQuantifier);
}
Ok(c) => {
parse.backup1(c);
}
Err(_) => {
}
}
}
Ok(())
}
fn read_range_quantifier(
parse: &mut RegexpParse,
qa: &mut QuantifiedAtom,
) -> Result<(), RegexpError> {
let next_or_eof = |p: &mut RegexpParse| -> Result<char, RegexpError> {
p.next_rune().map_err(|e| {
if e.message == "end of string" {
RegexpError {
message: "unexpected end of string in quantifier".into(),
offset: e.offset,
}
} else {
e
}
})
};
let mut lo_digits = String::new();
loop {
let b = next_or_eof(parse)?;
if b.is_ascii_digit() {
lo_digits.push(b);
} else {
if lo_digits.is_empty() {
return Err(RegexpError {
message: "invalid range quantifier, expecting digits".into(),
offset: parse.last_index,
});
}
let lo: i32 = lo_digits.parse().map_err(|_| RegexpError {
message: "invalid number in quantifier".into(),
offset: parse.last_index,
})?;
qa.quant_min = lo;
qa.quant_max = lo;
match b {
'}' => return Ok(()),
',' => {
qa.quant_max = REGEXP_QUANTIFIER_MAX;
break;
}
_ => {
return Err(RegexpError {
message: format!("unexpected character '{}' in quantifier", b),
offset: parse.last_index,
});
}
}
}
}
let b = next_or_eof(parse)?;
if b == '}' {
return Ok(());
}
if !b.is_ascii_digit() {
return Err(RegexpError {
message: format!(
"invalid character '{}' in quantifier range, wanted a digit",
b
),
offset: parse.last_index,
});
}
let mut hi_digits = String::from(b);
loop {
let b = next_or_eof(parse)?;
if b.is_ascii_digit() {
hi_digits.push(b);
} else if b == '}' {
let hi: i32 = hi_digits.parse().map_err(|_| RegexpError {
message: "invalid number in quantifier".into(),
offset: parse.last_index,
})?;
if hi < qa.quant_min {
return Err(RegexpError {
message: "invalid range quantifier, top must be greater than bottom".into(),
offset: parse.last_index,
});
}
qa.quant_max = hi;
return Ok(());
} else {
return Err(RegexpError {
message: format!("invalid character '{}', expected '}}'", b),
offset: parse.last_index,
});
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_detect_ascii_negated_bytes_single_char() {
let rr = vec![RunePair { lo: 'x', hi: 'x' }];
let result = detect_ascii_negated_bytes(&rr);
assert_eq!(result, Some(vec![b'x']));
}
#[test]
fn test_detect_ascii_negated_bytes_multiple_chars() {
let rr = vec![
RunePair { lo: 'a', hi: 'a' },
RunePair { lo: 'b', hi: 'b' },
RunePair { lo: 'c', hi: 'c' },
];
let result = detect_ascii_negated_bytes(&rr);
assert_eq!(result, Some(vec![b'a', b'b', b'c']));
}
#[test]
fn test_detect_ascii_negated_bytes_range() {
let rr = vec![RunePair { lo: 'a', hi: 'c' }];
let result = detect_ascii_negated_bytes(&rr);
assert_eq!(result, Some(vec![b'a', b'b', b'c']));
}
#[test]
fn test_detect_ascii_negated_bytes_too_many() {
let rr = vec![RunePair { lo: 'a', hi: 'z' }];
let result = detect_ascii_negated_bytes(&rr);
assert_eq!(result, None);
}
#[test]
fn test_detect_ascii_negated_bytes_non_ascii() {
let rr = vec![RunePair { lo: 'ü', hi: 'ü' }];
let result = detect_ascii_negated_bytes(&rr);
assert_eq!(result, None);
}
#[test]
fn test_detect_ascii_negated_bytes_mixed() {
let rr = vec![RunePair { lo: 'a', hi: 'a' }, RunePair { lo: 'ü', hi: 'ü' }];
let result = detect_ascii_negated_bytes(&rr);
assert_eq!(result, None);
}
#[test]
fn test_parse_negated_char_class_stores_ascii_bytes() {
let tree = parse_regexp("[^x]+").unwrap();
assert_eq!(tree.len(), 1);
assert_eq!(tree[0].len(), 1);
let qa = &tree[0][0];
assert_eq!(qa.ascii_negated_bytes, Some(vec![b'x']));
}
#[test]
fn test_parse_non_negated_class_no_ascii_bytes() {
let tree = parse_regexp("[abc]+").unwrap();
assert_eq!(tree.len(), 1);
assert_eq!(tree[0].len(), 1);
let qa = &tree[0][0];
assert_eq!(qa.ascii_negated_bytes, None);
}
#[test]
fn test_parse_negated_unicode_class_no_ascii_bytes() {
let tree = parse_regexp("[^ü]+").unwrap();
assert_eq!(tree.len(), 1);
assert_eq!(tree[0].len(), 1);
let qa = &tree[0][0];
assert_eq!(qa.ascii_negated_bytes, None);
}
#[test]
fn test_parse_negated_slash_class() {
let tree = parse_regexp("[^/]+").unwrap();
assert_eq!(tree.len(), 1);
assert_eq!(tree[0].len(), 1);
let qa = &tree[0][0];
assert_eq!(qa.ascii_negated_bytes, Some(vec![b'/']));
}
#[test]
fn test_parse_negated_quote_class() {
let tree = parse_regexp("[^\"]+").unwrap();
assert_eq!(tree.len(), 1);
assert_eq!(tree[0].len(), 1);
let qa = &tree[0][0];
assert_eq!(qa.ascii_negated_bytes, Some(vec![b'"']));
}
#[test]
fn test_wb_parse_boundary_marker() {
let tree = parse_regexp("~bhello").unwrap();
assert!(has_word_boundary(&tree));
let expanded = expand_word_boundaries(&tree).unwrap();
assert_eq!(expanded.len(), 1);
assert_eq!(expanded[0].len(), 5);
}
#[test]
fn test_wb_impossible_boundary_empty_tree() {
let tree = parse_regexp("ab~bcd").unwrap();
let expanded = expand_word_boundaries(&tree).unwrap();
assert!(
expanded.is_empty(),
"impossible boundary should produce empty tree"
);
}
#[test]
fn test_wb_middle_boundary_expansion() {
let tree = parse_regexp("ab~b cd").unwrap();
let expanded = expand_word_boundaries(&tree).unwrap();
assert_eq!(expanded.len(), 1); }
#[test]
fn test_wb_dot_star_expansion() {
let tree = parse_regexp(".*~bcat").unwrap();
let expanded = expand_word_boundaries(&tree).unwrap();
assert_eq!(expanded.len(), 2);
}
}