use alloc::boxed::Box;
use alloc::string::String;
use alloc::vec::Vec;
use core::fmt;
const MAX_QUANT: usize = crate::limits::DEFAULT_REGEX_MAX_QUANT;
const MAX_PARSE_DEPTH: u32 = crate::limits::DEFAULT_REGEX_MAX_PARSE_DEPTH;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RegexError {
message: String,
}
impl RegexError {
pub(crate) fn new(message: impl Into<String>) -> Self {
Self {
message: message.into(),
}
}
}
impl fmt::Display for RegexError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "invalid regular expression: {}", self.message)
}
}
pub(crate) enum Node {
Empty,
Char(u32),
Any,
Class { neg: bool, items: Vec<ClassItem> },
Start,
End,
WordBoundary { neg: bool },
Group {
index: Option<usize>,
inner: Box<Node>,
},
Look { neg: bool, inner: Box<Node> },
LookBehind { neg: bool, inner: Box<Node> },
Backref(usize),
NamedBackref(alloc::string::String),
Concat(Vec<Node>),
Alt(Vec<Node>),
Repeat {
inner: Box<Node>,
min: usize,
max: Option<usize>,
greedy: bool,
},
}
pub(crate) enum ClassItem {
Char(u32),
Range(u32, u32),
Shorthand(Shorthand),
}
#[derive(Clone, Copy)]
pub(crate) enum Shorthand {
Digit,
NotDigit,
Word,
NotWord,
Space,
NotSpace,
Property(PropKind, bool),
}
#[derive(Clone, Copy)]
pub(crate) enum PropKind {
Letter,
Upper,
Lower,
Number,
White,
Alnum,
Gc([u8; 2]),
}
pub(crate) fn general_category_code(name: &str) -> Option<[u8; 2]> {
let code = match name {
"Mark" => "M",
"Punctuation" => "P",
"Symbol" => "S",
"Separator" => "Z",
"Titlecase_Letter" => "Lt",
"Modifier_Letter" => "Lm",
"Other_Letter" => "Lo",
"Nonspacing_Mark" => "Mn",
"Spacing_Mark" => "Mc",
"Enclosing_Mark" => "Me",
"Letter_Number" => "Nl",
"Other_Number" => "No",
"Connector_Punctuation" => "Pc",
"Dash_Punctuation" => "Pd",
"Open_Punctuation" => "Ps",
"Close_Punctuation" => "Pe",
"Initial_Punctuation" => "Pi",
"Final_Punctuation" => "Pf",
"Other_Punctuation" => "Po",
"Math_Symbol" => "Sm",
"Currency_Symbol" => "Sc",
"Modifier_Symbol" => "Sk",
"Other_Symbol" => "So",
"Space_Separator" => "Zs",
"Line_Separator" => "Zl",
"Paragraph_Separator" => "Zp",
"Control" | "cntrl" => "Cc",
"Format" => "Cf",
"Private_Use" => "Co",
"Unassigned" => "Cn",
other => other,
};
const SUBCATS: [&str; 30] = [
"Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Mc", "Me", "Nd", "Nl", "No", "Pc", "Pd", "Ps", "Pe",
"Pi", "Pf", "Po", "Sm", "Sc", "Sk", "So", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn",
];
let b = code.as_bytes();
match code {
"L" | "M" | "N" | "P" | "S" | "Z" | "C" => Some([b[0], 0]),
_ if SUBCATS.contains(&code) => Some([b[0], b[1]]),
_ => None,
}
}
pub(crate) type GroupNames = Vec<(usize, alloc::string::String)>;
pub(crate) fn parse(pattern: &str, unicode: bool) -> Result<(Node, usize, GroupNames), RegexError> {
let mut p = Parser {
chars: pattern.chars().collect(),
pos: 0,
group_count: 0,
group_names: Vec::new(),
depth: 0,
unicode,
};
let node = p.parse_alt()?;
if p.pos != p.chars.len() {
return Err(RegexError::new(alloc::format!(
"unexpected `{}`",
p.chars[p.pos]
)));
}
Ok((node, p.group_count, p.group_names))
}
struct Parser {
chars: Vec<char>,
pos: usize,
group_count: usize,
group_names: Vec<(usize, alloc::string::String)>,
depth: u32,
unicode: bool,
}
impl Parser {
fn peek(&self) -> Option<char> {
self.chars.get(self.pos).copied()
}
fn bump(&mut self) -> Option<char> {
let c = self.peek();
if c.is_some() {
self.pos += 1;
}
c
}
fn eat(&mut self, c: char) -> bool {
if self.peek() == Some(c) {
self.pos += 1;
true
} else {
false
}
}
fn parse_alt(&mut self) -> Result<Node, RegexError> {
let mut branches = alloc::vec![self.parse_concat()?];
while self.eat('|') {
branches.push(self.parse_concat()?);
}
Ok(if branches.len() == 1 {
branches.pop().unwrap()
} else {
Node::Alt(branches)
})
}
fn parse_concat(&mut self) -> Result<Node, RegexError> {
let mut nodes = Vec::new();
while let Some(c) = self.peek() {
if c == '|' || c == ')' {
break;
}
nodes.push(self.parse_quantified()?);
}
Ok(match nodes.len() {
0 => Node::Empty,
1 => nodes.pop().unwrap(),
_ => Node::Concat(nodes),
})
}
fn parse_quantified(&mut self) -> Result<Node, RegexError> {
let atom = self.parse_atom()?;
let (min, max) = match self.peek() {
Some('*') => {
self.pos += 1;
(0, None)
}
Some('+') => {
self.pos += 1;
(1, None)
}
Some('?') => {
self.pos += 1;
(0, Some(1))
}
Some('{') => match self.try_parse_brace()? {
Some(bounds) => bounds,
None => return Ok(atom), },
_ => return Ok(atom),
};
let greedy = !self.eat('?');
Ok(Node::Repeat {
inner: Box::new(atom),
min,
max,
greedy,
})
}
fn try_parse_brace(&mut self) -> Result<Option<(usize, Option<usize>)>, RegexError> {
let save = self.pos;
self.pos += 1; let min = self.parse_int()?;
let Some(min) = min else {
self.pos = save;
return Ok(None);
};
let max = if self.eat(',') {
if self.peek() == Some('}') {
None
} else {
match self.parse_int()? {
Some(m) => Some(m),
None => {
self.pos = save;
return Ok(None);
}
}
}
} else {
Some(min)
};
if !self.eat('}') {
self.pos = save;
return Ok(None);
}
if let Some(m) = max
&& min > m
{
return Err(RegexError::new(
"quantifier range out of order (min greater than max)",
));
}
Ok(Some((min, max)))
}
fn parse_int(&mut self) -> Result<Option<usize>, RegexError> {
let start = self.pos;
let mut value: usize = 0;
let mut overflow = false;
while let Some(c) = self.peek() {
if let Some(d) = c.to_digit(10) {
value = value.saturating_mul(10).saturating_add(d as usize);
if value > MAX_QUANT {
overflow = true;
}
self.pos += 1;
} else {
break;
}
}
if self.pos == start {
Ok(None)
} else if overflow {
Err(RegexError::new("quantifier bound too large"))
} else {
Ok(Some(value))
}
}
fn parse_atom(&mut self) -> Result<Node, RegexError> {
match self.peek() {
Some('(') => self.parse_group(),
Some('[') => self.parse_class(),
Some('.') => {
self.pos += 1;
Ok(Node::Any)
}
Some('^') => {
self.pos += 1;
Ok(Node::Start)
}
Some('$') => {
self.pos += 1;
Ok(Node::End)
}
Some('\\') => self.parse_escape(),
Some(c @ ('*' | '+' | '?')) => Err(RegexError::new(alloc::format!(
"nothing to repeat before `{c}`"
))),
Some(c) => {
self.pos += 1;
Ok(Node::Char(c as u32))
}
None => Ok(Node::Empty),
}
}
fn parse_group(&mut self) -> Result<Node, RegexError> {
self.depth += 1;
if self.depth > MAX_PARSE_DEPTH {
self.depth -= 1;
return Err(RegexError::new("pattern nested too deeply"));
}
let result = self.parse_group_inner();
self.depth -= 1;
result
}
fn parse_group_inner(&mut self) -> Result<Node, RegexError> {
self.pos += 1; let index = if self.peek() == Some('?') {
if self.chars.get(self.pos + 1) == Some(&':') {
self.pos += 2;
None
} else if matches!(self.chars.get(self.pos + 1), Some('=' | '!')) {
let neg = self.chars.get(self.pos + 1) == Some(&'!');
self.pos += 2; let inner = self.parse_alt()?;
if !self.eat(')') {
return Err(RegexError::new("unterminated lookahead `(?=`"));
}
return Ok(Node::Look {
neg,
inner: Box::new(inner),
});
} else if self.chars.get(self.pos + 1) == Some(&'<')
&& matches!(self.chars.get(self.pos + 2), Some('=' | '!'))
{
let neg = self.chars.get(self.pos + 2) == Some(&'!');
self.pos += 3; let inner = self.parse_alt()?;
if !self.eat(')') {
return Err(RegexError::new("unterminated lookbehind `(?<=`"));
}
return Ok(Node::LookBehind {
neg,
inner: Box::new(inner),
});
} else if self.chars.get(self.pos + 1) == Some(&'<') {
self.pos += 2; let mut name = alloc::string::String::new();
while let Some(&c) = self.chars.get(self.pos) {
if c == '>' {
break;
}
name.push(c);
self.pos += 1;
}
if !self.eat('>') {
return Err(RegexError::new("unterminated group name `(?<`"));
}
self.group_count += 1;
self.group_names.push((self.group_count, name));
Some(self.group_count)
} else {
return Err(RegexError::new("unsupported group extension `(?…)`"));
}
} else {
self.group_count += 1;
Some(self.group_count)
};
let inner = self.parse_alt()?;
if !self.eat(')') {
return Err(RegexError::new("unterminated group `(`"));
}
Ok(Node::Group {
index,
inner: Box::new(inner),
})
}
fn parse_escape(&mut self) -> Result<Node, RegexError> {
self.pos += 1; let Some(c) = self.bump() else {
return Err(RegexError::new("trailing backslash"));
};
Ok(match c {
'd' => class_shorthand(Shorthand::Digit),
'D' => class_shorthand(Shorthand::NotDigit),
'w' => class_shorthand(Shorthand::Word),
'W' => class_shorthand(Shorthand::NotWord),
's' => class_shorthand(Shorthand::Space),
'S' => class_shorthand(Shorthand::NotSpace),
'b' => Node::WordBoundary { neg: false },
'B' => Node::WordBoundary { neg: true },
d if d.is_ascii_digit() && d != '0' => Node::Backref((d as u8 - b'0') as usize),
'k' if self.chars.get(self.pos) == Some(&'<') => {
self.eat('<');
let mut name = alloc::string::String::new();
while let Some(&c) = self.chars.get(self.pos) {
if c == '>' {
break;
}
name.push(c);
self.pos += 1;
}
if !self.eat('>') {
return Err(RegexError::new("unterminated `\\k<` group name"));
}
Node::NamedBackref(name)
}
'u' => Node::Char(self.parse_unicode_escape()?),
'x' => Node::Char(self.parse_hex_escape(2)?),
'p' => class_shorthand(Shorthand::Property(self.parse_property()?, false)),
'P' => class_shorthand(Shorthand::Property(self.parse_property()?, true)),
other => Node::Char(escape_char(other) as u32),
})
}
fn parse_property(&mut self) -> Result<PropKind, RegexError> {
if !self.eat('{') {
return Err(RegexError::new("expected `{` after `\\p`"));
}
let mut name = alloc::string::String::new();
loop {
match self.bump() {
Some('}') => break,
Some(c) => name.push(c),
None => return Err(RegexError::new("unterminated `\\p{…}`")),
}
}
Ok(match name.as_str() {
"L" | "Letter" | "Alphabetic" => PropKind::Letter,
"Lu" | "Uppercase" | "Uppercase_Letter" => PropKind::Upper,
"Ll" | "Lowercase" | "Lowercase_Letter" => PropKind::Lower,
"N" | "Nd" | "Number" | "Decimal_Number" => PropKind::Number,
"White_Space" | "space" => PropKind::White,
"Alnum" => PropKind::Alnum,
other => match general_category_code(other) {
Some(code) => PropKind::Gc(code),
None => {
return Err(RegexError::new(alloc::format!(
"unsupported \\p property `{other}`"
)));
}
},
})
}
fn parse_unicode_escape(&mut self) -> Result<u32, RegexError> {
if self.eat('{') {
let mut v: u32 = 0;
let mut any = false;
while let Some(c) = self.peek() {
if c == '}' {
self.pos += 1;
break;
}
let d = c
.to_digit(16)
.ok_or_else(|| RegexError::new("invalid `\\u{…}` escape"))?;
v = v.saturating_mul(16).saturating_add(d);
self.pos += 1;
any = true;
}
if !any {
return Err(RegexError::new("empty `\\u{}` escape"));
}
if v > 0x10_FFFF {
return Err(RegexError::new("escape is not a valid code point"));
}
return Ok(v);
}
let hi = self.parse_hex_digits(4)?;
if self.unicode
&& (0xD800..=0xDBFF).contains(&hi)
&& self.chars.get(self.pos) == Some(&'\\')
&& self.chars.get(self.pos + 1) == Some(&'u')
{
let save = self.pos;
self.pos += 2; let lo = self.parse_hex_digits(4)?;
if (0xDC00..=0xDFFF).contains(&lo) {
return Ok(0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00));
}
self.pos = save;
}
Ok(hi)
}
fn parse_hex_escape(&mut self, n: usize) -> Result<u32, RegexError> {
self.parse_hex_digits(n)
}
fn parse_hex_digits(&mut self, n: usize) -> Result<u32, RegexError> {
let mut v: u32 = 0;
for _ in 0..n {
let c = self
.bump()
.ok_or_else(|| RegexError::new("incomplete hex escape"))?;
let d = c
.to_digit(16)
.ok_or_else(|| RegexError::new("invalid hex digit in escape"))?;
v = v * 16 + d;
}
Ok(v)
}
fn parse_class(&mut self) -> Result<Node, RegexError> {
self.pos += 1; let neg = self.eat('^');
let mut items = Vec::new();
loop {
match self.peek() {
None => return Err(RegexError::new("unterminated character class `[`")),
Some(']') => {
self.pos += 1;
break;
}
Some('\\') => {
self.pos += 1;
let Some(e) = self.bump() else {
return Err(RegexError::new("trailing backslash in class"));
};
match e {
'd' => items.push(ClassItem::Shorthand(Shorthand::Digit)),
'D' => items.push(ClassItem::Shorthand(Shorthand::NotDigit)),
'w' => items.push(ClassItem::Shorthand(Shorthand::Word)),
'W' => items.push(ClassItem::Shorthand(Shorthand::NotWord)),
's' => items.push(ClassItem::Shorthand(Shorthand::Space)),
'S' => items.push(ClassItem::Shorthand(Shorthand::NotSpace)),
'p' => items.push(ClassItem::Shorthand(Shorthand::Property(
self.parse_property()?,
false,
))),
'P' => items.push(ClassItem::Shorthand(Shorthand::Property(
self.parse_property()?,
true,
))),
'u' => {
let ch = self.parse_unicode_escape()?;
self.push_class_member(&mut items, ch);
}
'x' => {
let ch = self.parse_hex_escape(2)?;
self.push_class_member(&mut items, ch);
}
other => {
self.push_class_member(&mut items, escape_char(other) as u32);
}
}
}
Some(c) => {
self.pos += 1;
self.push_class_member(&mut items, c as u32);
}
}
}
Ok(Node::Class { neg, items })
}
fn push_class_member(&mut self, items: &mut Vec<ClassItem>, c: u32) {
if self.peek() == Some('-') && self.chars.get(self.pos + 1).is_some_and(|&n| n != ']') {
self.pos += 1; let hi = if self.peek() == Some('\\') {
self.pos += 1;
match self.bump() {
Some('u') => self.parse_unicode_escape().unwrap_or(c),
Some('x') => self.parse_hex_escape(2).unwrap_or(c),
Some(e) => escape_char(e) as u32,
None => '\\' as u32,
}
} else {
self.bump().map_or(c, |ch| ch as u32)
};
items.push(ClassItem::Range(c, hi));
} else {
items.push(ClassItem::Char(c));
}
}
}
fn class_shorthand(s: Shorthand) -> Node {
Node::Class {
neg: false,
items: alloc::vec![ClassItem::Shorthand(s)],
}
}
fn escape_char(c: char) -> char {
match c {
'n' => '\n',
't' => '\t',
'r' => '\r',
'f' => '\u{0C}',
'v' => '\u{0B}',
'0' => '\0',
other => other, }
}