use core::cell::{Cell, RefCell};
use alloc::{
boxed::Box,
string::{String, ToString},
vec,
vec::Vec,
};
use crate::{
error::Error,
hir::{self, Config, Flags, Hir, HirKind},
};
// Error messages produced by the parser. Each constant is handed to
// `Error::new` at the point where the corresponding problem is detected.

// Nesting depth and capture-group bookkeeping.
const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting";
const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups";
const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name";
// Group delimiters, look-around and group names.
const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'";
const ERR_UNCLOSED_GROUP_QUESTION: &str =
"expected closing ')', but got end of pattern";
const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('";
const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported";
const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed";
const ERR_MISSING_GROUP_NAME: &str =
"expected capture group name, but got end of pattern";
const ERR_INVALID_GROUP_NAME: &str = "invalid group name";
const ERR_UNCLOSED_GROUP_NAME: &str =
"expected end of capture group name, but got end of pattern";
const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed";
// Inline flags such as `(?i)` or `(?i-m:...)`.
const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag";
const ERR_FLAG_REPEATED_NEGATION: &str =
"inline flag negation cannot be repeated";
const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed";
const ERR_FLAG_UNEXPECTED_EOF: &str =
"expected ':' or ')' to end inline flags, but got end of pattern";
const ERR_FLAG_DANGLING_NEGATION: &str =
"inline flags cannot end with negation directive";
// Decimal numbers (used by counted repetitions).
const ERR_DECIMAL_NO_DIGITS: &str =
"expected decimal number, but found no digits";
const ERR_DECIMAL_INVALID: &str = "got invalid decimal number";
// Hexadecimal escapes, both braced (`\x{...}`) and fixed length.
const ERR_HEX_BRACE_INVALID_DIGIT: &str =
"expected hexadecimal number in braces, but got non-hex digit";
const ERR_HEX_BRACE_UNEXPECTED_EOF: &str =
"expected hexadecimal number, but saw end of pattern before closing brace";
const ERR_HEX_BRACE_EMPTY: &str =
"expected hexadecimal number in braces, but got no digits";
const ERR_HEX_BRACE_INVALID: &str = "got invalid hexadecimal number in braces";
const ERR_HEX_FIXED_UNEXPECTED_EOF: &str =
"expected fixed length hexadecimal number, but saw end of pattern first";
const ERR_HEX_FIXED_INVALID_DIGIT: &str =
"expected fixed length hexadecimal number, but got non-hex digit";
const ERR_HEX_FIXED_INVALID: &str =
"got invalid fixed length hexadecimal number";
const ERR_HEX_UNEXPECTED_EOF: &str =
"expected hexadecimal number, but saw end of pattern first";
// Other escape sequences and named classes.
const ERR_ESCAPE_UNEXPECTED_EOF: &str =
"saw start of escape sequence, but saw end of pattern before it finished";
const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported";
const ERR_UNICODE_CLASS_UNSUPPORTED: &str =
"Unicode character classes are not supported";
const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence";
const ERR_POSIX_CLASS_UNRECOGNIZED: &str =
"unrecognized POSIX character class";
// Repetition operators (`?`, `*`, `+` and `{m,n}`).
const ERR_UNCOUNTED_REP_SUB_MISSING: &str =
"uncounted repetition operator must be applied to a sub-expression";
const ERR_COUNTED_REP_SUB_MISSING: &str =
"counted repetition operator must be applied to a sub-expression";
const ERR_COUNTED_REP_UNCLOSED: &str =
"found unclosed counted repetition operator";
const ERR_COUNTED_REP_MIN_UNCLOSED: &str =
"found incomplete and unclosed counted repetition operator";
const ERR_COUNTED_REP_COMMA_UNCLOSED: &str =
"found counted repetition operator with a comma that is unclosed";
const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str =
"found counted repetition with min and max that is unclosed";
const ERR_COUNTED_REP_INVALID: &str =
"expected closing brace for counted repetition, but got something else";
const ERR_COUNTED_REP_INVALID_RANGE: &str =
"found counted repetition with a min bigger than its max";
// Bracketed character classes.
const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str =
"non-empty character class has no closing bracket";
const ERR_CLASS_INVALID_RANGE_ITEM: &str =
"character class ranges must start and end with a single character";
const ERR_CLASS_INVALID_ITEM: &str =
"invalid escape sequence in character class";
const ERR_CLASS_UNCLOSED_AFTER_DASH: &str =
"non-empty character class has no closing bracket after dash";
const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str =
"negated character class has no closing bracket";
const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str =
"character class begins with literal ']' but has no closing bracket";
const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class";
const ERR_CLASS_UNCLOSED: &str = "found unclosed character class";
const ERR_CLASS_NEST_UNSUPPORTED: &str =
"nested character classes are not supported";
const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str =
"character class intersection is not supported";
const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str =
"character class difference is not supported";
const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str =
"character class symmetric difference is not supported";
// Special word boundary assertions of the form `\b{...}`.
const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str =
"special word boundary assertion is unclosed or has an invalid character";
const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str =
"special word boundary assertion is unrecognized";
const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str =
"found start of special word boundary or repetition without an end";
/// Parser state for turning a pattern string into an `Hir`.
///
/// Every piece of mutable state lives in a `Cell` or `RefCell`, which is why
/// all parsing methods can take `&self`.
#[derive(Clone, Debug)]
pub(super) struct Parser<'a> {
/// Configuration supplied by the caller; the parser reads `nest_limit` and
/// the initial `flags` from it.
config: Config,
/// The pattern being parsed.
pattern: &'a str,
/// Nesting depth; raised by `increment_depth` and lowered by
/// `decrement_depth`.
depth: Cell<u32>,
/// Byte offset of the current character within `pattern`.
pos: Cell<usize>,
/// The character at `pos`, or `None` once nothing is left to read.
char: Cell<Option<char>>,
/// The most recently assigned capture group index (starts at 0).
capture_index: Cell<u32>,
/// The flags currently in effect; inline flag groups overwrite this.
flags: RefCell<Flags>,
/// Capture group names seen so far, kept ordered so that duplicates can be
/// detected with a binary search.
capture_names: RefCell<Vec<String>>,
}
impl<'a> Parser<'a> {
/// Creates a parser positioned on the first character of `pattern`.
///
/// Depth, position and capture index start at zero, the active flags are
/// seeded from `config.flags`, and no capture names are recorded yet.
pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> {
    let first_char = pattern.chars().next();
    let initial_flags = config.flags;
    Parser {
        config,
        pattern,
        depth: Cell::new(0),
        pos: Cell::new(0),
        char: Cell::new(first_char),
        capture_index: Cell::new(0),
        flags: RefCell::new(initial_flags),
        capture_names: RefCell::new(Vec::new()),
    }
}
/// Returns the complete pattern string this parser was created with.
fn pattern(&self) -> &str {
    self.pattern
}
/// Returns the byte offset of the current character within the pattern.
fn pos(&self) -> usize {
    self.pos.get()
}
/// Enters one more nesting level and returns the depth that was in effect
/// *before* the increment.
///
/// # Errors
///
/// Returns the "too much nesting" error when the depth before the increment
/// is already greater than `config.nest_limit`.
fn increment_depth(&self) -> Result<u32, Error> {
    let previous = self.depth.get();
    if previous > self.config.nest_limit {
        return Err(Error::new(ERR_TOO_MUCH_NESTING));
    }
    // A failing add would mean the counter sat at u32::MAX, which is treated
    // as a broken invariant and panics.
    self.depth.set(previous.checked_add(1).unwrap());
    Ok(previous)
}
/// Leaves one nesting level.
///
/// # Panics
///
/// Panics if the depth is already zero, i.e. on an unbalanced decrement.
fn decrement_depth(&self) {
    let current = self.depth.get();
    self.depth.set(current.checked_sub(1).unwrap());
}
/// Returns the character at the current position.
///
/// # Panics
///
/// Panics when there is no current character (the parser is done).
fn char(&self) -> char {
    let current = self.char.get();
    current.expect("codepoint, but parser is done")
}
/// Returns true once the position has reached the end of the pattern.
fn is_done(&self) -> bool {
    self.pattern.len() == self.pos()
}
/// Returns a copy of the flags that are currently in effect.
fn flags(&self) -> Flags {
    let active = self.flags.borrow();
    *active
}
/// Advances past the current character.
///
/// Returns true when a character is available at the new position, and false
/// when the parser was already done or has just reached the end.
fn bump(&self) -> bool {
    if self.is_done() {
        return false;
    }
    // Step by the UTF-8 width so the position stays on a char boundary.
    let next_pos = self.pos() + self.char().len_utf8();
    self.pos.set(next_pos);
    let next_char = self.pattern()[next_pos..].chars().next();
    self.char.set(next_char);
    next_char.is_some()
}
/// Consumes `prefix` if the unparsed remainder of the pattern starts with it.
///
/// Returns whether the prefix matched; on a mismatch the parser does not
/// move.
fn bump_if(&self, prefix: &str) -> bool {
    let matched = self.pattern()[self.pos()..].starts_with(prefix);
    if matched {
        // One bump per character of the prefix.
        prefix.chars().for_each(|_| {
            self.bump();
        });
    }
    matched
}
/// Advances one character and then skips any ignorable whitespace/comments.
///
/// Returns true only if a character is still available afterwards.
fn bump_and_bump_space(&self) -> bool {
    if self.bump() {
        self.bump_space();
        !self.is_done()
    } else {
        false
    }
}
/// Skips whitespace and `#` comments, but only when the `ignore_whitespace`
/// flag is set; otherwise this does nothing.
///
/// A comment starts at `#` and is consumed up to and including the next
/// `'\n'` (or to the end of the pattern).
fn bump_space(&self) {
    if !self.flags().ignore_whitespace {
        return;
    }
    while !self.is_done() {
        match self.char() {
            ws if ws.is_whitespace() => {
                self.bump();
            }
            '#' => {
                self.bump();
                // Swallow the comment body, line feed included.
                while !self.is_done() {
                    let consumed = self.char();
                    self.bump();
                    if consumed == '\n' {
                        break;
                    }
                }
            }
            _ => break,
        }
    }
}
/// Returns the character immediately after the current one without moving
/// the parser, or `None` if the parser is done or no such character exists.
fn peek(&self) -> Option<char> {
    if self.is_done() {
        return None;
    }
    let after_current = self.pos() + self.char().len_utf8();
    self.pattern()[after_current..].chars().next()
}
/// Like `peek`, but when the `ignore_whitespace` flag is set it looks past
/// whitespace and `#` comments, using the same rules as `bump_space`.
///
/// Returns the first significant character after the current one, or `None`
/// when the parser is done or only whitespace/comments remain. The parser
/// position is never changed.
fn peek_space(&self) -> Option<char> {
    if !self.flags().ignore_whitespace {
        return self.peek();
    }
    if self.is_done() {
        return None;
    }
    let start = self.pos() + self.char().len_utf8();
    let mut in_comment = false;
    for ch in self.pattern()[start..].chars() {
        if in_comment {
            // The comment test must come before the whitespace test: '\n' is
            // itself whitespace, and it is what terminates a comment.
            // Everything else inside a comment is skipped, not reported.
            in_comment = ch != '\n';
        } else if ch == '#' {
            in_comment = true;
        } else if !ch.is_whitespace() {
            return Some(ch);
        }
    }
    // Nothing significant is left, so there is no next character to report.
    None
}
fn next_capture_index(&self) -> Result<u32, Error> {
let current = self.capture_index.get();
let next = current
.checked_add(1)
.ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?;
self.capture_index.set(next);
Ok(next)
}
fn add_capture_name(&self, name: &str) -> Result<(), Error> {
let mut names = self.capture_names.borrow_mut();
match names.binary_search_by(|n| name.cmp(n)) {
Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)),
Err(i) => {
names.insert(i, name.to_string());
Ok(())
}
}
}
/// Returns true if the parser is positioned on a look-around prefix
/// (`?=`, `?!`, `?<=` or `?<!`), consuming that prefix when it is.
fn is_lookaround_prefix(&self) -> bool {
    ["?=", "?!", "?<=", "?<!"]
        .iter()
        .any(|prefix| self.bump_if(prefix))
}
}
impl<'a> Parser<'a> {
/// Parses the whole pattern into an `Hir`.
///
/// The depth counter used while parsing is only touched by `parse_inner`,
/// whereas repetition operators wrap their operand without changing it. The
/// finished tree is therefore checked again with `check_hir_nesting`, which
/// counts every level of the `Hir` itself.
///
/// # Errors
///
/// Returns any parse error, or the "too much nesting" error if the resulting
/// tree is deeper than `config.nest_limit` allows.
pub(super) fn parse(&self) -> Result<Hir, Error> {
    self.parse_inner().and_then(|hir| {
        check_hir_nesting(&hir, self.config.nest_limit)?;
        Ok(hir)
    })
}
/// Parses a sequence of alternations until the end of the pattern or an
/// unconsumed `)` is reached.
///
/// This is the entry point for both the top level and the body of every
/// group. A `)` is left for the caller to consume; at the outermost level
/// (depth 0) it is reported as an unopened group instead.
fn parse_inner(&self) -> Result<Hir, Error> {
    let outer_depth = self.increment_depth()?;
    let mut branches = Vec::new();
    let mut sequence = Vec::new();
    loop {
        self.bump_space();
        if self.is_done() {
            break;
        }
        let ch = self.char();
        if ch == ')' {
            if outer_depth == 0 {
                return Err(Error::new(ERR_UNOPENED_GROUP));
            }
            break;
        }
        match ch {
            '(' => {
                let saved_flags = self.flags();
                if let Some(group) = self.parse_group()? {
                    sequence.push(group);
                    // Flags are restored only for real groups. `None` means
                    // a bare flag directive such as `(?i)`, whose flags must
                    // stay active for the rest of the enclosing group.
                    *self.flags.borrow_mut() = saved_flags;
                }
                if self.char.get() != Some(')') {
                    return Err(Error::new(ERR_UNCLOSED_GROUP));
                }
                self.bump();
            }
            '|' => {
                let finished = core::mem::take(&mut sequence);
                branches.push(Hir::concat(finished));
                self.bump();
            }
            '[' => sequence.push(self.parse_class()?),
            '?' | '*' | '+' => {
                sequence = self.parse_uncounted_repetition(sequence)?;
            }
            '{' => {
                sequence = self.parse_counted_repetition(sequence)?;
            }
            _ => sequence.push(self.parse_primitive()?),
        }
    }
    self.decrement_depth();
    branches.push(Hir::concat(sequence));
    Ok(Hir::alternation(branches))
}
/// Parses a single primitive: an escape sequence, `.`, `^`, `$`, or a
/// literal character. The current character is always consumed.
fn parse_primitive(&self) -> Result<Hir, Error> {
    let current = self.char();
    self.bump();
    if current == '\\' {
        return self.parse_escape();
    }
    Ok(match current {
        '.' => self.hir_dot(),
        '^' => self.hir_anchor_start(),
        '$' => self.hir_anchor_end(),
        literal => self.hir_char(literal),
    })
}
fn parse_escape(&self) -> Result<Hir, Error> {
if self.is_done() {
return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF));
}
let ch = self.char();
match ch {
'0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)),
'p' | 'P' => {
return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED))
}
'x' | 'u' | 'U' => return self.parse_hex(),
'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
return Ok(self.parse_perl_class());
}
_ => {}
}
self.bump();
if hir::is_meta_character(ch) || hir::is_escapable_character(ch) {
return Ok(self.hir_char(ch));
}
let special = |ch| Ok(self.hir_char(ch));
match ch {
'a' => special('\x07'),
'f' => special('\x0C'),
't' => special('\t'),
'n' => special('\n'),
'r' => special('\r'),
'v' => special('\x0B'),
'A' => Ok(Hir::look(hir::Look::Start)),
'z' => Ok(Hir::look(hir::Look::End)),
'b' => {
let mut hir = Hir::look(hir::Look::Word);
if !self.is_done() && self.char() == '{' {
if let Some(special) =
self.maybe_parse_special_word_boundary()?
{
hir = special;
}
}
Ok(hir)
}
'B' => Ok(Hir::look(hir::Look::WordNegate)),
'<' => Ok(Hir::look(hir::Look::WordStart)),
'>' => Ok(Hir::look(hir::Look::WordEnd)),
_ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
}
}
fn maybe_parse_special_word_boundary(&self) -> Result<Option<Hir>, Error> {
assert_eq!(self.char(), '{');
let is_valid_char = |c| match c {
'A'..='Z' | 'a'..='z' | '-' => true,
_ => false,
};
let start = self.pos();
if !self.bump_and_bump_space() {
return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
}
if !is_valid_char(self.char()) {
self.pos.set(start);
self.char.set(Some('{'));
return Ok(None);
}
let mut scratch = String::new();
while !self.is_done() && is_valid_char(self.char()) {
scratch.push(self.char());
self.bump_and_bump_space();
}
if self.is_done() || self.char() != '}' {
return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
}
self.bump();
let kind = match scratch.as_str() {
"start" => hir::Look::WordStart,
"end" => hir::Look::WordEnd,
"start-half" => hir::Look::WordStartHalf,
"end-half" => hir::Look::WordEndHalf,
_ => {
return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
}
};
Ok(Some(Hir::look(kind)))
}
/// Parses a hexadecimal escape; the parser must be positioned on the `x`,
/// `u` or `U` that follows the backslash.
///
/// A following `{` selects the braced form; otherwise a fixed number of
/// digits is read: 2 for `x`, 4 for `u` and 8 for `U`.
fn parse_hex(&self) -> Result<Hir, Error> {
    let digit_len = match self.char() {
        'x' => 2,
        'u' => 4,
        'U' => 8,
        unk => unreachable!(
            "invalid start of fixed length hexadecimal number {unk}"
        ),
    };
    if !self.bump_and_bump_space() {
        return Err(Error::new(ERR_HEX_UNEXPECTED_EOF));
    }
    match self.char() {
        '{' => self.parse_hex_brace(),
        _ => self.parse_hex_digits(digit_len),
    }
}
fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> {
let mut scratch = String::new();
for i in 0..digit_len {
if i > 0 && !self.bump_and_bump_space() {
return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF));
}
if !is_hex(self.char()) {
return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT));
}
scratch.push(self.char());
}
self.bump_and_bump_space();
match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
None => Err(Error::new(ERR_HEX_FIXED_INVALID)),
Some(ch) => Ok(self.hir_char(ch)),
}
}
fn parse_hex_brace(&self) -> Result<Hir, Error> {
let mut scratch = String::new();
while self.bump_and_bump_space() && self.char() != '}' {
if !is_hex(self.char()) {
return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT));
}
scratch.push(self.char());
}
if self.is_done() {
return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF));
}
assert_eq!(self.char(), '}');
self.bump_and_bump_space();
if scratch.is_empty() {
return Err(Error::new(ERR_HEX_BRACE_EMPTY));
}
match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32) {
None => Err(Error::new(ERR_HEX_BRACE_INVALID)),
Some(ch) => Ok(self.hir_char(ch)),
}
}
fn parse_decimal(&self) -> Result<u32, Error> {
let mut scratch = String::new();
while !self.is_done() && self.char().is_whitespace() {
self.bump();
}
while !self.is_done() && '0' <= self.char() && self.char() <= '9' {
scratch.push(self.char());
self.bump_and_bump_space();
}
while !self.is_done() && self.char().is_whitespace() {
self.bump_and_bump_space();
}
let digits = scratch.as_str();
if digits.is_empty() {
return Err(Error::new(ERR_DECIMAL_NO_DIGITS));
}
match u32::from_str_radix(digits, 10).ok() {
Some(n) => Ok(n),
None => Err(Error::new(ERR_DECIMAL_INVALID)),
}
}
/// Applies a `?`, `*` or `+` operator (optionally followed by a lazy `?`) to
/// the last expression in `concat` and returns the updated list.
///
/// The `swap_greed` flag inverts the resulting greediness.
///
/// # Errors
///
/// Fails if `concat` is empty, i.e. there is nothing to repeat.
fn parse_uncounted_repetition(
    &self,
    mut concat: Vec<Hir>,
) -> Result<Vec<Hir>, Error> {
    let Some(operand) = concat.pop() else {
        return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING));
    };
    let (min, max) = match self.char() {
        '?' => (0, Some(1)),
        '*' => (0, None),
        '+' => (1, None),
        unk => unreachable!("unrecognized repetition operator '{unk}'"),
    };
    // A `?` directly after the operator marks the repetition as lazy.
    let lazy = self.bump() && self.char() == '?';
    if lazy {
        self.bump();
    }
    let greedy = if self.flags().swap_greed { lazy } else { !lazy };
    concat.push(Hir::repetition(hir::Repetition {
        min,
        max,
        greedy,
        sub: Box::new(operand),
    }));
    Ok(concat)
}
/// Applies a counted repetition (`{n}`, `{n,}` or `{n,m}`, optionally
/// followed by a lazy `?`) to the last expression in `concat` and returns
/// the updated list. The parser must be positioned on the `{`.
///
/// # Errors
///
/// Fails if there is nothing to repeat, if the operator is malformed or
/// unclosed, or if the minimum is larger than the maximum.
fn parse_counted_repetition(
    &self,
    mut concat: Vec<Hir>,
) -> Result<Vec<Hir>, Error> {
    assert_eq!(self.char(), '{', "expected opening brace");
    let Some(operand) = concat.pop() else {
        return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING));
    };
    if !self.bump_and_bump_space() {
        return Err(Error::new(ERR_COUNTED_REP_UNCLOSED));
    }
    let min = self.parse_decimal()?;
    if self.is_done() {
        return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED));
    }
    // Without a comma the count is exact; `{n,}` has no upper bound.
    let max = if self.char() == ',' {
        if !self.bump_and_bump_space() {
            return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED));
        }
        let upper = if self.char() == '}' {
            None
        } else {
            Some(self.parse_decimal()?)
        };
        if self.is_done() {
            return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED));
        }
        upper
    } else {
        Some(min)
    };
    if self.char() != '}' {
        return Err(Error::new(ERR_COUNTED_REP_INVALID));
    }

    let lazy = self.bump_and_bump_space() && self.char() == '?';
    if lazy {
        self.bump();
    }
    let greedy = if self.flags().swap_greed { lazy } else { !lazy };

    if let Some(upper) = max {
        if min > upper {
            return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE));
        }
    }
    concat.push(Hir::repetition(hir::Repetition {
        min,
        max,
        greedy,
        sub: Box::new(operand),
    }));
    Ok(concat)
}
/// Parses what follows an opening `(`; the parser must be positioned on it.
///
/// Returns `Some(hir)` for a capturing, named or non-capturing group, with
/// the parser left on whatever follows the group body (the caller checks for
/// the `)`). Returns `None` for a bare flag directive such as `(?i)`, after
/// installing the new flags.
///
/// # Errors
///
/// Fails for look-around prefixes, for an empty `(?)`, at an unexpected end
/// of pattern, and for any error from the name, flag or body parsers.
fn parse_group(&self) -> Result<Option<Hir>, Error> {
    assert_eq!(self.char(), '(');
    self.bump_and_bump_space();
    if self.is_lookaround_prefix() {
        return Err(Error::new(ERR_LOOK_UNSUPPORTED));
    }

    // Named capture group: `(?P<name>...)` or `(?<name>...)`.
    if self.bump_if("?P<") || self.bump_if("?<") {
        let index = self.next_capture_index()?;
        let name = self.parse_capture_name()?;
        let name = Some(Box::from(name));
        let sub = Box::new(self.parse_inner()?);
        return Ok(Some(Hir::capture(hir::Capture { index, name, sub })));
    }

    // Plain capture group: no `?` after the parenthesis.
    if !self.bump_if("?") {
        let index = self.next_capture_index()?;
        let sub = Box::new(self.parse_inner()?);
        let cap = hir::Capture { index, name: None, sub };
        return Ok(Some(Hir::capture(cap)));
    }

    // Flags: either a directive `(?flags)` or a group `(?flags:...)`.
    if self.is_done() {
        return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION));
    }
    let flags_start = self.pos();
    let new_flags = self.parse_flags()?;
    *self.flags.borrow_mut() = new_flags;
    let consumed = self.pos() - flags_start;
    if self.char() == ')' {
        if consumed == 0 {
            return Err(Error::new(ERR_EMPTY_FLAGS));
        }
        return Ok(None);
    }
    assert_eq!(':', self.char());
    self.bump();
    self.parse_inner().map(Some)
}
/// Parses a capture group name terminated by `>`, registers it, and returns
/// it as a slice of the pattern. The parser ends up just past the `>`.
///
/// # Errors
///
/// Fails if the pattern ends before or inside the name, if the name contains
/// a character rejected by `is_capture_char`, if it is empty, or if it has
/// already been used.
fn parse_capture_name(&self) -> Result<&str, Error> {
    if self.is_done() {
        return Err(Error::new(ERR_MISSING_GROUP_NAME));
    }
    let start = self.pos();
    while self.char() != '>' {
        let is_first = self.pos() == start;
        if !is_capture_char(self.char(), is_first) {
            return Err(Error::new(ERR_INVALID_GROUP_NAME));
        }
        if !self.bump() {
            break;
        }
    }
    let end = self.pos();
    if self.is_done() {
        return Err(Error::new(ERR_UNCLOSED_GROUP_NAME));
    }
    assert_eq!(self.char(), '>');
    self.bump();
    let name = &self.pattern()[start..end];
    if name.is_empty() {
        return Err(Error::new(ERR_EMPTY_GROUP_NAME));
    }
    self.add_capture_name(name)?;
    Ok(name)
}
/// Parses a run of inline flags up to (but not including) the terminating
/// `:` or `)` and returns the resulting flag set.
///
/// Parsing starts from a copy of the currently active flags; the caller is
/// responsible for installing the result. A single `-` switches the flags
/// that follow it from enabling to disabling.
///
/// # Errors
///
/// Fails on a repeated `-`, an unrecognized or duplicated flag, a trailing
/// `-`, or if the pattern ends before `:` or `)`.
fn parse_flags(&self) -> Result<Flags, Error> {
    let mut flags = self.flags();
    let mut negate = false;
    // Tracks whether the most recent item was `-`, to catch a dangling one.
    let mut last_was_negation = false;
    // One slot per ASCII character, marking flags already seen.
    let mut seen = [false; 128];
    loop {
        let ch = self.char();
        if ch == ':' || ch == ')' {
            break;
        }
        if ch == '-' {
            last_was_negation = true;
            if negate {
                return Err(Error::new(ERR_FLAG_REPEATED_NEGATION));
            }
            negate = true;
        } else {
            last_was_negation = false;
            self.parse_flag(&mut flags, negate)?;
            // `parse_flag` only accepts ASCII letters, so this conversion
            // cannot fail here.
            let slot = &mut seen[usize::from(u8::try_from(ch).unwrap())];
            if *slot {
                return Err(Error::new(ERR_FLAG_DUPLICATE));
            }
            *slot = true;
        }
        if !self.bump() {
            return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF));
        }
    }
    if last_was_negation {
        return Err(Error::new(ERR_FLAG_DANGLING_NEGATION));
    }
    Ok(flags)
}
/// Applies the flag named by the current character to `flags`, enabling it
/// unless `negate` is set. The parser is not advanced.
///
/// `u` is accepted but changes nothing.
///
/// # Errors
///
/// Fails if the current character is not a recognized flag.
fn parse_flag(
    &self,
    flags: &mut Flags,
    negate: bool,
) -> Result<(), Error> {
    let target = match self.char() {
        'i' => &mut flags.case_insensitive,
        'm' => &mut flags.multi_line,
        's' => &mut flags.dot_matches_new_line,
        'U' => &mut flags.swap_greed,
        'R' => &mut flags.crlf,
        'x' => &mut flags.ignore_whitespace,
        'u' => return Ok(()),
        _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)),
    };
    *target = !negate;
    Ok(())
}
fn parse_class(&self) -> Result<Hir, Error> {
assert_eq!(self.char(), '[');
let mut union = vec![];
if !self.bump_and_bump_space() {
return Err(Error::new(ERR_CLASS_UNCLOSED));
}
let negate = if self.char() != '^' {
false
} else {
if !self.bump_and_bump_space() {
return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION));
}
true
};
while self.char() == '-' {
union.push(hir::ClassRange { start: '-', end: '-' });
if !self.bump_and_bump_space() {
return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
}
}
if union.is_empty() && self.char() == ']' {
union.push(hir::ClassRange { start: ']', end: ']' });
if !self.bump_and_bump_space() {
return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING));
}
}
loop {
self.bump_space();
if self.is_done() {
return Err(Error::new(ERR_CLASS_UNCLOSED));
}
match self.char() {
'[' => {
if let Some(class) = self.maybe_parse_posix_class() {
union.extend_from_slice(&class.ranges);
continue;
}
return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED));
}
']' => {
self.bump();
let mut class = hir::Class::new(union);
if self.flags().case_insensitive {
class.ascii_case_fold();
}
if negate {
class.negate();
}
return Ok(Hir::class(class));
}
'&' if self.peek() == Some('&') => {
return Err(Error::new(
ERR_CLASS_INTERSECTION_UNSUPPORTED,
));
}
'-' if self.peek() == Some('-') => {
return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED));
}
'~' if self.peek() == Some('~') => {
return Err(Error::new(
ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED,
));
}
_ => self.parse_class_range(&mut union)?,
}
}
}
/// Parses one class item, or a range `a-b`, and appends it to `union`.
///
/// A `-` after the first item starts a range unless the next significant
/// character is `]` or another `-`; in those cases only the first item is
/// added and the `-` is left for the caller.
///
/// # Errors
///
/// Fails if the class ends prematurely, if a range endpoint is not a single
/// character, or if the range start is greater than its end.
fn parse_class_range(
    &self,
    union: &mut Vec<hir::ClassRange>,
) -> Result<(), Error> {
    let first = self.parse_class_item()?;
    self.bump_space();
    if self.is_done() {
        return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM));
    }
    let is_range = self.char() == '-'
        && !matches!(self.peek_space(), Some(']') | Some('-'));
    if !is_range {
        union.extend(into_class_item_ranges(first)?);
        return Ok(());
    }
    // Step over the `-` and read the upper endpoint.
    if !self.bump_and_bump_space() {
        return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
    }
    let second = self.parse_class_item()?;
    let start = into_class_item_range(first)?;
    let end = into_class_item_range(second)?;
    if start > end {
        return Err(Error::new(ERR_CLASS_INVALID_RANGE));
    }
    union.push(hir::ClassRange { start, end });
    Ok(())
}
/// Parses a single item inside a character class: an escape sequence or a
/// literal character. The current character is always consumed.
///
/// Literals are built with `Hir::char` rather than `hir_char`, so no case
/// folding is applied here.
fn parse_class_item(&self) -> Result<Hir, Error> {
    let item = self.char();
    self.bump();
    match item {
        '\\' => self.parse_escape(),
        literal => Ok(Hir::char(literal)),
    }
}
/// Attempts to parse a POSIX class such as `[:alpha:]` or `[:^alpha:]`; the
/// parser must be positioned on the `[`.
///
/// On success the parser is left just past the closing `:]`. On any failure
/// `None` is returned and the parser is restored to the `[` it started on.
fn maybe_parse_posix_class(&self) -> Option<hir::Class> {
    assert_eq!(self.char(), '[');
    let saved_pos = self.pos();
    let saved_char = self.char.get();

    // Each early `None` below is a failed attempt; the rewind is done once,
    // after the closure returns.
    let attempt = || -> Option<hir::Class> {
        if !self.bump() || self.char() != ':' {
            return None;
        }
        if !self.bump() {
            return None;
        }
        let negated = self.char() == '^';
        if negated && !self.bump() {
            return None;
        }
        let name_start = self.pos();
        while self.char() != ':' && self.bump() {}
        if self.is_done() {
            return None;
        }
        let name = &self.pattern()[name_start..self.pos()];
        if !self.bump_if(":]") {
            return None;
        }
        let mut class = hir::Class::new(posix_class(name).ok()?);
        if negated {
            class.negate();
        }
        Some(class)
    };

    let parsed = attempt();
    if parsed.is_none() {
        self.pos.set(saved_pos);
        self.char.set(saved_char);
    }
    parsed
}
/// Parses a Perl class escape; the parser must be positioned on one of `d`,
/// `s`, `w`, `D`, `S` or `W`, which is consumed.
///
/// The class is built from the POSIX `digit`, `space` or `word` table and is
/// negated for the uppercase letters.
fn parse_perl_class(&self) -> Hir {
    let ch = self.char();
    self.bump();
    let name = match ch {
        'd' | 'D' => "digit",
        's' | 'S' => "space",
        'w' | 'W' => "word",
        unk => unreachable!("invalid Perl class \\{unk}"),
    };
    let mut class = hir::Class::new(posix_class(name).unwrap());
    if ch.is_ascii_uppercase() {
        class.negate();
    }
    Hir::class(class)
}
/// Builds the class matched by `.` under the current flags.
///
/// With `dot_matches_new_line` every character matches. Otherwise `'\n'` is
/// excluded, and with `crlf` set `'\r'` is excluded as well.
fn hir_dot(&self) -> Hir {
    let flags = self.flags();
    let range = |start: char, end: char| hir::ClassRange { start, end };
    if flags.dot_matches_new_line {
        Hir::class(hir::Class::new([range('\x00', '\u{10FFFF}')]))
    } else if flags.crlf {
        Hir::class(hir::Class::new([
            range('\x00', '\x09'),
            range('\x0B', '\x0C'),
            range('\x0E', '\u{10FFFF}'),
        ]))
    } else {
        Hir::class(hir::Class::new([
            range('\x00', '\x09'),
            range('\x0B', '\u{10FFFF}'),
        ]))
    }
}
/// Builds the look-around for `^` under the current flags: `Start` normally,
/// `StartLF` in multi-line mode, and `StartCRLF` in multi-line CRLF mode.
fn hir_anchor_start(&self) -> Hir {
    let flags = self.flags();
    Hir::look(match (flags.multi_line, flags.crlf) {
        (false, _) => hir::Look::Start,
        (true, false) => hir::Look::StartLF,
        (true, true) => hir::Look::StartCRLF,
    })
}
/// Builds the look-around for `$` under the current flags: `End` normally,
/// `EndLF` in multi-line mode, and `EndCRLF` in multi-line CRLF mode.
fn hir_anchor_end(&self) -> Hir {
    let flags = self.flags();
    Hir::look(match (flags.multi_line, flags.crlf) {
        (false, _) => hir::Look::End,
        (true, false) => hir::Look::EndLF,
        (true, true) => hir::Look::EndCRLF,
    })
}
/// Builds the `Hir` for a literal character under the current flags.
///
/// In case-insensitive mode, a character with an ASCII case-folded
/// counterpart becomes a two-element class; anything else stays a literal.
fn hir_char(&self, ch: char) -> Hir {
    if !self.flags().case_insensitive {
        return Hir::char(ch);
    }
    let this = hir::ClassRange { start: ch, end: ch };
    match this.ascii_case_fold() {
        Some(folded) => Hir::class(hir::Class::new([this, folded])),
        None => Hir::char(ch),
    }
}
}
fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> {
fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> {
if depth > limit {
return Err(Error::new(ERR_TOO_MUCH_NESTING));
}
let Some(next_depth) = depth.checked_add(1) else {
return Err(Error::new(ERR_TOO_MUCH_NESTING));
};
match *hir.kind() {
HirKind::Empty
| HirKind::Char(_)
| HirKind::Class(_)
| HirKind::Look(_) => Ok(()),
HirKind::Repetition(hir::Repetition { ref sub, .. }) => {
recurse(sub, limit, next_depth)
}
HirKind::Capture(hir::Capture { ref sub, .. }) => {
recurse(sub, limit, next_depth)
}
HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
for sub in subs.iter() {
recurse(sub, limit, next_depth)?;
}
Ok(())
}
}
}
recurse(hir, limit, 0)
}
fn into_class_item_range(hir: Hir) -> Result<char, Error> {
match hir.kind {
HirKind::Char(ch) => Ok(ch),
_ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)),
}
}
/// Converts a stand-alone class item into the ranges it contributes.
///
/// A literal yields a single one-character range; a class yields its own
/// ranges.
///
/// # Errors
///
/// Fails if `hir` is neither a literal character nor a class.
fn into_class_item_ranges(
    mut hir: Hir,
) -> Result<Vec<hir::ClassRange>, Error> {
    // Take the kind out of the node, leaving an empty kind behind.
    let kind = core::mem::replace(&mut hir.kind, HirKind::Empty);
    match kind {
        HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]),
        HirKind::Class(hir::Class { ranges }) => Ok(ranges),
        _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)),
    }
}
/// Returns the ASCII ranges that make up the POSIX class called `kind`
/// (for example `"alpha"` or `"xdigit"`).
///
/// # Errors
///
/// Fails if `kind` is not one of the names listed below.
fn posix_class(
    kind: &str,
) -> Result<impl Iterator<Item = hir::ClassRange>, Error> {
    let table: &'static [(u8, u8)] = match kind {
        "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
        "alpha" => &[(b'A', b'Z'), (b'a', b'z')],
        "ascii" => &[(b'\x00', b'\x7F')],
        "blank" => &[(b'\t', b'\t'), (b' ', b' ')],
        "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
        "digit" => &[(b'0', b'9')],
        "graph" => &[(b'!', b'~')],
        "lower" => &[(b'a', b'z')],
        "print" => &[(b' ', b'~')],
        "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
        "space" => &[
            (b'\t', b'\t'),
            (b'\n', b'\n'),
            (b'\x0B', b'\x0B'),
            (b'\x0C', b'\x0C'),
            (b'\r', b'\r'),
            (b' ', b' '),
        ],
        "upper" => &[(b'A', b'Z')],
        "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
        "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
        _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)),
    };
    let to_range = |&(lo, hi): &(u8, u8)| hir::ClassRange {
        start: char::from(lo),
        end: char::from(hi),
    };
    Ok(table.iter().map(to_range))
}
/// Returns true if `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`).
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
/// Returns true if `c` may appear in a capture group name.
///
/// The first character must be `_` or alphabetic; later characters may also
/// be `.`, `[`, `]` or any alphanumeric character.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' => true,
        '.' | '[' | ']' => !first,
        _ if first => c.is_alphabetic(),
        _ => c.is_alphanumeric(),
    }
}
#[cfg(test)]
mod tests {
use super::*;
fn p(pattern: &str) -> Hir {
Parser::new(Config::default(), pattern).parse_inner().unwrap()
}
fn perr(pattern: &str) -> String {
Parser::new(Config::default(), pattern)
.parse_inner()
.unwrap_err()
.to_string()
}
fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir {
Hir::class(hir::Class::new(
it.into_iter().map(|(start, end)| hir::ClassRange { start, end }),
))
}
fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir {
Hir::class(hir::Class::new(
it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }),
))
}
fn posix(name: &str) -> Hir {
Hir::class(hir::Class::new(posix_class(name).unwrap()))
}
fn cap(index: u32, sub: Hir) -> Hir {
Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) })
}
fn named_cap(index: u32, name: &str, sub: Hir) -> Hir {
Hir::capture(hir::Capture {
index,
name: Some(Box::from(name)),
sub: Box::new(sub),
})
}
#[test]
fn ok_literal() {
assert_eq!(p("a"), Hir::char('a'));
assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')]));
assert_eq!(p("💩"), Hir::char('💩'));
}
#[test]
fn ok_meta_escapes() {
assert_eq!(p(r"\*"), Hir::char('*'));
assert_eq!(p(r"\+"), Hir::char('+'));
assert_eq!(p(r"\?"), Hir::char('?'));
assert_eq!(p(r"\|"), Hir::char('|'));
assert_eq!(p(r"\("), Hir::char('('));
assert_eq!(p(r"\)"), Hir::char(')'));
assert_eq!(p(r"\^"), Hir::char('^'));
assert_eq!(p(r"\$"), Hir::char('$'));
assert_eq!(p(r"\["), Hir::char('['));
assert_eq!(p(r"\]"), Hir::char(']'));
}
#[test]
fn ok_special_escapes() {
assert_eq!(p(r"\a"), Hir::char('\x07'));
assert_eq!(p(r"\f"), Hir::char('\x0C'));
assert_eq!(p(r"\t"), Hir::char('\t'));
assert_eq!(p(r"\n"), Hir::char('\n'));
assert_eq!(p(r"\r"), Hir::char('\r'));
assert_eq!(p(r"\v"), Hir::char('\x0B'));
assert_eq!(p(r"\A"), Hir::look(hir::Look::Start));
assert_eq!(p(r"\z"), Hir::look(hir::Look::End));
assert_eq!(p(r"\b"), Hir::look(hir::Look::Word));
assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate));
}
#[test]
fn ok_hex() {
assert_eq!(p(r"\x41"), Hir::char('A'));
assert_eq!(p(r"\u2603"), Hir::char('☃'));
assert_eq!(p(r"\U0001F4A9"), Hir::char('💩'));
assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩'));
assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩'));
assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩'));
}
#[test]
fn ok_perl() {
assert_eq!(p(r"\d"), posix("digit"));
assert_eq!(p(r"\s"), posix("space"));
assert_eq!(p(r"\w"), posix("word"));
let negated = |name| {
let mut class = hir::Class::new(posix_class(name).unwrap());
class.negate();
Hir::class(class)
};
assert_eq!(p(r"\D"), negated("digit"));
assert_eq!(p(r"\S"), negated("space"));
assert_eq!(p(r"\W"), negated("word"));
}
#[test]
fn ok_flags_and_primitives() {
assert_eq!(p(r"a"), Hir::char('a'));
assert_eq!(p(r"(?i:a)"), singles(['A', 'a']));
assert_eq!(p(r"^"), Hir::look(hir::Look::Start));
assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF));
assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF));
assert_eq!(p(r"$"), Hir::look(hir::Look::End));
assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF));
assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF));
assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')]));
assert_eq!(
p(r"(?R:.)"),
class([
('\x00', '\x09'),
('\x0B', '\x0C'),
('\x0E', '\u{10FFFF}'),
])
);
assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')]));
assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')]));
}
#[test]
fn ok_alternate() {
assert_eq!(
p(r"a|b"),
Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
);
assert_eq!(
p(r"(?:a|b)"),
Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
);
assert_eq!(
p(r"(a|b)"),
cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')]))
);
assert_eq!(
p(r"(?<foo>a|b)"),
named_cap(
1,
"foo",
Hir::alternation(vec![Hir::char('a'), Hir::char('b')])
)
);
assert_eq!(
p(r"a|b|c"),
Hir::alternation(vec![
Hir::char('a'),
Hir::char('b'),
Hir::char('c')
])
);
assert_eq!(
p(r"ax|by|cz"),
Hir::alternation(vec![
Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
Hir::concat(vec![Hir::char('c'), Hir::char('z')]),
])
);
assert_eq!(
p(r"(ax|(by|(cz)))"),
cap(
1,
Hir::alternation(vec![
Hir::concat(vec![Hir::char('a'), Hir::char('x')]),
cap(
2,
Hir::alternation(vec![
Hir::concat(vec![Hir::char('b'), Hir::char('y')]),
cap(
3,
Hir::concat(vec![
Hir::char('c'),
Hir::char('z')
])
),
])
),
])
)
);
assert_eq!(
p(r"|"),
Hir::alternation(vec![Hir::empty(), Hir::empty()])
);
assert_eq!(
p(r"||"),
Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()])
);
assert_eq!(
p(r"a|"),
Hir::alternation(vec![Hir::char('a'), Hir::empty()])
);
assert_eq!(
p(r"|a"),
Hir::alternation(vec![Hir::empty(), Hir::char('a')])
);
assert_eq!(
p(r"(|)"),
cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()]))
);
assert_eq!(
p(r"(a|)"),
cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()]))
);
assert_eq!(
p(r"(|a)"),
cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')]))
);
}
#[test]
fn ok_flag_group() {
assert_eq!(
p("a(?i:b)"),
Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
);
}
#[test]
fn ok_flag_directive() {
assert_eq!(p("(?i)a"), singles(['A', 'a']));
assert_eq!(p("a(?i)"), Hir::char('a'));
assert_eq!(
p("a(?i)b"),
Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])])
);
assert_eq!(
p("a(?i)a(?-i)a"),
Hir::concat(vec![
Hir::char('a'),
singles(['A', 'a']),
Hir::char('a'),
])
);
assert_eq!(
p("a(?:(?i)a)a"),
Hir::concat(vec![
Hir::char('a'),
singles(['A', 'a']),
Hir::char('a'),
])
);
assert_eq!(
p("a((?i)a)a"),
Hir::concat(vec![
Hir::char('a'),
cap(1, singles(['A', 'a'])),
Hir::char('a'),
])
);
}
#[test]
fn ok_uncounted_repetition() {
    // Builds the repetition node that every assertion below expects.
    let rep = |min, max, greedy, sub| {
        Hir::repetition(hir::Repetition {
            min,
            max,
            greedy,
            sub: Box::new(sub),
        })
    };
    let a = || Hir::char('a');
    let b = || Hir::char('b');
    let ab = || Hir::concat(vec![Hir::char('a'), Hir::char('b')]);

    // Greedy forms of `?`, `*` and `+`.
    assert_eq!(p(r"a?"), rep(0, Some(1), true, a()));
    assert_eq!(p(r"a*"), rep(0, None, true, a()));
    assert_eq!(p(r"a+"), rep(1, None, true, a()));

    // A trailing `?` flips each operator to non-greedy.
    assert_eq!(p(r"a??"), rep(0, Some(1), false, a()));
    assert_eq!(p(r"a*?"), rep(0, None, false, a()));
    assert_eq!(p(r"a+?"), rep(1, None, false, a()));

    // The operator applies to the single expression right before it.
    assert_eq!(
        p(r"a?b"),
        Hir::concat(vec![rep(0, Some(1), true, a()), b()]),
    );
    assert_eq!(
        p(r"ab?"),
        Hir::concat(vec![a(), rep(0, Some(1), true, b())]),
    );

    // Groups are repeated as a whole; a capturing group keeps its capture
    // node inside the repetition.
    assert_eq!(p(r"(?:ab)?"), rep(0, Some(1), true, ab()));
    assert_eq!(p(r"(ab)?"), rep(0, Some(1), true, cap(1, ab())));

    // An empty first alternate does not disturb the repetition after `|`.
    assert_eq!(
        p(r"|a?"),
        Hir::alternation(vec![Hir::empty(), rep(0, Some(1), true, a())]),
    );
}
#[test]
fn ok_counted_repetition() {
    // Builds the repetition node that every assertion below expects.
    let rep = |min, max, greedy, sub| {
        Hir::repetition(hir::Repetition {
            min,
            max,
            greedy,
            sub: Box::new(sub),
        })
    };
    let a = || Hir::char('a');

    // `{n}` is an exact count; a trailing `?` makes it non-greedy.
    assert_eq!(p(r"a{5}"), rep(5, Some(5), true, a()));
    assert_eq!(p(r"a{5}?"), rep(5, Some(5), false, a()));

    // `{n,}` has no upper bound; `{n,m}` has both bounds.
    assert_eq!(p(r"a{5,}"), rep(5, None, true, a()));
    assert_eq!(p(r"a{5,9}"), rep(5, Some(9), true, a()));

    // Only the expression directly before the braces is repeated.
    assert_eq!(
        p(r"ab{5}c"),
        Hir::concat(vec![
            a(),
            rep(5, Some(5), true, Hir::char('b')),
            Hir::char('c'),
        ]),
    );

    // Spaces inside the braces are accepted.
    assert_eq!(p(r"a{ 5 }"), rep(5, Some(5), true, a()));
    assert_eq!(p(r"a{ 5 , 9 }"), rep(5, Some(9), true, a()));
}
#[test]
fn ok_group_unnamed() {
    // A parenthesized single literal is expected as capture group 1.
    let single = p("(a)");
    assert_eq!(single, cap(1, Hir::char('a')));

    // With two literals the group wraps their concatenation.
    let pair = p("(ab)");
    let ab = Hir::concat(vec![Hir::char('a'), Hir::char('b')]);
    assert_eq!(pair, cap(1, ab));
}
#[test]
fn ok_group_named() {
    // Both spellings of a named group, `(?P<name>...)` and `(?<name>...)`,
    // are expected to give the same named capture.
    for &pattern in &["(?P<foo>a)", "(?<foo>a)"] {
        assert_eq!(
            p(pattern),
            named_cap(1, "foo", Hir::char('a')),
            "pattern: {:?}",
            pattern
        );
    }
    for &pattern in &["(?P<foo>ab)", "(?<foo>ab)"] {
        assert_eq!(
            p(pattern),
            named_cap(
                1,
                "foo",
                Hir::concat(vec![Hir::char('a'), Hir::char('b')])
            ),
            "pattern: {:?}",
            pattern
        );
    }

    // Accepted names, each paired with the pattern that declares it. The
    // names cover `_`, `.`, `[`/`]` and non-ASCII characters.
    let cases: &[(&str, &str)] = &[
        (r"(?<a>z)", "a"),
        (r"(?P<a>z)", "a"),
        (r"(?<a_1>z)", "a_1"),
        (r"(?P<a_1>z)", "a_1"),
        (r"(?<a.1>z)", "a.1"),
        (r"(?P<a.1>z)", "a.1"),
        (r"(?<a[1]>z)", "a[1]"),
        (r"(?P<a[1]>z)", "a[1]"),
        (r"(?<a¾>z)", "a¾"),
        (r"(?P<a¾>z)", "a¾"),
        (r"(?<名字>z)", "名字"),
        (r"(?P<名字>z)", "名字"),
    ];
    for &(pattern, name) in cases {
        assert_eq!(
            p(pattern),
            named_cap(1, name, Hir::char('z')),
            "pattern: {:?}",
            pattern
        );
    }
}
#[test]
fn ok_class() {
    // Each entry pairs a bracketed-class pattern with the HIR it is expected
    // to parse into. The entries are checked in order by the loop below.
    let cases = vec![
        // Literals, escapes and `-` in positions where it is not a range.
        (r"[a]", singles(['a'])),
        (r"[a\]]", singles(['a', ']'])),
        (r"[a\-z]", singles(['a', '-', 'z'])),
        (r"[ab]", class([('a', 'b')])),
        (r"[a-]", singles(['a', '-'])),
        (r"[-a]", singles(['a', '-'])),
        (r"[--a]", singles(['a', '-'])),
        (r"[---a]", singles(['a', '-'])),
        // Named and Perl-style classes inside brackets.
        (r"[[:alnum:]]", posix("alnum")),
        (r"[\w]", posix("word")),
        (r"[a\wz]", posix("word")),
        (r"[\s\S]", class([('\x00', '\u{10FFFF}')])),
        (r"[^\s\S]", Hir::fail()),
        // Ranges, including a non-ASCII one.
        (r"[a-cx-z]", class([('a', 'c'), ('x', 'z')])),
        (r"[☃-⛄]", class([('☃', '⛄')])),
        // `]` directly after the opening bracket is a literal.
        (r"[]]", singles([']'])),
        (r"[]a]", singles([']', 'a'])),
        (r"[]\[]", singles(['[', ']'])),
        (r"[\[]", singles(['['])),
        // Under `(?i)` only the ASCII counterpart is added; `β` stays alone.
        (r"(?i)[a]", singles(['A', 'a'])),
        (r"(?i)[A]", singles(['A', 'a'])),
        (r"(?i)[k]", singles(['K', 'k'])),
        (r"(?i)[s]", singles(['S', 's'])),
        (r"(?i)[β]", singles(['β'])),
        // Negation.
        (r"[^^]", class([('\x00', ']'), ('_', '\u{10FFFF}')])),
        (
            r"[^-a]",
            class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')]),
        ),
        // Here the class closes at the first `]`; `a]` follows as literals.
        (
            r"[-]a]",
            Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')]),
        ),
    ];
    for (pattern, expected) in cases {
        assert_eq!(p(pattern), expected, "pattern: {:?}", pattern);
    }
}
#[test]
fn ok_verbatim() {
    let a = || Hir::char('a');
    let b = || Hir::char('b');
    let space = || Hir::char(' ');

    // A space between a counted repetition and its `?` suffix still yields
    // the non-greedy form.
    let lazy_a = Hir::repetition(hir::Repetition {
        min: 5,
        max: Some(9),
        greedy: false,
        sub: Box::new(a()),
    });
    assert_eq!(p(r"(?x)a{5,9} ?"), lazy_a);

    // Spaces inside a bracketed class do not become members of the class,
    // including around `^`, `-` and a leading `]`.
    assert_eq!(p(r"(?x)[ a]"), singles(['a']));
    assert_eq!(
        p(r"(?x)[ ^ a]"),
        class([('\x00', '`'), ('b', '\u{10FFFF}')])
    );
    assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-']));
    assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a']));

    // Spaces between literals are dropped while `x` is on and kept as
    // literal spaces while it is off.
    assert_eq!(p(r"(?x)a b"), Hir::concat(vec![a(), b()]));
    assert_eq!(
        p(r"(?x)a b(?-x)a b"),
        Hir::concat(vec![a(), b(), a(), space(), b()])
    );
    assert_eq!(
        p(r"a (?x:a )a "),
        Hir::concat(vec![a(), space(), a(), a(), space()])
    );

    // Spaces are also accepted around a group's opening syntax.
    assert_eq!(p(r"(?x)( ?P<foo> a )"), named_cap(1, "foo", a()),);
    assert_eq!(p(r"(?x)( a )"), cap(1, a()));
    assert_eq!(p(r"(?x)( ?: a )"), a());

    // ... and inside a braced hex escape; an escaped space is a literal.
    assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53'));
    assert_eq!(p(r"(?x)\ "), space());
}
#[test]
fn ok_comments() {
    // The pattern is spelled one source line per piece. With `x` enabled,
    // everything from `#` to the end of a line is expected to be dropped,
    // leaving only the literals of `foo` and `bar`.
    let pat = concat!(
        "(?x)\n",
        "# This is comment 1.\n",
        "foo # This is comment 2.\n",
        "# This is comment 3.\n",
        "bar\n",
        "# This is comment 4.",
    );
    let got = p(pat);
    let want = Hir::concat(vec![
        Hir::char('f'),
        Hir::char('o'),
        Hir::char('o'),
        Hir::char('b'),
        Hir::char('a'),
        Hir::char('r'),
    ]);
    assert_eq!(got, want);
}
#[test]
fn err_standard() {
    // Each entry is `(expected error message, pattern)`. The loop at the
    // bottom checks that `perr(pattern)` equals the expected message.
    let cases: &[(&str, &str)] = &[
        // Nesting and groups.
        (
            ERR_TOO_MUCH_NESTING,
            "(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))",
        ),
        (ERR_DUPLICATE_CAPTURE_NAME, r"(?P<a>y)(?P<a>z)"),
        (ERR_UNCLOSED_GROUP, "("),
        (ERR_UNCLOSED_GROUP_QUESTION, "(?"),
        (ERR_UNOPENED_GROUP, ")"),
        (ERR_LOOK_UNSUPPORTED, r"(?=a)"),
        (ERR_LOOK_UNSUPPORTED, r"(?!a)"),
        (ERR_LOOK_UNSUPPORTED, r"(?<=a)"),
        (ERR_LOOK_UNSUPPORTED, r"(?<!a)"),
        (ERR_EMPTY_FLAGS, r"(?)"),
        // Capture group names.
        (ERR_MISSING_GROUP_NAME, r"(?P<"),
        (ERR_MISSING_GROUP_NAME, r"(?<"),
        (ERR_INVALID_GROUP_NAME, r"(?P<1abc>z)"),
        (ERR_INVALID_GROUP_NAME, r"(?<1abc>z)"),
        (ERR_INVALID_GROUP_NAME, r"(?<¾>z)"),
        (ERR_INVALID_GROUP_NAME, r"(?<¾a>z)"),
        (ERR_INVALID_GROUP_NAME, r"(?<☃>z)"),
        (ERR_INVALID_GROUP_NAME, r"(?<a☃>z)"),
        (ERR_UNCLOSED_GROUP_NAME, r"(?P<foo"),
        (ERR_UNCLOSED_GROUP_NAME, r"(?<foo"),
        (ERR_EMPTY_GROUP_NAME, r"(?P<>z)"),
        (ERR_EMPTY_GROUP_NAME, r"(?<>z)"),
        // Inline flags.
        (ERR_FLAG_UNRECOGNIZED, r"(?z:foo)"),
        (ERR_FLAG_REPEATED_NEGATION, r"(?s-i-R)"),
        (ERR_FLAG_DUPLICATE, r"(?isi)"),
        (ERR_FLAG_DUPLICATE, r"(?is-i)"),
        (ERR_FLAG_UNEXPECTED_EOF, r"(?is"),
        (ERR_FLAG_DANGLING_NEGATION, r"(?is-:foo)"),
        // Hexadecimal escapes.
        (ERR_HEX_BRACE_INVALID_DIGIT, r"\x{Z}"),
        (ERR_HEX_BRACE_UNEXPECTED_EOF, r"\x{"),
        (ERR_HEX_BRACE_UNEXPECTED_EOF, r"\x{A"),
        (ERR_HEX_BRACE_EMPTY, r"\x{}"),
        (ERR_HEX_BRACE_INVALID, r"\x{FFFFFFFFFFFFFFFFF}"),
        (ERR_HEX_FIXED_UNEXPECTED_EOF, r"\xA"),
        (ERR_HEX_FIXED_INVALID_DIGIT, r"\xZ"),
        (ERR_HEX_FIXED_INVALID_DIGIT, r"\xZA"),
        (ERR_HEX_FIXED_INVALID_DIGIT, r"\xAZ"),
        (ERR_HEX_FIXED_INVALID, r"\uD800"),
        (ERR_HEX_FIXED_INVALID, r"\UFFFFFFFF"),
        (ERR_HEX_UNEXPECTED_EOF, r"\x"),
        // Other escapes.
        (ERR_ESCAPE_UNEXPECTED_EOF, r"\"),
        (ERR_BACKREF_UNSUPPORTED, r"\0"),
        (ERR_BACKREF_UNSUPPORTED, r"\1"),
        (ERR_BACKREF_UNSUPPORTED, r"\8"),
        (ERR_UNICODE_CLASS_UNSUPPORTED, r"\pL"),
        (ERR_UNICODE_CLASS_UNSUPPORTED, r"\p{L}"),
        (ERR_ESCAPE_UNRECOGNIZED, r"\i"),
        // Repetition operators with nothing to repeat.
        (ERR_UNCOUNTED_REP_SUB_MISSING, r"?"),
        (ERR_UNCOUNTED_REP_SUB_MISSING, r"*"),
        (ERR_UNCOUNTED_REP_SUB_MISSING, r"+"),
        (ERR_UNCOUNTED_REP_SUB_MISSING, r"(+)"),
        (ERR_UNCOUNTED_REP_SUB_MISSING, r"|?"),
        (ERR_UNCOUNTED_REP_SUB_MISSING, r"(?i)?"),
        (ERR_COUNTED_REP_SUB_MISSING, r"{5}"),
        (ERR_COUNTED_REP_SUB_MISSING, r"({5})"),
        (ERR_COUNTED_REP_SUB_MISSING, r"(?i){5}"),
        // Malformed counted repetitions.
        (ERR_COUNTED_REP_UNCLOSED, r"a{"),
        (ERR_COUNTED_REP_MIN_UNCLOSED, r"a{5"),
        (ERR_COUNTED_REP_COMMA_UNCLOSED, r"a{5,"),
        (ERR_COUNTED_REP_MIN_MAX_UNCLOSED, r"a{5,6"),
        (ERR_COUNTED_REP_INVALID, r"a{5,6Z"),
        (ERR_COUNTED_REP_INVALID_RANGE, r"a{6,5}"),
        (ERR_DECIMAL_NO_DIGITS, r"a{}"),
        (ERR_DECIMAL_NO_DIGITS, r"a{]}"),
        (ERR_DECIMAL_INVALID, r"a{999999999999999}"),
        // Bracketed classes.
        (ERR_CLASS_UNCLOSED_AFTER_ITEM, r"[a"),
        (ERR_CLASS_INVALID_RANGE_ITEM, r"[\w-a]"),
        (ERR_CLASS_INVALID_RANGE_ITEM, r"[a-\w]"),
        (ERR_CLASS_INVALID_ITEM, r"[\b]"),
        (ERR_CLASS_UNCLOSED_AFTER_DASH, r"[a-"),
        (ERR_CLASS_UNCLOSED_AFTER_NEGATION, r"[^"),
        (ERR_CLASS_UNCLOSED_AFTER_CLOSING, r"[]"),
        (ERR_CLASS_INVALID_RANGE, r"[z-a]"),
        (ERR_CLASS_UNCLOSED, r"["),
        (ERR_CLASS_UNCLOSED, r"[a-z"),
        (ERR_CLASS_NEST_UNSUPPORTED, r"[a-z[A-Z]]"),
        (ERR_CLASS_NEST_UNSUPPORTED, r"[[:alnum]]"),
        (ERR_CLASS_INTERSECTION_UNSUPPORTED, r"[a&&b]"),
        (ERR_CLASS_DIFFERENCE_UNSUPPORTED, r"[a--b]"),
        (ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, r"[a~~b]"),
        // Braced word-boundary syntax.
        (ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, r"\b{foo"),
        (ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, r"\b{foo!}"),
        (ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, r"\b{foo}"),
        (ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, r"\b{"),
        (ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, r"(?x)\b{ "),
    ];
    for &(expected, pattern) in cases {
        assert_eq!(expected, perr(pattern), "pattern: {:?}", pattern);
    }
}
#[test]
fn err_verbatim() {
    // `(expected error message, pattern)` pairs for classes left open while
    // the `x` flag is on. In the first pattern the reported error is the
    // "after dash" one even though a `]` is present, presumably because `#`
    // starts a comment that swallows it.
    let cases: &[(&str, &str)] = &[
        (ERR_CLASS_UNCLOSED_AFTER_DASH, r"(?x)[-#]"),
        (ERR_CLASS_UNCLOSED_AFTER_ITEM, r"(?x)[a "),
        (ERR_CLASS_UNCLOSED_AFTER_DASH, r"(?x)[a- "),
        (ERR_CLASS_UNCLOSED, r"(?x)[ "),
    ];
    for &(expected, pattern) in cases {
        assert_eq!(expected, perr(pattern), "pattern: {:?}", pattern);
    }
}
#[test]
fn regression_454_nest_too_big() {
    // Regression check (numbered 454 in the test name): a long pattern built
    // from many sibling alternations with only shallow nesting. The parse
    // result is discarded, so the test only requires that `p` completes
    // without panicking, i.e. that the pattern is not rejected (going by the
    // test name, for too much nesting).
    p(r#"
2(?:
[45]\d{3}|
7(?:
1[0-267]|
2[0-289]|
3[0-29]|
4[01]|
5[1-3]|
6[013]|
7[0178]|
91
)|
8(?:
0[125]|
[139][1-6]|
2[0157-9]|
41|
6[1-35]|
7[1-5]|
8[1-8]|
90
)|
9(?:
0[0-2]|
1[0-4]|
2[568]|
3[3-6]|
5[5-7]|
6[0167]|
7[15]|
8[0146-9]
)
)\d{4}
"#);
}
#[test]
fn regression_455_trailing_dash_ignore_whitespace() {
    // Classes ending in `-` followed by whitespace (and, in one case, a
    // comment) before the closing `]`, all under the `x` flag. Each is handed
    // to `p` and the result discarded, so these only need to parse without
    // panicking.
    let closed: &[&str] = &[
        "(?x)[ / - ]",
        "(?x)[ a - ]",
        "(?x)[\na\n- ]\n",
        "(?x)[\na # wat\n- ]\n",
    ];
    for &pattern in closed {
        p(pattern);
    }

    // The same shapes with the closing `]` missing. Each is handed to `perr`
    // and the message discarded; presumably `perr` panics if the pattern
    // parses successfully.
    let unclosed: &[&str] = &[
        "(?x)[ / -",
        "(?x)[ / - ",
        "(?x)[\n/ -\n",
        "(?x)[\n/ - # wat\n",
    ];
    for &pattern in unclosed {
        perr(pattern);
    }
}
#[test]
fn regression_capture_indices() {
    let got = p(r"(a|ab|c|bcd){4,10}(d*)");

    // Capture 1: the four-way alternation, wrapped in the `{4,10}`
    // repetition.
    let alternatives = Hir::alternation(vec![
        Hir::char('a'),
        Hir::concat(vec![Hir::char('a'), Hir::char('b')]),
        Hir::char('c'),
        Hir::concat(vec![Hir::char('b'), Hir::char('c'), Hir::char('d')]),
    ]);
    let repeated_first = Hir::repetition(hir::Repetition {
        min: 4,
        max: Some(10),
        greedy: true,
        sub: Box::new(cap(1, alternatives)),
    });

    // Capture 2: `d*`. The group after the repetition is expected to get
    // index 2.
    let second = cap(
        2,
        Hir::repetition(hir::Repetition {
            min: 0,
            max: None,
            greedy: true,
            sub: Box::new(Hir::char('d')),
        }),
    );

    assert_eq!(got, Hir::concat(vec![repeated_first, second]));
}
}