use std::fmt;
use crate::regex::engine::CharClass;
#[derive(Debug, Clone)]
pub enum PatternError {
UnexpectedChar(char, usize),
UnexpectedEnd,
InvalidCharClass(String),
InvalidEscape(char, usize),
UnmatchedParen,
EmptyGroup,
UnsupportedFeature(String),
}
impl fmt::Display for PatternError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PatternError::UnexpectedChar(ch, pos) => {
write!(f, "Unexpected character '{ch}' at position {pos}")
}
PatternError::UnexpectedEnd => write!(f, "Unexpected end of pattern"),
PatternError::InvalidCharClass(msg) => write!(f, "Invalid character class: {msg}"),
PatternError::InvalidEscape(ch, pos) => {
write!(f, "Invalid escape sequence '\\{ch}' at position {pos}")
}
PatternError::UnmatchedParen => write!(f, "Unmatched parenthesis"),
PatternError::EmptyGroup => write!(f, "Empty group"),
PatternError::UnsupportedFeature(msg) => write!(f, "Unsupported feature: {msg}"),
}
}
}
impl std::error::Error for PatternError {}
#[derive(Debug, Clone)]
pub enum PatternNode {
Char(char),
CharClass(CharClass),
Any,
Start,
End,
Concat(Vec<PatternNode>),
Alternate(Vec<PatternNode>),
Star(Box<PatternNode>),
Plus(Box<PatternNode>),
Question(Box<PatternNode>),
Group(Box<PatternNode>),
}
#[derive(Debug, Clone)]
pub struct Pattern {
pub root: PatternNode,
pub source: String,
}
pub struct PatternParser<'p> {
pattern: &'p str,
pos: usize,
chars: Vec<char>,
}
impl<'p> PatternParser<'p> {
pub fn new(pattern: &'p str) -> Self {
Self {
pattern,
pos: 0,
chars: pattern.chars().collect(),
}
}
pub fn parse(mut self) -> Result<Pattern, PatternError> {
let root = self.parse_alternation()?;
if self.pos < self.chars.len() {
return Err(PatternError::UnexpectedChar(self.chars[self.pos], self.pos));
}
Ok(Pattern {
root,
source: self.pattern.to_string(),
})
}
fn current_char(&self) -> Option<char> {
self.chars.get(self.pos).copied()
}
fn peek_char(&self, offset: usize) -> Option<char> {
self.chars.get(self.pos + offset).copied()
}
fn advance(&mut self) -> Option<char> {
if self.pos < self.chars.len() {
let ch = self.chars[self.pos];
self.pos += 1;
Some(ch)
} else {
None
}
}
fn expect(&mut self, expected: char) -> Result<(), PatternError> {
match self.advance() {
Some(ch) if ch == expected => Ok(()),
Some(ch) => Err(PatternError::UnexpectedChar(ch, self.pos - 1)),
None => Err(PatternError::UnexpectedEnd),
}
}
fn parse_alternation(&mut self) -> Result<PatternNode, PatternError> {
let mut alternatives = vec![self.parse_sequence()?];
while self.current_char() == Some('|') {
self.advance(); alternatives.push(self.parse_sequence()?);
}
if alternatives.len() == 1 {
Ok(alternatives.into_iter().next().unwrap())
} else {
Ok(PatternNode::Alternate(alternatives))
}
}
fn parse_sequence(&mut self) -> Result<PatternNode, PatternError> {
let mut factors = Vec::new();
while let Some(ch) = self.current_char() {
if ch == '|' || ch == ')' {
break;
}
factors.push(self.parse_factor()?);
}
if factors.is_empty() {
Ok(PatternNode::Concat(vec![]))
} else if factors.len() == 1 {
Ok(factors.into_iter().next().unwrap())
} else {
Ok(PatternNode::Concat(factors))
}
}
fn parse_factor(&mut self) -> Result<PatternNode, PatternError> {
let atom = self.parse_atom()?;
match self.current_char() {
Some('*') => {
self.advance();
Ok(PatternNode::Star(Box::new(atom)))
}
Some('+') => {
self.advance();
Ok(PatternNode::Plus(Box::new(atom)))
}
Some('?') => {
self.advance();
Ok(PatternNode::Question(Box::new(atom)))
}
_ => Ok(atom),
}
}
fn parse_atom(&mut self) -> Result<PatternNode, PatternError> {
match self.current_char() {
Some('.') => {
self.advance();
Ok(PatternNode::Any)
}
Some('^') => {
self.advance();
Ok(PatternNode::Start)
}
Some('$') => {
self.advance();
Ok(PatternNode::End)
}
Some('(') => self.parse_group(),
Some('[') => self.parse_char_class(),
Some('\\') => self.parse_escape(),
Some(ch) if !is_meta_char(ch) => {
self.advance();
Ok(PatternNode::Char(ch))
}
Some(ch) => Err(PatternError::UnexpectedChar(ch, self.pos)),
None => Err(PatternError::UnexpectedEnd),
}
}
fn parse_group(&mut self) -> Result<PatternNode, PatternError> {
self.expect('(')?;
let inner = self.parse_alternation()?;
self.expect(')')?;
Ok(PatternNode::Group(Box::new(inner)))
}
fn parse_char_class(&mut self) -> Result<PatternNode, PatternError> {
self.expect('[')?;
let mut class = CharClass::new();
let mut negated = false;
if self.current_char() == Some('^') {
self.advance();
negated = true;
}
while let Some(ch) = self.current_char() {
if ch == ']' {
break;
}
self.advance();
if self.current_char() == Some('-') && self.peek_char(1) != Some(']') {
self.advance();
let end_char = match self.advance() {
Some(end) => end,
None => return Err(PatternError::UnexpectedEnd),
};
if ch > end_char {
return Err(PatternError::InvalidCharClass(
format!("Invalid range {ch}-{end_char}: start > end")
));
}
class.add_range(ch, end_char);
} else {
class.add_char(ch);
}
}
self.expect(']')?;
if negated {
class = class.negate();
}
Ok(PatternNode::CharClass(class))
}
fn parse_escape(&mut self) -> Result<PatternNode, PatternError> {
self.expect('\\')?;
match self.advance() {
Some('\\') => Ok(PatternNode::Char('\\')),
Some('.') => Ok(PatternNode::Char('.')),
Some('*') => Ok(PatternNode::Char('*')),
Some('+') => Ok(PatternNode::Char('+')),
Some('?') => Ok(PatternNode::Char('?')),
Some('(') => Ok(PatternNode::Char('(')),
Some(')') => Ok(PatternNode::Char(')')),
Some('[') => Ok(PatternNode::Char('[')),
Some(']') => Ok(PatternNode::Char(']')),
Some('^') => Ok(PatternNode::Char('^')),
Some('$') => Ok(PatternNode::Char('$')),
Some('|') => Ok(PatternNode::Char('|')),
Some('t') => Ok(PatternNode::Char('\t')),
Some('n') => Ok(PatternNode::Char('\n')),
Some('r') => Ok(PatternNode::Char('\r')),
Some('d') => Ok(PatternNode::CharClass(CharClass::builtin(
crate::regex::engine::BuiltinClass::Digit
))),
Some('w') => Ok(PatternNode::CharClass(CharClass::builtin(
crate::regex::engine::BuiltinClass::Word
))),
Some('s') => Ok(PatternNode::CharClass(CharClass::builtin(
crate::regex::engine::BuiltinClass::Space
))),
Some(ch) => Err(PatternError::InvalidEscape(ch, self.pos - 1)),
None => Err(PatternError::UnexpectedEnd),
}
}
}
fn is_meta_char(ch: char) -> bool {
matches!(ch, '.' | '^' | '$' | '*' | '+' | '?' | '(' | ')' | '[' | ']' | '|' | '\\')
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_single_char() {
let pattern = PatternParser::new("a").parse().unwrap();
match pattern.root {
PatternNode::Char('a') => {}
_ => panic!("Expected single character"),
}
}
#[test]
fn test_concatenation() {
let pattern = PatternParser::new("abc").parse().unwrap();
match pattern.root {
PatternNode::Concat(nodes) => {
assert_eq!(nodes.len(), 3);
match (&nodes[0], &nodes[1], &nodes[2]) {
(PatternNode::Char('a'), PatternNode::Char('b'), PatternNode::Char('c')) => {}
_ => panic!("Expected abc concatenation"),
}
}
_ => panic!("Expected concatenation"),
}
}
#[test]
fn test_alternation() {
let pattern = PatternParser::new("a|b").parse().unwrap();
match pattern.root {
PatternNode::Alternate(alts) => {
assert_eq!(alts.len(), 2);
match (&alts[0], &alts[1]) {
(PatternNode::Char('a'), PatternNode::Char('b')) => {}
_ => panic!("Expected a|b alternation"),
}
}
_ => panic!("Expected alternation"),
}
}
#[test]
fn test_quantifiers() {
let star = PatternParser::new("a*").parse().unwrap();
match star.root {
PatternNode::Star(inner) => match inner.as_ref() {
PatternNode::Char('a') => {}
_ => panic!("Expected 'a' in star"),
},
_ => panic!("Expected star"),
}
let plus = PatternParser::new("a+").parse().unwrap();
match plus.root {
PatternNode::Plus(inner) => match inner.as_ref() {
PatternNode::Char('a') => {}
_ => panic!("Expected 'a' in plus"),
},
_ => panic!("Expected plus"),
}
let question = PatternParser::new("a?").parse().unwrap();
match question.root {
PatternNode::Question(inner) => match inner.as_ref() {
PatternNode::Char('a') => {}
_ => panic!("Expected 'a' in question"),
},
_ => panic!("Expected question"),
}
}
#[test]
fn test_character_class() {
let pattern = PatternParser::new("[abc]").parse().unwrap();
match pattern.root {
PatternNode::CharClass(class) => {
assert!(class.matches('a'));
assert!(class.matches('b'));
assert!(class.matches('c'));
assert!(!class.matches('d'));
}
_ => panic!("Expected character class"),
}
}
#[test]
fn test_character_range() {
let pattern = PatternParser::new("[a-z]").parse().unwrap();
match pattern.root {
PatternNode::CharClass(class) => {
assert!(class.matches('a'));
assert!(class.matches('m'));
assert!(class.matches('z'));
assert!(!class.matches('A'));
assert!(!class.matches('0'));
}
_ => panic!("Expected character range"),
}
}
#[test]
fn test_negated_class() {
let pattern = PatternParser::new("[^abc]").parse().unwrap();
match pattern.root {
PatternNode::CharClass(class) => {
assert!(!class.matches('a'));
assert!(!class.matches('b'));
assert!(!class.matches('c'));
assert!(class.matches('d'));
assert!(class.matches('x'));
}
_ => panic!("Expected negated character class"),
}
}
#[test]
fn test_builtin_classes() {
let digit = PatternParser::new(r"\d").parse().unwrap();
match digit.root {
PatternNode::CharClass(class) => {
assert!(class.matches('5'));
assert!(!class.matches('a'));
}
_ => panic!("Expected digit class"),
}
let word = PatternParser::new(r"\w").parse().unwrap();
match word.root {
PatternNode::CharClass(class) => {
assert!(class.matches('a'));
assert!(class.matches('5'));
assert!(class.matches('_'));
assert!(!class.matches('@'));
}
_ => panic!("Expected word class"),
}
}
#[test]
fn test_escapes() {
let backslash = PatternParser::new(r"\\").parse().unwrap();
match backslash.root {
PatternNode::Char('\\') => {}
_ => panic!("Expected literal backslash"),
}
let dot = PatternParser::new(r"\.").parse().unwrap();
match dot.root {
PatternNode::Char('.') => {}
_ => panic!("Expected literal dot"),
}
}
#[test]
fn test_any_char() {
let pattern = PatternParser::new(".").parse().unwrap();
match pattern.root {
PatternNode::Any => {}
_ => panic!("Expected any character"),
}
}
#[test]
fn test_anchors() {
let start = PatternParser::new("^").parse().unwrap();
match start.root {
PatternNode::Start => {}
_ => panic!("Expected start anchor"),
}
let end = PatternParser::new("$").parse().unwrap();
match end.root {
PatternNode::End => {}
_ => panic!("Expected end anchor"),
}
}
#[test]
fn test_groups() {
let pattern = PatternParser::new("(abc)").parse().unwrap();
match pattern.root {
PatternNode::Group(inner) => match inner.as_ref() {
PatternNode::Concat(nodes) => {
assert_eq!(nodes.len(), 3);
}
_ => panic!("Expected concatenation in group"),
},
_ => panic!("Expected group"),
}
}
#[test]
fn test_complex_pattern() {
let pattern = PatternParser::new(r"\d+\.\d*").parse().unwrap();
match pattern.root {
PatternNode::Concat(nodes) => {
assert_eq!(nodes.len(), 3);
}
_ => panic!("Expected concatenation"),
}
}
}