use regex_syntax::hir::{Hir, HirKind};
use thiserror::Error;
/// Errors reported while turning a regex pattern into an [`Ast`].
#[derive(Debug, Error)]
pub enum RegexError {
/// The pattern was rejected by `regex_syntax::parse`; the payload is that
/// error rendered with `to_string()`.
#[error("regex parse error: {0}")]
Parse(String),
/// The pattern parsed, but contains a construct this module refuses to
/// lower; the payload is a static description of that construct
/// (e.g. `"named capture group"`).
#[error("regex prefix extraction unsupported: {0}")]
PrefixUnsupported(&'static str),
}
/// Simplified regex syntax tree produced by lowering a `regex_syntax` HIR.
///
/// The lowering is lossy: every character class becomes [`Ast::AnyChar`] and
/// every look-around assertion becomes [`Ast::Anchor`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Ast {
/// Matches the empty string. Produced for an empty HIR node, a zero-length
/// literal, or a concatenation with no parts.
Empty,
/// A non-empty run of literal bytes copied from the HIR literal.
Literal(Vec<u8>),
/// Any HIR look-around assertion (e.g. `^` or `$`); the specific kind of
/// assertion is not retained.
Anchor,
/// Any HIR character class (e.g. `.`); the concrete class contents are
/// not retained.
AnyChar,
/// Sub-expressions matched one after another, in order.
Concat(Vec<Ast>),
/// Alternative branches, in pattern order.
Alt(Vec<Ast>),
/// A repeated sub-expression.
Repeat {
/// The expression being repeated.
sub: Box<Ast>,
/// Minimum repetition count, copied from the HIR repetition.
min: u32,
/// Maximum repetition count, copied from the HIR repetition; `None`
/// when there is no upper bound (as for `+`).
max: Option<u32>,
},
}
impl Ast {
    /// Parses `pattern` with `regex_syntax` and lowers the resulting HIR into
    /// an [`Ast`].
    ///
    /// # Errors
    ///
    /// Returns [`RegexError::Parse`], carrying the parser's message, when
    /// `regex_syntax::parse` rejects the pattern. Any error raised while
    /// lowering the HIR is passed through unchanged.
    pub fn parse(pattern: &str) -> Result<Self, RegexError> {
        match regex_syntax::parse(pattern) {
            Ok(hir) => Self::from_hir(&hir),
            Err(cause) => Err(RegexError::Parse(cause.to_string())),
        }
    }

    /// Recursively lowers one HIR node into an [`Ast`] node.
    ///
    /// Unnamed capture groups are transparent: the group's inner expression
    /// is lowered in its place.
    ///
    /// # Errors
    ///
    /// Returns [`RegexError::PrefixUnsupported`] as soon as a named capture
    /// group is encountered anywhere in the tree.
    fn from_hir(hir: &Hir) -> Result<Self, RegexError> {
        let node = match hir.kind() {
            HirKind::Empty => Ast::Empty,
            // A zero-length literal is indistinguishable from the empty match.
            HirKind::Literal(lit) if lit.0.is_empty() => Ast::Empty,
            HirKind::Literal(lit) => Ast::Literal(lit.0.to_vec()),
            // Every class is flattened to the same node.
            HirKind::Class(_) => Ast::AnyChar,
            // Every look-around assertion is flattened to the same node.
            HirKind::Look(_) => Ast::Anchor,
            HirKind::Repetition(rep) => Ast::Repeat {
                sub: Box::new(Self::from_hir(&rep.sub)?),
                min: rep.min,
                max: rep.max,
            },
            HirKind::Capture(cap) => {
                return if cap.name.is_none() {
                    Self::from_hir(&cap.sub)
                } else {
                    Err(RegexError::PrefixUnsupported("named capture group"))
                };
            }
            HirKind::Concat(parts) => {
                // Stops at the first part that fails to lower.
                let lowered = parts
                    .iter()
                    .map(Self::from_hir)
                    .collect::<Result<Vec<_>, _>>()?;
                Self::collapse_concat(lowered)
            }
            HirKind::Alternation(parts) => {
                let mut branches = parts
                    .iter()
                    .map(Self::from_hir)
                    .collect::<Result<Vec<_>, _>>()?;
                // A single-branch alternation is just that branch.
                match branches.len() {
                    1 => branches.pop().expect("len == 1"),
                    _ => Ast::Alt(branches),
                }
            }
        };
        Ok(node)
    }

    /// Wraps `parts` in [`Ast::Concat`] only when there are at least two of
    /// them; a single part is returned as-is and no parts yield
    /// [`Ast::Empty`].
    fn collapse_concat(parts: Vec<Ast>) -> Ast {
        if parts.len() > 1 {
            return Ast::Concat(parts);
        }
        parts.into_iter().next().unwrap_or(Ast::Empty)
    }
}
/// Parses `pattern` into an [`Ast`].
///
/// Free-function convenience wrapper that forwards directly to
/// [`Ast::parse`].
///
/// # Errors
///
/// Returns whatever error [`Ast::parse`] returns for `pattern`.
pub fn parse(pattern: &str) -> Result<Ast, RegexError> {
Ast::parse(pattern)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the literal node expected for `bytes`.
    fn lit(bytes: &[u8]) -> Ast {
        Ast::Literal(bytes.to_vec())
    }

    /// A plain word lowers to a single literal node.
    #[test]
    fn parse_simple_literal_yields_literal_bytes() {
        let parsed = parse("hello").expect("parses");
        assert_eq!(parsed, lit(b"hello"));
    }

    /// A literal followed by `.` lowers to a two-element concatenation.
    #[test]
    fn parse_concat_of_literal_and_dot_yields_concat() {
        let parsed = parse("ab.").expect("parses");
        if let Ast::Concat(items) = parsed {
            assert_eq!(items.len(), 2);
            assert_eq!(items[0], lit(b"ab"));
            assert_eq!(items[1], Ast::AnyChar);
        } else {
            panic!("expected concat, got {parsed:?}");
        }
    }

    /// `a|b`-style patterns lower to an alternation with one node per branch.
    #[test]
    fn parse_alternation_two_branches_yields_alt() {
        let parsed = parse("foo|bar").expect("parses");
        if let Ast::Alt(branches) = parsed {
            assert_eq!(branches.len(), 2);
        } else {
            panic!("expected alt, got {parsed:?}");
        }
    }

    /// Look-ahead is rejected by the underlying parser, so it surfaces as a
    /// parse error.
    #[test]
    fn parse_unsupported_lookahead_returns_error() {
        let err = parse("(?=foo)").expect_err("lookahead must error");
        assert!(matches!(err, RegexError::Parse(_)));
    }

    /// Named groups parse but are refused during lowering.
    #[test]
    fn parse_named_capture_returns_prefix_unsupported() {
        let err = parse("(?P<name>abc)").expect_err("named capture errors");
        assert!(matches!(err, RegexError::PrefixUnsupported(_)));
    }

    /// `^` and `$` each become an anchor node around the literal.
    #[test]
    fn parse_anchors_show_up_as_anchor_nodes() {
        let parsed = parse("^a$").expect("parses");
        if let Ast::Concat(items) = parsed {
            assert_eq!(items.len(), 3);
            assert_eq!(items[0], Ast::Anchor);
            assert_eq!(items[1], lit(b"a"));
            assert_eq!(items[2], Ast::Anchor);
        } else {
            panic!("expected concat, got {parsed:?}");
        }
    }

    /// An unnamed group under `+` lowers to a repeat of the group's contents
    /// with a minimum of one and no maximum.
    #[test]
    fn parse_grouping_with_quantifier_yields_repeat() {
        let parsed = parse("(ab)+").expect("parses");
        if let Ast::Repeat { sub, min, max } = parsed {
            assert_eq!(*sub, lit(b"ab"));
            assert_eq!(min, 1);
            assert!(max.is_none());
        } else {
            panic!("expected repeat, got {parsed:?}");
        }
    }
}