use super::types::{Nfa, NfaEdge, NfaStateId, PatternError};
const GROUP_RECURSION_LIMIT: usize = 128;
#[inline]
pub fn regex_to_nfa(source: &str) -> Result<Nfa, PatternError> {
let mut parser = Parser {
bytes: source.as_bytes(),
pos: 0,
group_depth: 0,
builder: Builder::default(),
};
let fragment = parser.parse_alternation()?;
if parser.pos != parser.bytes.len() {
return Err(PatternError::ParseError {
offset: parser.pos,
message:
"Fix: unexpected trailing input; check for an unmatched `)` or stray metachar."
.into(),
});
}
Ok(parser.builder.finalize(fragment))
}
struct Parser<'a> {
bytes: &'a [u8],
pos: usize,
group_depth: usize,
builder: Builder,
}
#[derive(Default)]
struct Builder {
state_count: u32,
edges: Vec<NfaEdge>,
}
#[derive(Debug, Clone, Copy)]
struct Fragment {
start: NfaStateId,
end: NfaStateId,
}
impl Builder {
fn new_state(&mut self) -> NfaStateId {
let id = self.state_count;
self.state_count += 1;
id
}
fn edge(&mut self, from: NfaStateId, byte: Option<u8>, to: NfaStateId) {
self.edges.push(NfaEdge { from, byte, to });
}
fn literal(&mut self, byte: u8) -> Fragment {
let start = self.new_state();
let end = self.new_state();
self.edge(start, Some(byte), end);
Fragment { start, end }
}
fn concat(&mut self, a: Fragment, b: Fragment) -> Fragment {
self.edge(a.end, None, b.start);
Fragment {
start: a.start,
end: b.end,
}
}
fn alternate(&mut self, a: Fragment, b: Fragment) -> Fragment {
let start = self.new_state();
let end = self.new_state();
self.edge(start, None, a.start);
self.edge(start, None, b.start);
self.edge(a.end, None, end);
self.edge(b.end, None, end);
Fragment { start, end }
}
fn kleene(&mut self, inner: Fragment) -> Fragment {
let start = self.new_state();
let end = self.new_state();
self.edge(start, None, inner.start);
self.edge(start, None, end);
self.edge(inner.end, None, inner.start);
self.edge(inner.end, None, end);
Fragment { start, end }
}
fn plus(&mut self, inner: Fragment) -> Fragment {
let start = self.new_state();
let end = self.new_state();
self.edge(start, None, inner.start);
self.edge(inner.end, None, inner.start);
self.edge(inner.end, None, end);
Fragment { start, end }
}
fn optional(&mut self, inner: Fragment) -> Fragment {
let start = self.new_state();
let end = self.new_state();
self.edge(start, None, inner.start);
self.edge(start, None, end);
self.edge(inner.end, None, end);
Fragment { start, end }
}
fn empty(&mut self) -> Fragment {
let start = self.new_state();
let end = self.new_state();
self.edge(start, None, end);
Fragment { start, end }
}
fn char_class(&mut self, bytes: &[u8]) -> Fragment {
let start = self.new_state();
let end = self.new_state();
for &b in bytes {
self.edge(start, Some(b), end);
}
Fragment { start, end }
}
fn finalize(self, fragment: Fragment) -> Nfa {
let mut accept = vec![false; self.state_count as usize];
accept[fragment.end as usize] = true;
Nfa {
state_count: self.state_count,
edges: self.edges,
start: fragment.start,
accept,
}
}
}
impl<'a> Parser<'a> {
fn peek(&self) -> Option<u8> {
self.bytes.get(self.pos).copied()
}
fn bump(&mut self) -> Option<u8> {
let b = self.peek()?;
self.pos += 1;
Some(b)
}
fn eat(&mut self, byte: u8) -> bool {
if self.peek() == Some(byte) {
self.pos += 1;
true
} else {
false
}
}
fn parse_alternation(&mut self) -> Result<Fragment, PatternError> {
if self.peek() == Some(b'|') {
return Err(PatternError::ParseError {
offset: self.pos,
message: "Fix: alternation `|` needs a non-empty expression on its left side."
.into(),
});
}
let mut left = self.parse_concatenation()?;
while self.eat(b'|') {
if matches!(self.peek(), None | Some(b'|') | Some(b')')) {
return Err(PatternError::ParseError {
offset: self.pos.saturating_sub(1),
message: "Fix: alternation `|` needs a non-empty expression on both sides."
.into(),
});
}
let right = self.parse_concatenation()?;
left = self.builder.alternate(left, right);
}
Ok(left)
}
fn parse_concatenation(&mut self) -> Result<Fragment, PatternError> {
let mut accum: Option<Fragment> = None;
loop {
match self.peek() {
None | Some(b'|') | Some(b')') => break,
Some(_) => {
let next = self.parse_quantified()?;
accum = Some(match accum {
Some(prev) => self.builder.concat(prev, next),
None => next,
});
}
}
}
Ok(accum.unwrap_or_else(|| self.builder.empty()))
}
fn parse_quantified(&mut self) -> Result<Fragment, PatternError> {
let atom = self.parse_atom()?;
match self.peek() {
Some(b'*') => {
self.pos += 1;
Ok(self.builder.kleene(atom))
}
Some(b'+') => {
self.pos += 1;
Ok(self.builder.plus(atom))
}
Some(b'?') => {
self.pos += 1;
Ok(self.builder.optional(atom))
}
_ => Ok(atom),
}
}
fn parse_atom(&mut self) -> Result<Fragment, PatternError> {
match self.peek() {
None => Err(PatternError::ParseError {
offset: self.pos,
message: "Fix: regex ended while parser expected an atom.".into(),
}),
Some(b'(') => {
if self.group_depth >= GROUP_RECURSION_LIMIT {
return Err(PatternError::RecursionLimit {
limit: GROUP_RECURSION_LIMIT,
});
}
self.pos += 1;
if self.peek() == Some(b')') {
return Err(PatternError::ParseError {
offset: self.pos - 1,
message: "Fix: empty groups are not supported; remove the group or add an expression inside it.".into(),
});
}
self.group_depth += 1;
let inner = self.parse_alternation()?;
self.group_depth -= 1;
if !self.eat(b')') {
return Err(PatternError::ParseError {
offset: self.pos,
message: "Fix: missing closing `)` for group.".into(),
});
}
Ok(inner)
}
Some(b'[') => self.parse_char_class(),
Some(b'\\') => {
self.pos += 1;
match self.parse_escape_byte("escape")? {
Some(b) => Ok(self.builder.literal(b)),
None => Err(PatternError::ParseError {
offset: self.pos,
message: "Fix: dangling backslash; escape a character after `\\`.".into(),
}),
}
}
Some(b'.') => {
self.pos += 1;
let bytes: Vec<u8> = (0..=255u8).filter(|b| *b != b'\n').collect();
Ok(self.builder.char_class(&bytes))
}
Some(b) if is_metachar(b) => Err(PatternError::ParseError {
offset: self.pos,
message: format!(
"Fix: metacharacter `{}` needs an atom before it; escape with `\\{}` to match the literal byte.",
b as char, b as char
),
}),
Some(b) => {
self.pos += 1;
Ok(self.builder.literal(b))
}
}
}
fn parse_char_class(&mut self) -> Result<Fragment, PatternError> {
debug_assert_eq!(self.peek(), Some(b'['));
self.pos += 1;
let mut negate = false;
if self.eat(b'^') {
negate = true;
}
let mut members = [false; 256];
let mut last: Option<u8> = None;
let mut saw_member = false;
loop {
let byte = match self.peek() {
None => {
return Err(PatternError::ParseError {
offset: self.pos,
message: "Fix: character class missing closing `]`.".into(),
})
}
Some(b']') => {
self.pos += 1;
break;
}
Some(b'-') if last.is_some() && self.bytes.get(self.pos + 1) != Some(&b']') => {
self.pos += 1;
let hi = match self.bump() {
Some(b) => b,
None => {
return Err(PatternError::ParseError {
offset: self.pos,
message: "Fix: character-class range missing upper bound.".into(),
})
}
};
let lo = last.unwrap();
if hi < lo {
return Err(PatternError::ParseError {
offset: self.pos.saturating_sub(1),
message: "Fix: character-class ranges must be ascending; swap the bounds or list bytes explicitly.".into(),
});
}
for v in lo..=hi {
members[v as usize] = true;
}
last = None;
saw_member = true;
continue;
}
Some(b'\\') => {
self.pos += 1;
match self.parse_escape_byte("character class")? {
Some(b) => b,
None => {
return Err(PatternError::ParseError {
offset: self.pos,
message: "Fix: dangling backslash inside character class.".into(),
})
}
}
}
Some(b) => {
self.pos += 1;
b
}
};
members[byte as usize] = true;
last = Some(byte);
saw_member = true;
}
if !saw_member {
return Err(PatternError::ParseError {
offset: self.pos,
message: "Fix: character class is empty (matches nothing).".into(),
});
}
if negate {
for m in members.iter_mut() {
*m = !*m;
}
}
let bytes: Vec<u8> = members
.iter()
.enumerate()
.filter_map(|(i, &on)| on.then_some(i as u8))
.collect();
if bytes.is_empty() {
return Err(PatternError::ParseError {
offset: self.pos,
message: "Fix: character class is empty (matches nothing).".into(),
});
}
Ok(self.builder.char_class(&bytes))
}
fn parse_escape_byte(&mut self, context: &str) -> Result<Option<u8>, PatternError> {
let offset = self.pos.saturating_sub(1);
match self.bump() {
Some(b'x') => self.parse_hex_escape(context).map(Some),
Some(b @ (b'd' | b'w' | b's' | b'A' | b'z')) => Err(PatternError::UnsupportedEscape {
offset,
escape: b as char,
}),
other => Ok(other),
}
}
fn parse_hex_escape(&mut self, context: &str) -> Result<u8, PatternError> {
let start = self.pos.saturating_sub(2);
let hi = self.bump().and_then(hex_value);
let lo = self.bump().and_then(hex_value);
match (hi, lo) {
(Some(hi), Some(lo)) => Ok((hi << 4) | lo),
_ => Err(PatternError::ParseError {
offset: start,
message: format!(
"Fix: {context} `\\x` escapes must be followed by exactly two hex digits."
),
}),
}
}
}
fn is_metachar(b: u8) -> bool {
matches!(b, b'*' | b'+' | b'?' | b'|' | b')' | b']')
}
fn hex_value(b: u8) -> Option<u8> {
match b {
b'0'..=b'9' => Some(b - b'0'),
b'a'..=b'f' => Some(b - b'a' + 10),
b'A'..=b'F' => Some(b - b'A' + 10),
_ => None,
}
}
#[cfg(test)]
mod tests;