use std::fmt;
const MAX_PATTERN_SOURCE_BYTES: usize = 16 * 1024;
const MAX_PATTERN_ATOMS: usize = u16::MAX as usize;
const MAX_GROUP_ALTERNATIVES: usize = 1024;
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct ParsePatError {
kind: PatError,
position: usize,
}
impl ParsePatError {
pub fn kind(self) -> PatError {
self.kind
}
pub fn position(self) -> usize {
self.position
}
}
impl fmt::Display for ParsePatError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"Syntax Error @{}: {}.",
self.position,
self.kind.as_str()
)
}
}
impl std::error::Error for ParsePatError {}
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum PatError {
UnpairedHexDigit,
UnknownChar,
UnclosedQuote,
SaveOverflow,
ReadOperand,
SkipOperand,
GroupOperand,
StackError,
StackInvalid,
PatternTooLong,
PatternTooComplex,
}
impl PatError {
fn as_str(self) -> &'static str {
match self {
PatError::UnpairedHexDigit => "unpaired hex digit",
PatError::UnknownChar => "unknown character",
PatError::UnclosedQuote => "string missing end quote",
PatError::SaveOverflow => "save store overflow",
PatError::ReadOperand => "read operand error",
PatError::SkipOperand => "skip operand error",
PatError::GroupOperand => "group operand error",
PatError::StackError => "stack unbalanced",
PatError::StackInvalid => "stack must follow jump",
PatError::PatternTooLong => "pattern input too long",
PatError::PatternTooComplex => "pattern expands beyond supported complexity limits",
}
}
}
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum Atom {
Byte(u8),
Fuzzy(u8),
Save(u8),
Skip(u8),
SkipRange(u16, u16),
Push(u8),
Pop,
Jump1,
Jump4,
Ptr,
Pir(u8),
ReadI8(u8),
ReadU8(u8),
ReadI16(u8),
ReadU16(u8),
ReadI32(u8),
ReadU32(u8),
Zero(u8),
Back(u8),
Aligned(u8),
Check(u8),
Case(u16),
Break(u16),
Nop,
}
pub type Pattern = Vec<Atom>;
pub fn save_len(pat: &[Atom]) -> usize {
pat.iter()
.filter_map(|atom| match atom {
Atom::Save(slot)
| Atom::ReadI8(slot)
| Atom::ReadU8(slot)
| Atom::ReadI16(slot)
| Atom::ReadU16(slot)
| Atom::ReadI32(slot)
| Atom::ReadU32(slot)
| Atom::Zero(slot)
| Atom::Check(slot)
| Atom::Pir(slot) => Some(usize::from(*slot) + 1),
_ => None,
})
.max()
.unwrap_or(0)
}
pub fn parse(pat: &str) -> Result<Pattern, ParsePatError> {
if pat.len() > MAX_PATTERN_SOURCE_BYTES {
return Err(ParsePatError {
kind: PatError::PatternTooLong,
position: MAX_PATTERN_SOURCE_BYTES,
});
}
let mut parser = Parser::new(pat);
let mut result = Vec::with_capacity(pat.len() / 2);
result.push(Atom::Save(0));
result.append(&mut parser.parse_sequence(&[])?);
if result.len() > MAX_PATTERN_ATOMS {
return Err(ParsePatError {
kind: PatError::PatternTooComplex,
position: pat.len(),
});
}
while matches!(result.last(), Some(Atom::Skip(_))) {
result.pop();
}
if parser.depth != 0 {
return Err(ParsePatError {
kind: PatError::StackError,
position: pat.len(),
});
}
if parser.peek().is_some() {
return Err(ParsePatError {
kind: PatError::GroupOperand,
position: parser.peek().map(|(pos, _)| pos).unwrap_or(pat.len()),
});
}
Ok(result)
}
struct Parser<'a> {
chars: Vec<(usize, char)>,
cursor: usize,
save: u8,
depth: u16,
_source: &'a str,
}
impl<'a> Parser<'a> {
fn new(source: &'a str) -> Self {
Self {
chars: source.char_indices().collect(),
cursor: 0,
save: 1,
depth: 0,
_source: source,
}
}
fn parse_sequence(&mut self, stoppers: &[char]) -> Result<Vec<Atom>, ParsePatError> {
let mut result = Vec::new();
while let Some((position, ch)) = self.peek() {
if stoppers.contains(&ch) {
break;
}
self.bump();
match ch {
' ' | '\n' | '\r' | '\t' => {}
'?' => {
push_skip(&mut result, 1);
}
'[' => self.parse_skip_operand(position, &mut result)?,
'\'' => {
if self.save == u8::MAX {
return Err(ParsePatError {
kind: PatError::SaveOverflow,
position,
});
}
result.push(Atom::Save(self.save));
self.save += 1;
}
'%' => result.push(Atom::Jump1),
'$' => result.push(Atom::Jump4),
'*' => result.push(Atom::Ptr),
'"' => {
let mut closed = false;
while let Some((_, next)) = self.bump() {
if next == '"' {
closed = true;
break;
}
if !next.is_ascii() {
return Err(ParsePatError {
kind: PatError::UnknownChar,
position,
});
}
result.push(Atom::Byte(next as u8));
}
if !closed {
return Err(ParsePatError {
kind: PatError::UnclosedQuote,
position,
});
}
}
'{' => {
if self.depth == u16::MAX {
return Err(ParsePatError {
kind: PatError::StackError,
position,
});
}
self.depth += 1;
let Some(last) = result.last_mut() else {
return Err(ParsePatError {
kind: PatError::StackInvalid,
position,
});
};
let replaced = match *last {
Atom::Jump1 => {
*last = Atom::Push(1);
Atom::Jump1
}
Atom::Jump4 => {
*last = Atom::Push(4);
Atom::Jump4
}
Atom::Ptr => {
*last = Atom::Push(0);
Atom::Ptr
}
_ => {
return Err(ParsePatError {
kind: PatError::StackInvalid,
position,
});
}
};
result.push(replaced);
}
'}' => {
if self.depth == 0 {
return Err(ParsePatError {
kind: PatError::StackError,
position,
});
}
self.depth -= 1;
result.push(Atom::Pop);
}
'i' | 'u' => {
let signed = ch == 'i';
let (_, op) = self.bump().ok_or(ParsePatError {
kind: PatError::ReadOperand,
position,
})?;
if self.save == u8::MAX {
return Err(ParsePatError {
kind: PatError::SaveOverflow,
position,
});
}
let slot = self.save;
self.save += 1;
let atom = match (signed, op) {
(true, '1') => Atom::ReadI8(slot),
(false, '1') => Atom::ReadU8(slot),
(true, '2') => Atom::ReadI16(slot),
(false, '2') => Atom::ReadU16(slot),
(true, '4') => Atom::ReadI32(slot),
(false, '4') => Atom::ReadU32(slot),
_ => {
return Err(ParsePatError {
kind: PatError::ReadOperand,
position,
});
}
};
result.push(atom);
}
'z' => {
if self.save == u8::MAX {
return Err(ParsePatError {
kind: PatError::SaveOverflow,
position,
});
}
result.push(Atom::Zero(self.save));
self.save += 1;
}
'@' => {
let (next_pos, op) = self.bump().ok_or(ParsePatError {
kind: PatError::ReadOperand,
position,
})?;
let align = match op {
'0'..='9' => op as u8 - b'0',
'A'..='Z' => 10 + (op as u8 - b'A'),
'a'..='z' => 10 + (op as u8 - b'a'),
_ => {
return Err(ParsePatError {
kind: PatError::ReadOperand,
position: next_pos,
});
}
};
result.push(Atom::Aligned(align));
}
'(' => {
let alts = self.parse_group(position)?;
result.append(&mut compile_alternatives(alts, position)?);
}
_ if ch.is_ascii_hexdigit() => {
let (next_position, lo_ch) = self.bump().ok_or(ParsePatError {
kind: PatError::UnpairedHexDigit,
position,
})?;
if !lo_ch.is_ascii_hexdigit() {
return Err(ParsePatError {
kind: PatError::UnpairedHexDigit,
position: next_position,
});
}
let hi = hex_value(ch).expect("ascii hex already validated");
let lo = hex_value(lo_ch).expect("ascii hex already validated");
result.push(Atom::Byte((hi << 4) | lo));
}
_ => {
return Err(ParsePatError {
kind: PatError::UnknownChar,
position,
});
}
}
if result.len() > MAX_PATTERN_ATOMS {
return Err(ParsePatError {
kind: PatError::PatternTooComplex,
position,
});
}
}
Ok(result)
}
fn parse_group(&mut self, position: usize) -> Result<Vec<Vec<Atom>>, ParsePatError> {
let mut alternatives = Vec::new();
let group_save = self.save;
let group_depth = self.depth;
let mut max_save = self.save;
loop {
self.save = group_save;
self.depth = group_depth;
let seq = self.parse_sequence(&['|', ')'])?;
max_save = max_save.max(self.save);
alternatives.push(seq);
if alternatives.len() > MAX_GROUP_ALTERNATIVES {
return Err(ParsePatError {
kind: PatError::PatternTooComplex,
position,
});
}
let Some((stop_pos, stop)) = self.bump() else {
return Err(ParsePatError {
kind: PatError::GroupOperand,
position,
});
};
match stop {
'|' => continue,
')' => break,
_ => {
return Err(ParsePatError {
kind: PatError::GroupOperand,
position: stop_pos,
});
}
}
}
if alternatives.is_empty() {
return Err(ParsePatError {
kind: PatError::GroupOperand,
position,
});
}
self.save = max_save;
self.depth = group_depth;
Ok(alternatives)
}
fn parse_skip_operand(
&mut self,
position: usize,
result: &mut Vec<Atom>,
) -> Result<(), ParsePatError> {
let mut first: u32 = 0;
let mut second: u32 = 0;
let mut saw_first = false;
let mut saw_second = false;
let mut ranged = false;
while let Some((_, ch)) = self.bump() {
match ch {
'0'..='9' => {
let digit = u32::from(ch as u8 - b'0');
if !ranged {
saw_first = true;
first = first
.checked_mul(10)
.and_then(|n| n.checked_add(digit))
.ok_or(ParsePatError {
kind: PatError::SkipOperand,
position,
})?;
} else {
saw_second = true;
second = second
.checked_mul(10)
.and_then(|n| n.checked_add(digit))
.ok_or(ParsePatError {
kind: PatError::SkipOperand,
position,
})?;
}
}
'-' => {
if ranged || !saw_first {
return Err(ParsePatError {
kind: PatError::SkipOperand,
position,
});
}
ranged = true;
}
']' => {
if !saw_first {
return Err(ParsePatError {
kind: PatError::SkipOperand,
position,
});
}
if !ranged {
push_skip(result, first);
return Ok(());
}
if !saw_second || second <= first {
return Err(ParsePatError {
kind: PatError::SkipOperand,
position,
});
}
let min = u16::try_from(first).map_err(|_| ParsePatError {
kind: PatError::SkipOperand,
position,
})?;
let max = u16::try_from(second - 1).map_err(|_| ParsePatError {
kind: PatError::SkipOperand,
position,
})?;
result.push(Atom::SkipRange(min, max));
return Ok(());
}
_ => {
return Err(ParsePatError {
kind: PatError::SkipOperand,
position,
});
}
}
}
Err(ParsePatError {
kind: PatError::SkipOperand,
position,
})
}
fn peek(&self) -> Option<(usize, char)> {
self.chars.get(self.cursor).copied()
}
fn bump(&mut self) -> Option<(usize, char)> {
let value = self.peek()?;
self.cursor += 1;
Some(value)
}
}
fn compile_alternatives(
mut alts: Vec<Vec<Atom>>,
position: usize,
) -> Result<Vec<Atom>, ParsePatError> {
debug_assert!(!alts.is_empty(), "alternatives parser guarantees non-empty");
if alts.len() == 1 {
let only = alts
.pop()
.expect("alternatives parser guarantees exactly one arm remains");
return Ok(only);
}
let first = alts.remove(0);
let rest = compile_alternatives(alts, position)?;
let case_skip = u16::try_from(first.len() + 2).map_err(|_| ParsePatError {
kind: PatError::PatternTooComplex,
position,
})?;
let break_skip = u16::try_from(rest.len()).map_err(|_| ParsePatError {
kind: PatError::PatternTooComplex,
position,
})?;
let mut out = Vec::with_capacity(2 + first.len() + rest.len());
out.push(Atom::Case(case_skip));
out.extend(first);
out.push(Atom::Break(break_skip));
out.extend(rest);
if out.len() > MAX_PATTERN_ATOMS {
return Err(ParsePatError {
kind: PatError::PatternTooComplex,
position,
});
}
Ok(out)
}
fn hex_value(ch: char) -> Option<u8> {
match ch {
'0'..='9' => Some(ch as u8 - b'0'),
'a'..='f' => Some(ch as u8 - b'a' + 10),
'A'..='F' => Some(ch as u8 - b'A' + 10),
_ => None,
}
}
fn push_skip(result: &mut Vec<Atom>, mut remaining: u32) {
while remaining != 0 {
let chunk = u8::try_from(remaining.min(u32::from(u8::MAX))).expect("chunk is bounded");
remaining -= u32::from(chunk);
if let Some(Atom::Skip(prev)) = result.last_mut() {
let free = u8::MAX - *prev;
if free != 0 {
let to_add = free.min(chunk);
*prev += to_add;
let leftover = chunk - to_add;
if leftover != 0 {
result.push(Atom::Skip(leftover));
}
continue;
}
}
result.push(Atom::Skip(chunk));
}
}
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use super::{Atom, ParsePatError, PatError, parse};
#[test]
fn parses_used_subset() {
assert_eq!(
parse("488915${'} 488942"),
Ok(vec![
Atom::Save(0),
Atom::Byte(0x48),
Atom::Byte(0x89),
Atom::Byte(0x15),
Atom::Push(4),
Atom::Jump4,
Atom::Save(1),
Atom::Pop,
Atom::Byte(0x48),
Atom::Byte(0x89),
Atom::Byte(0x42),
])
);
assert_eq!(
parse("68*'31c0c3"),
Ok(vec![
Atom::Save(0),
Atom::Byte(0x68),
Atom::Ptr,
Atom::Save(1),
Atom::Byte(0x31),
Atom::Byte(0xc0),
Atom::Byte(0xc3),
])
);
assert_eq!(
parse("*{'90}"),
Ok(vec![
Atom::Save(0),
Atom::Push(0),
Atom::Ptr,
Atom::Save(1),
Atom::Byte(0x90),
Atom::Pop,
])
);
assert_eq!(
parse("44 8B 81 u4 48 8D 0D"),
Ok(vec![
Atom::Save(0),
Atom::Byte(0x44),
Atom::Byte(0x8B),
Atom::Byte(0x81),
Atom::ReadU32(1),
Atom::Byte(0x48),
Atom::Byte(0x8D),
Atom::Byte(0x0D),
])
);
assert_eq!(
parse("488b1d${'} 48891d[4] 4c63b3"),
Ok(vec![
Atom::Save(0),
Atom::Byte(0x48),
Atom::Byte(0x8b),
Atom::Byte(0x1d),
Atom::Push(4),
Atom::Jump4,
Atom::Save(1),
Atom::Pop,
Atom::Byte(0x48),
Atom::Byte(0x89),
Atom::Byte(0x1d),
Atom::Skip(4),
Atom::Byte(0x4c),
Atom::Byte(0x63),
Atom::Byte(0xb3),
])
);
assert_eq!(
parse("\"hello\" 00"),
Ok(vec![
Atom::Save(0),
Atom::Byte(b'h'),
Atom::Byte(b'e'),
Atom::Byte(b'l'),
Atom::Byte(b'l'),
Atom::Byte(b'o'),
Atom::Byte(0x00),
])
);
}
#[test]
fn reports_error_position() {
assert_eq!(
parse("4G") as Result<Vec<Atom>, ParsePatError>,
Err(ParsePatError {
kind: PatError::UnpairedHexDigit,
position: 1,
})
);
assert_eq!(
parse("u8") as Result<Vec<Atom>, ParsePatError>,
Err(ParsePatError {
kind: PatError::ReadOperand,
position: 0,
})
);
assert_eq!(
parse("\"unterminated") as Result<Vec<Atom>, ParsePatError>,
Err(ParsePatError {
kind: PatError::UnclosedQuote,
position: 0,
})
);
}
#[test]
fn group_alternatives_reset_and_merge_save_state() {
assert_eq!(
parse("('41|'42)'"),
Ok(vec![
Atom::Save(0),
Atom::Case(4),
Atom::Save(1),
Atom::Byte(0x41),
Atom::Break(2),
Atom::Save(1),
Atom::Byte(0x42),
Atom::Save(2),
])
);
}
#[test]
fn range_skip_uses_strict_upper_bound() {
assert_eq!(
parse("[5-6]"),
Ok(vec![Atom::Save(0), Atom::SkipRange(5, 5)])
);
assert_eq!(
parse("[5-5]") as Result<Vec<Atom>, ParsePatError>,
Err(ParsePatError {
kind: PatError::SkipOperand,
position: 0,
})
);
}
#[test]
fn wildcard_semantics_match_pelite() {
assert_eq!(
parse("A?") as Result<Vec<Atom>, ParsePatError>,
Err(ParsePatError {
kind: PatError::UnpairedHexDigit,
position: 1,
})
);
assert_eq!(parse("?"), Ok(vec![Atom::Save(0)]));
assert_eq!(parse("??"), Ok(vec![Atom::Save(0)]));
assert_eq!(
parse("4183?03"),
Ok(vec![
Atom::Save(0),
Atom::Byte(0x41),
Atom::Byte(0x83),
Atom::Skip(1),
Atom::Byte(0x03),
])
);
}
#[test]
fn supports_aligned_base36_syntax() {
assert_eq!(parse("@4"), Ok(vec![Atom::Save(0), Atom::Aligned(4)]));
assert_eq!(parse("@A"), Ok(vec![Atom::Save(0), Atom::Aligned(10)]));
assert_eq!(parse("@f"), Ok(vec![Atom::Save(0), Atom::Aligned(15)]));
assert_eq!(parse("@z"), Ok(vec![Atom::Save(0), Atom::Aligned(35)]));
assert_eq!(
parse("@") as Result<Vec<Atom>, ParsePatError>,
Err(ParsePatError {
kind: PatError::ReadOperand,
position: 0,
})
);
assert_eq!(
parse("@_") as Result<Vec<Atom>, ParsePatError>,
Err(ParsePatError {
kind: PatError::ReadOperand,
position: 1,
})
);
assert_eq!(
parse("@?") as Result<Vec<Atom>, ParsePatError>,
Err(ParsePatError {
kind: PatError::ReadOperand,
position: 1,
})
);
}
#[test]
fn save_len_counts_programmatic_pir_slots() {
let pat = [Atom::Save(0), Atom::Pir(3)];
assert_eq!(super::save_len(&pat), 4);
}
proptest! {
#[test]
fn parsed_patterns_preserve_base_capture_and_slot_bounds(source in "[ -~]{0,128}") {
if let Ok(atoms) = parse(&source) {
prop_assert!(!atoms.is_empty());
prop_assert_eq!(atoms[0], Atom::Save(0));
let required_slots = super::save_len(&atoms);
prop_assert!(required_slots >= 1);
prop_assert!(required_slots <= usize::from(u8::MAX));
}
}
}
}