use std::collections::HashMap;
use super::error::{GrammarError, GrammarResult};
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CharRange {
pub lo: u8,
pub hi: u8,
}
impl CharRange {
pub fn single(b: u8) -> Self {
Self { lo: b, hi: b }
}
pub fn range(lo: u8, hi: u8) -> Self {
Self { lo, hi }
}
pub fn contains(&self, b: u8) -> bool {
b >= self.lo && b <= self.hi
}
}
#[derive(Debug, Clone)]
pub enum GrammarNode {
Literal(Vec<u8>),
CharClass {
ranges: Vec<CharRange>,
negated: bool,
},
RuleRef(String),
Sequence(Vec<GrammarNode>),
Alternation(Vec<GrammarNode>),
Repeat {
node: Box<GrammarNode>,
min: usize,
max: Option<usize>,
},
}
#[derive(Debug, Clone)]
pub struct Grammar {
pub rules: HashMap<String, GrammarNode>,
pub root: String,
pub source: String,
}
impl Grammar {
pub fn parse(input: &str) -> GrammarResult<Self> {
let mut parser = GbnfParser::new(input);
let mut grammar = parser.parse_grammar()?;
grammar.source = input.to_string();
Ok(grammar)
}
pub fn root_node(&self) -> GrammarResult<&GrammarNode> {
self.rules
.get(&self.root)
.ok_or_else(|| GrammarError::UnknownRule {
rule: self.root.clone(),
})
}
}
struct GbnfParser<'a> {
input: &'a [u8],
pos: usize,
}
impl<'a> GbnfParser<'a> {
fn new(input: &'a str) -> Self {
Self {
input: input.as_bytes(),
pos: 0,
}
}
fn parse_error(&self, msg: impl Into<String>) -> GrammarError {
GrammarError::ParseError {
pos: self.pos,
msg: msg.into(),
}
}
fn peek(&self) -> Option<u8> {
self.input.get(self.pos).copied()
}
fn peek2(&self) -> Option<u8> {
self.input.get(self.pos + 1).copied()
}
fn advance(&mut self) {
self.pos += 1;
}
fn consume(&mut self) -> Option<u8> {
let b = self.peek()?;
self.advance();
Some(b)
}
fn expect(&mut self, expected: u8) -> GrammarResult<()> {
match self.consume() {
Some(b) if b == expected => Ok(()),
Some(b) => Err(self.parse_error(format!(
"expected '{}' but got '{}'",
expected as char, b as char
))),
None => Err(self.parse_error(format!(
"expected '{}' but reached end of input",
expected as char
))),
}
}
fn is_ident_start(b: u8) -> bool {
b.is_ascii_alphabetic() || b == b'_'
}
fn is_ident_cont(b: u8) -> bool {
b.is_ascii_alphanumeric() || b == b'_' || b == b'-'
}
fn skip_inline_ws(&mut self) {
while matches!(self.peek(), Some(b' ') | Some(b'\t')) {
self.advance();
}
}
fn skip_ws_and_comments(&mut self) {
loop {
match self.peek() {
Some(b' ') | Some(b'\t') | Some(b'\r') | Some(b'\n') => {
self.advance();
}
Some(b'#') => {
while !matches!(self.peek(), None | Some(b'\n')) {
self.advance();
}
}
_ => break,
}
}
}
fn parse_grammar(&mut self) -> GrammarResult<Grammar> {
let mut rules: HashMap<String, GrammarNode> = HashMap::new();
let mut first_rule: Option<String> = None;
self.skip_ws_and_comments();
while self.pos < self.input.len() {
let name = self.parse_ident()?;
self.skip_inline_ws();
if self.pos + 3 > self.input.len() || &self.input[self.pos..self.pos + 3] != b"::=" {
return Err(self.parse_error("expected '::=' after rule name"));
}
self.pos += 3;
self.skip_inline_ws();
let node = self.parse_alternation()?;
if first_rule.is_none() {
first_rule = Some(name.clone());
}
rules.insert(name, node);
self.skip_ws_and_comments();
}
if rules.is_empty() {
return Err(GrammarError::ParseError {
pos: 0,
msg: "grammar has no rules".to_string(),
});
}
let root = if rules.contains_key("root") {
"root".to_string()
} else {
first_rule.unwrap_or_default()
};
Ok(Grammar {
rules,
root,
source: String::new(), })
}
fn parse_ident(&mut self) -> GrammarResult<String> {
let start = self.pos;
match self.peek() {
Some(b) if Self::is_ident_start(b) => {
self.advance();
}
_ => return Err(self.parse_error("expected identifier")),
}
while matches!(self.peek(), Some(b) if Self::is_ident_cont(b)) {
self.advance();
}
let slice = &self.input[start..self.pos];
Ok(String::from_utf8_lossy(slice).into_owned())
}
fn parse_alternation(&mut self) -> GrammarResult<GrammarNode> {
let first = self.parse_sequence()?;
self.skip_inline_ws();
if !matches!(self.peek(), Some(b'|')) {
return Ok(first);
}
let mut alternatives = vec![first];
while matches!(self.peek(), Some(b'|')) {
self.advance(); self.skip_inline_ws();
alternatives.push(self.parse_sequence()?);
self.skip_inline_ws();
}
Ok(GrammarNode::Alternation(alternatives))
}
fn parse_sequence(&mut self) -> GrammarResult<GrammarNode> {
let mut items: Vec<GrammarNode> = Vec::new();
loop {
self.skip_inline_ws();
match self.peek() {
None | Some(b'\n') | Some(b'\r') | Some(b'|') | Some(b')') => break,
Some(b'#') => break,
_ => {}
}
let item = self.parse_item()?;
items.push(item);
}
match items.len() {
0 => Ok(GrammarNode::Literal(vec![])),
1 => Ok(items.remove(0)),
_ => Ok(GrammarNode::Sequence(items)),
}
}
fn parse_item(&mut self) -> GrammarResult<GrammarNode> {
let base = self.parse_atom()?;
self.parse_quantifier(base)
}
fn parse_quantifier(&mut self, base: GrammarNode) -> GrammarResult<GrammarNode> {
match self.peek() {
Some(b'*') => {
self.advance();
Ok(GrammarNode::Repeat {
node: Box::new(base),
min: 0,
max: None,
})
}
Some(b'+') => {
self.advance();
Ok(GrammarNode::Repeat {
node: Box::new(base),
min: 1,
max: None,
})
}
Some(b'?') => {
self.advance();
Ok(GrammarNode::Repeat {
node: Box::new(base),
min: 0,
max: Some(1),
})
}
_ => Ok(base),
}
}
fn parse_atom(&mut self) -> GrammarResult<GrammarNode> {
match self.peek() {
Some(b'"') => self.parse_string_literal(),
Some(b'[') => self.parse_char_class(),
Some(b'(') => {
self.advance(); self.skip_inline_ws();
let inner = self.parse_alternation()?;
self.skip_inline_ws();
self.expect(b')')?;
Ok(inner)
}
Some(b) if Self::is_ident_start(b) => {
let name = self.parse_ident()?;
Ok(GrammarNode::RuleRef(name))
}
Some(b) => {
Err(self.parse_error(format!("unexpected character '{}' in grammar", b as char)))
}
None => Err(self.parse_error("unexpected end of input")),
}
}
fn parse_string_literal(&mut self) -> GrammarResult<GrammarNode> {
self.expect(b'"')?;
let mut bytes: Vec<u8> = Vec::new();
loop {
match self.consume() {
None => return Err(self.parse_error("unterminated string literal")),
Some(b'"') => break,
Some(b'\\') => {
let escaped = self.parse_escape_sequence()?;
bytes.extend_from_slice(&escaped);
}
Some(b) => bytes.push(b),
}
}
Ok(GrammarNode::Literal(bytes))
}
fn parse_escape_sequence(&mut self) -> GrammarResult<Vec<u8>> {
match self.consume() {
Some(b'n') => Ok(vec![b'\n']),
Some(b'r') => Ok(vec![b'\r']),
Some(b't') => Ok(vec![b'\t']),
Some(b'\\') => Ok(vec![b'\\']),
Some(b'"') => Ok(vec![b'"']),
Some(b'\'') => Ok(vec![b'\'']),
Some(b'0') => Ok(vec![0u8]),
Some(b'x') => {
let hi = self.consume_hex_digit()?;
let lo = self.consume_hex_digit()?;
Ok(vec![(hi << 4) | lo])
}
Some(b'u') => {
self.expect(b'{')?;
let mut codepoint: u32 = 0;
let mut digits = 0usize;
while !matches!(self.peek(), Some(b'}')) {
let d = self.consume_hex_digit()?;
codepoint = (codepoint << 4) | (d as u32);
digits += 1;
if digits > 6 {
return Err(self.parse_error("\\u{...} codepoint too large"));
}
}
self.expect(b'}')?;
let ch = char::from_u32(codepoint)
.ok_or_else(|| self.parse_error("invalid Unicode codepoint"))?;
let mut buf = [0u8; 4];
Ok(ch.encode_utf8(&mut buf).as_bytes().to_vec())
}
Some(b) => Err(self.parse_error(format!("unknown escape sequence '\\{}'", b as char))),
None => Err(self.parse_error("truncated escape sequence")),
}
}
fn consume_hex_digit(&mut self) -> GrammarResult<u8> {
match self.consume() {
Some(b @ b'0'..=b'9') => Ok(b - b'0'),
Some(b @ b'a'..=b'f') => Ok(b - b'a' + 10),
Some(b @ b'A'..=b'F') => Ok(b - b'A' + 10),
Some(b) => Err(self.parse_error(format!("expected hex digit, got '{}'", b as char))),
None => Err(self.parse_error("expected hex digit, reached end of input")),
}
}
fn parse_char_class(&mut self) -> GrammarResult<GrammarNode> {
self.expect(b'[')?;
let negated = if matches!(self.peek(), Some(b'^')) {
self.advance();
true
} else {
false
};
let mut ranges: Vec<CharRange> = Vec::new();
loop {
match self.peek() {
None => return Err(self.parse_error("unterminated character class")),
Some(b']') => {
self.advance();
break;
}
Some(b'\\') => {
self.advance(); let b = self.parse_class_escape()?;
if matches!(self.peek(), Some(b'-')) && !matches!(self.peek2(), Some(b']')) {
self.advance(); let hi = self.parse_class_byte()?;
ranges.push(CharRange::range(b, hi));
} else {
ranges.push(CharRange::single(b));
}
}
Some(_) => {
let lo = self.parse_class_byte()?;
if matches!(self.peek(), Some(b'-')) && !matches!(self.peek2(), Some(b']')) {
self.advance(); let hi = self.parse_class_byte()?;
ranges.push(CharRange::range(lo, hi));
} else {
ranges.push(CharRange::single(lo));
}
}
}
}
Ok(GrammarNode::CharClass { ranges, negated })
}
fn parse_class_byte(&mut self) -> GrammarResult<u8> {
match self.peek() {
Some(b'\\') => {
self.advance();
self.parse_class_escape()
}
Some(b) => {
self.advance();
Ok(b)
}
None => Err(self.parse_error("unexpected end inside character class")),
}
}
fn parse_class_escape(&mut self) -> GrammarResult<u8> {
match self.consume() {
Some(b'n') => Ok(b'\n'),
Some(b'r') => Ok(b'\r'),
Some(b't') => Ok(b'\t'),
Some(b'\\') => Ok(b'\\'),
Some(b']') => Ok(b']'),
Some(b'-') => Ok(b'-'),
Some(b'^') => Ok(b'^'),
Some(b'x') => {
let hi = self.consume_hex_digit()?;
let lo = self.consume_hex_digit()?;
Ok((hi << 4) | lo)
}
Some(b) => Err(self.parse_error(format!("unknown class escape '\\{}'", b as char))),
None => Err(self.parse_error("truncated class escape")),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_literal_alternation() {
let g = Grammar::parse(r#"root ::= "yes" | "no""#).unwrap();
assert_eq!(g.root, "root");
assert!(g.rules.contains_key("root"));
}
#[test]
fn test_parse_char_class_range() {
let g = Grammar::parse("root ::= [a-z]+").unwrap();
assert!(g.rules.contains_key("root"));
}
#[test]
fn test_parse_sequence() {
let g = Grammar::parse(r#"root ::= [a-z]+ ":" [0-9]+"#).unwrap();
assert!(!g.rules.is_empty());
}
#[test]
fn test_parse_multiple_rules() {
let grammar_str = "root ::= greeting\ngreeting ::= \"hello\"";
let g = Grammar::parse(grammar_str).unwrap();
assert_eq!(g.root, "root");
assert!(g.rules.contains_key("greeting"));
}
#[test]
fn test_parse_empty_grammar_fails() {
let result = Grammar::parse("");
assert!(result.is_err());
}
#[test]
fn test_parse_negated_char_class() {
let g = Grammar::parse("root ::= [^0-9]+").unwrap();
assert!(g.rules.contains_key("root"));
match g.rules.get("root").unwrap() {
GrammarNode::Repeat { node, .. } => match node.as_ref() {
GrammarNode::CharClass { negated, .. } => assert!(*negated),
other => panic!("expected CharClass, got {:?}", other),
},
other => panic!("expected Repeat, got {:?}", other),
}
}
#[test]
fn test_parse_grouping() {
let g = Grammar::parse(r#"root ::= ("a" | "b")+"#).unwrap();
assert!(g.rules.contains_key("root"));
}
#[test]
fn test_parse_json_simple() {
let grammar_str = r#"root ::= "{" ws "}"
ws ::= [ \t\n]*"#;
let g = Grammar::parse(grammar_str).unwrap();
assert_eq!(g.root, "root");
}
#[test]
fn test_parse_escape_newline() {
let g = Grammar::parse(r#"root ::= "\n""#).expect("test: \\n escape should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Literal(bytes) => assert_eq!(bytes, b"\n"),
other => panic!("expected Literal, got {:?}", other),
}
}
#[test]
fn test_parse_escape_tab() {
let g = Grammar::parse(r#"root ::= "\t""#).expect("test: \\t escape should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Literal(bytes) => assert_eq!(bytes, b"\t"),
other => panic!("expected Literal, got {:?}", other),
}
}
#[test]
fn test_parse_escape_carriage_return() {
let g = Grammar::parse(r#"root ::= "\r""#).expect("test: \\r escape should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Literal(bytes) => assert_eq!(bytes, b"\r"),
other => panic!("expected Literal, got {:?}", other),
}
}
#[test]
fn test_parse_escape_backslash() {
let g = Grammar::parse(r#"root ::= "\\""#).expect("test: \\\\ escape should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Literal(bytes) => assert_eq!(bytes, b"\\"),
other => panic!("expected Literal, got {:?}", other),
}
}
#[test]
fn test_parse_escape_quote() {
let g = Grammar::parse(r#"root ::= "\"hi\"""#).expect("test: escaped quote should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Literal(bytes) => {
assert_eq!(bytes[0], b'"');
assert_eq!(*bytes.last().expect("test: last byte"), b'"');
}
other => panic!("expected Literal, got {:?}", other),
}
}
#[test]
fn test_parse_escape_null() {
let g = Grammar::parse(r#"root ::= "\0""#).expect("test: \\0 escape should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Literal(bytes) => assert_eq!(bytes, &[0u8]),
other => panic!("expected Literal, got {:?}", other),
}
}
#[test]
fn test_parse_escape_hex() {
let g = Grammar::parse(r#"root ::= "\x41""#).expect("test: \\x41 should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Literal(bytes) => assert_eq!(bytes, &[0x41u8]),
other => panic!("expected Literal, got {:?}", other),
}
}
#[test]
fn test_parse_escape_unicode() {
let g = Grammar::parse(r#"root ::= "\u{0041}""#).expect("test: \\u{0041} should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Literal(bytes) => assert_eq!(bytes, b"A"),
other => panic!("expected Literal, got {:?}", other),
}
}
#[test]
fn test_parse_escape_unicode_multibyte() {
let g = Grammar::parse(r#"root ::= "\u{263A}""#).expect("test: \\u{263A} should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Literal(bytes) => {
assert_eq!(bytes, "☺".as_bytes());
}
other => panic!("expected Literal, got {:?}", other),
}
}
#[test]
fn test_parse_unterminated_string_errors() {
let result = Grammar::parse(r#"root ::= "abc"#);
assert!(result.is_err(), "unterminated string should be an error");
}
#[test]
fn test_parse_unknown_escape_errors() {
let result = Grammar::parse(r#"root ::= "\q""#);
assert!(result.is_err(), "unknown escape \\q should be an error");
}
#[test]
fn test_parse_truncated_escape_errors() {
let result = Grammar::parse("root ::= \"\\\"");
assert!(
result.is_err(),
"truncated escape at end of input should error"
);
}
#[test]
fn test_parse_hex_escape_invalid_digit_errors() {
let result = Grammar::parse(r#"root ::= "\xGG""#);
assert!(result.is_err(), "invalid hex digit in \\x should error");
}
#[test]
fn test_parse_unicode_escape_invalid_codepoint_errors() {
let grammar_str = "root ::= \"\\u{D800}\"";
let result = Grammar::parse(grammar_str);
assert!(
result.is_err(),
"surrogate codepoint \\u{{D800}} should error"
);
}
#[test]
fn test_parse_unicode_escape_too_many_digits_errors() {
let grammar_str = "root ::= \"\\u{1234567}\"";
let result = Grammar::parse(grammar_str);
assert!(result.is_err(), "\\u{{...}} with >6 digits should error");
}
#[test]
fn test_parse_char_class_escape_newline() {
let g = Grammar::parse(r"root ::= [\n]").expect("test: [\\n] should parse");
assert!(g.rules.contains_key("root"));
}
#[test]
fn test_parse_char_class_escape_tab() {
let g = Grammar::parse(r"root ::= [\t]").expect("test: [\\t] should parse");
assert!(g.rules.contains_key("root"));
}
#[test]
fn test_parse_char_class_escape_dash() {
let g = Grammar::parse(r"root ::= [\-]").expect("test: [\\-] should parse");
assert!(g.rules.contains_key("root"));
}
#[test]
fn test_parse_char_class_escape_caret() {
let g = Grammar::parse(r"root ::= [\^]").expect("test: [\\^] should parse");
assert!(g.rules.contains_key("root"));
}
#[test]
fn test_parse_char_class_escape_bracket() {
let g = Grammar::parse(r"root ::= [\]]").expect("test: [\\]] should parse");
assert!(g.rules.contains_key("root"));
}
#[test]
fn test_parse_char_class_hex_escape() {
let g = Grammar::parse(r"root ::= [\x41-\x5A]+").expect("test: [\\x41-\\x5A] should parse");
assert!(g.rules.contains_key("root"));
}
#[test]
fn test_parse_char_class_unknown_escape_errors() {
let result = Grammar::parse(r"root ::= [\q]");
assert!(
result.is_err(),
"unknown char class escape \\q should error"
);
}
#[test]
fn test_parse_unterminated_char_class_errors() {
let result = Grammar::parse("root ::= [a-z");
assert!(result.is_err(), "unterminated char class should error");
}
#[test]
fn test_parse_missing_assign_errors() {
let result = Grammar::parse("root = \"hello\"");
assert!(result.is_err(), "missing ::= should error");
}
#[test]
fn test_parse_unexpected_char_errors() {
let result = Grammar::parse("root ::= @");
assert!(
result.is_err(),
"unexpected char '@' in grammar atom should error"
);
}
#[test]
fn test_parse_unclosed_group_errors() {
let result = Grammar::parse(r#"root ::= ("abc""#);
assert!(result.is_err(), "unclosed group should error");
}
#[test]
fn test_parse_first_rule_becomes_root_when_no_root() {
let g = Grammar::parse("entry ::= \"hello\"").expect("test: single rule without root");
assert_eq!(g.root, "entry");
}
#[test]
fn test_parse_optional_quantifier() {
let g = Grammar::parse(r#"root ::= "a"?"#).expect("test: ? quantifier should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Repeat { min, max, .. } => {
assert_eq!(*min, 0);
assert_eq!(*max, Some(1));
}
other => panic!("expected Repeat, got {:?}", other),
}
}
#[test]
fn test_parse_star_quantifier() {
let g = Grammar::parse(r#"root ::= "a"*"#).expect("test: * quantifier should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Repeat { min, max, .. } => {
assert_eq!(*min, 0);
assert_eq!(*max, None);
}
other => panic!("expected Repeat, got {:?}", other),
}
}
#[test]
fn test_parse_plus_quantifier() {
let g = Grammar::parse(r#"root ::= "a"+"#).expect("test: + quantifier should parse");
match g.rules.get("root").expect("test: root rule") {
GrammarNode::Repeat { min, max, .. } => {
assert_eq!(*min, 1);
assert_eq!(*max, None);
}
other => panic!("expected Repeat, got {:?}", other),
}
}
#[test]
fn test_parse_comment_skipped() {
let grammar_str = "# This is a comment\nroot ::= \"hello\"";
let g = Grammar::parse(grammar_str).expect("test: comment should be ignored");
assert_eq!(g.root, "root");
}
#[test]
fn test_root_node_accessor_ok() {
let g = Grammar::parse(r#"root ::= "hi""#).expect("test: should parse");
g.root_node()
.expect("test: root_node() should succeed when root rule exists");
}
#[test]
fn test_root_node_accessor_missing_rule_errors() {
let g = Grammar {
rules: std::collections::HashMap::new(),
root: "missing".to_string(),
source: String::new(),
};
let result = g.root_node();
assert!(result.is_err(), "root_node() on missing rule should error");
match result {
Err(super::super::error::GrammarError::UnknownRule { rule }) => {
assert_eq!(rule, "missing");
}
other => panic!("expected UnknownRule, got {:?}", other),
}
}
#[test]
fn test_char_range_contains() {
let r = CharRange::range(b'a', b'z');
assert!(r.contains(b'a'));
assert!(r.contains(b'z'));
assert!(r.contains(b'm'));
assert!(!r.contains(b'A'));
assert!(!r.contains(b'0'));
}
#[test]
fn test_char_range_single() {
let r = CharRange::single(b'X');
assert!(r.contains(b'X'));
assert!(!r.contains(b'Y'));
assert!(!r.contains(b'W'));
}
}