use std::collections::HashMap;
use super::ast::{Grammar, NonTerminalId, Rule, Symbol};
#[derive(Debug, Clone, PartialEq)]
pub enum GbnfParseError {
EmptyGrammar,
MissingRootRule,
UnknownRule(String),
UnexpectedChar { line: usize, col: usize, ch: char },
UnterminatedString,
UnterminatedCharClass,
InvalidEscape(char),
UnsupportedFeature(String),
RecursionLimit,
}
impl std::fmt::Display for GbnfParseError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::EmptyGrammar => write!(f, "GBNF grammar is empty (no rules found)"),
Self::MissingRootRule => write!(f, "GBNF grammar has no `root` rule"),
Self::UnknownRule(name) => write!(f, "rule `{name}` referenced but never defined"),
Self::UnexpectedChar { line, col, ch } => {
write!(f, "line {line}:{col}: unexpected character `{ch}`")
}
Self::UnterminatedString => write!(f, "unterminated string literal"),
Self::UnterminatedCharClass => write!(f, "unterminated character class `[...]`"),
Self::InvalidEscape(ch) => write!(f, "invalid escape sequence `\\{ch}`"),
Self::UnsupportedFeature(feat) => {
write!(f, "unsupported GBNF feature: {feat}")
}
Self::RecursionLimit => write!(f, "group nesting depth exceeded limit"),
}
}
}
impl std::error::Error for GbnfParseError {}
const MAX_DEPTH: usize = 64;
#[derive(Debug, Clone, PartialEq)]
enum Token {
Ident(String),
Assign,
Pipe,
Star,
Plus,
Question,
LParen,
RParen,
StringLit(Vec<u8>),
CharClass(Vec<u8>),
Newline,
Eof,
}
struct Lexer {
chars: Vec<char>,
pos: usize,
line: usize,
col: usize,
}
impl Lexer {
fn new(src: &str) -> Self {
Self {
chars: src.chars().collect(),
pos: 0,
line: 1,
col: 1,
}
}
fn peek(&self) -> Option<char> {
self.chars.get(self.pos).copied()
}
fn peek2(&self) -> Option<char> {
self.chars.get(self.pos + 1).copied()
}
fn advance(&mut self) -> Option<char> {
let ch = self.chars.get(self.pos).copied()?;
self.pos += 1;
if ch == '\n' {
self.line += 1;
self.col = 1;
} else {
self.col += 1;
}
Some(ch)
}
fn skip_spaces(&mut self) {
while matches!(self.peek(), Some(' ') | Some('\t') | Some('\r')) {
self.advance();
}
}
fn parse_escape_byte(&mut self) -> Result<u8, GbnfParseError> {
match self.advance() {
None => Err(GbnfParseError::UnterminatedString),
Some('n') => Ok(b'\n'),
Some('r') => Ok(b'\r'),
Some('t') => Ok(b'\t'),
Some('\\') => Ok(b'\\'),
Some('"') => Ok(b'"'),
Some('\'') => Ok(b'\''),
Some('0') => Ok(0u8),
Some('x') => self.parse_hex_escape(),
Some(ch) => {
Err(GbnfParseError::InvalidEscape(ch))
}
}
}
fn parse_hex_escape(&mut self) -> Result<u8, GbnfParseError> {
let hi = self.advance().ok_or(GbnfParseError::UnterminatedString)?;
let lo = self.advance().ok_or(GbnfParseError::UnterminatedString)?;
let hex = format!("{hi}{lo}");
u8::from_str_radix(&hex, 16).map_err(|_| GbnfParseError::UnexpectedChar {
line: self.line,
col: self.col,
ch: hi,
})
}
fn parse_string_lit(&mut self) -> Result<Vec<u8>, GbnfParseError> {
let mut bytes: Vec<u8> = Vec::new();
loop {
match self.peek() {
None => return Err(GbnfParseError::UnterminatedString),
Some('"') => {
self.advance();
break;
}
Some('\\') => {
self.advance(); let b = self.parse_escape_byte()?;
bytes.push(b);
}
Some(ch) => {
let mut buf = [0u8; 4];
let encoded = ch.encode_utf8(&mut buf);
bytes.extend_from_slice(encoded.as_bytes());
self.advance();
}
}
}
Ok(bytes)
}
fn parse_char_class(&mut self) -> Result<Vec<u8>, GbnfParseError> {
let negated = if self.peek() == Some('^') {
self.advance();
true
} else {
false
};
let mut set = [false; 256];
loop {
match self.peek() {
None => return Err(GbnfParseError::UnterminatedCharClass),
Some(']') => {
self.advance();
break;
}
Some('\\') => {
self.advance(); let b = self.parse_escape_byte()?;
if self.peek() == Some('-') && self.peek2() != Some(']') {
self.advance(); let end_byte = self.parse_class_single_byte()?;
for byte in b..=end_byte {
set[byte as usize] = true;
}
} else {
set[b as usize] = true;
}
}
Some(ch) => {
let mut buf = [0u8; 4];
let encoded = ch.encode_utf8(&mut buf);
if encoded.len() > 1 {
return Err(GbnfParseError::UnsupportedFeature(
"non-ASCII character in character class".to_string(),
));
}
let b = encoded.as_bytes()[0];
self.advance();
if self.peek() == Some('-') && self.peek2() != Some(']') {
self.advance(); let end_byte = self.parse_class_single_byte()?;
for byte in b..=end_byte {
set[byte as usize] = true;
}
} else {
set[b as usize] = true;
}
}
}
}
if negated {
for b in &mut set {
*b = !*b;
}
}
let bytes: Vec<u8> = (0u8..=255u8).filter(|&b| set[b as usize]).collect();
Ok(bytes)
}
fn parse_class_single_byte(&mut self) -> Result<u8, GbnfParseError> {
match self.peek() {
None => Err(GbnfParseError::UnterminatedCharClass),
Some(']') => Err(GbnfParseError::UnterminatedCharClass),
Some('\\') => {
self.advance(); self.parse_escape_byte()
}
Some(ch) => {
let mut buf = [0u8; 4];
let encoded = ch.encode_utf8(&mut buf);
if encoded.len() > 1 {
return Err(GbnfParseError::UnsupportedFeature(
"non-ASCII character in character class range".to_string(),
));
}
let b = encoded.as_bytes()[0];
self.advance();
Ok(b)
}
}
}
fn tokenise(&mut self) -> Result<Vec<Token>, GbnfParseError> {
let mut tokens: Vec<Token> = Vec::new();
loop {
self.skip_spaces();
match self.peek() {
None => {
tokens.push(Token::Eof);
break;
}
Some('#') => {
while matches!(self.peek(), Some(ch) if ch != '\n') {
self.advance();
}
}
Some('\n') => {
self.advance();
tokens.push(Token::Newline);
}
Some(':') => {
let (line, col) = (self.line, self.col);
if self.chars.get(self.pos + 1).copied() == Some(':')
&& self.chars.get(self.pos + 2).copied() == Some('=')
{
self.pos += 3;
self.col += 3;
tokens.push(Token::Assign);
} else {
return Err(GbnfParseError::UnexpectedChar { line, col, ch: ':' });
}
}
Some('|') => {
self.advance();
tokens.push(Token::Pipe);
}
Some('*') => {
self.advance();
tokens.push(Token::Star);
}
Some('+') => {
self.advance();
tokens.push(Token::Plus);
}
Some('?') => {
self.advance();
tokens.push(Token::Question);
}
Some('(') => {
self.advance();
tokens.push(Token::LParen);
}
Some(')') => {
self.advance();
tokens.push(Token::RParen);
}
Some('"') => {
self.advance(); let bytes = self.parse_string_lit()?;
tokens.push(Token::StringLit(bytes));
}
Some('[') => {
self.advance(); let bytes = self.parse_char_class()?;
tokens.push(Token::CharClass(bytes));
}
Some(ch) if ch.is_ascii_alphabetic() || ch == '_' => {
let mut ident = String::new();
while let Some(c) = self.peek() {
if c.is_ascii_alphanumeric() || c == '_' || c == '-' {
ident.push(c);
self.advance();
} else {
break;
}
}
tokens.push(Token::Ident(ident));
}
Some(ch) => {
let (line, col) = (self.line, self.col);
return Err(GbnfParseError::UnexpectedChar { line, col, ch });
}
}
}
Ok(tokens)
}
}
fn collect_rule_names(tokens: &[Token]) -> Vec<String> {
let mut names: Vec<String> = Vec::new();
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
let mut i = 0;
while i < tokens.len() {
if let Token::Ident(name) = &tokens[i] {
let mut j = i + 1;
while j < tokens.len() {
match &tokens[j] {
Token::Newline => j += 1,
_ => break,
}
}
if matches!(tokens.get(j), Some(Token::Assign)) && !seen.contains(name) {
seen.insert(name.clone());
names.push(name.clone());
}
}
i += 1;
}
names
}
struct Parser<'g> {
tokens: Vec<Token>,
pos: usize,
nt_map: HashMap<String, NonTerminalId>,
defined: std::collections::HashSet<NonTerminalId>,
grammar: &'g mut Grammar,
synthetic_counter: usize,
}
impl<'g> Parser<'g> {
fn new(
tokens: Vec<Token>,
nt_map: HashMap<String, NonTerminalId>,
grammar: &'g mut Grammar,
) -> Self {
Self {
tokens,
pos: 0,
nt_map,
defined: std::collections::HashSet::new(),
grammar,
synthetic_counter: 0,
}
}
fn peek(&self) -> &Token {
self.tokens.get(self.pos).unwrap_or(&Token::Eof)
}
fn advance(&mut self) -> &Token {
let tok = self.tokens.get(self.pos).unwrap_or(&Token::Eof);
if self.pos < self.tokens.len() {
self.pos += 1;
}
tok
}
fn skip_newlines(&mut self) {
while matches!(self.peek(), Token::Newline) {
self.advance();
}
}
fn next_synthetic_id(&mut self) -> usize {
let id = self.synthetic_counter;
self.synthetic_counter += 1;
id
}
fn alloc_synthetic(&mut self, prefix: &str) -> NonTerminalId {
let id = self.next_synthetic_id();
let name = format!("__{prefix}_{id}");
self.grammar.alloc_nt(&name)
}
fn resolve_nt(&self, name: &str) -> Result<NonTerminalId, GbnfParseError> {
self.nt_map
.get(name)
.copied()
.ok_or_else(|| GbnfParseError::UnknownRule(name.to_string()))
}
fn parse_all(&mut self) -> Result<(), GbnfParseError> {
self.skip_newlines();
while !matches!(self.peek(), Token::Eof) {
self.parse_one_rule()?;
self.skip_newlines();
}
Ok(())
}
fn parse_one_rule(&mut self) -> Result<(), GbnfParseError> {
let lhs_name = match self.advance().clone() {
Token::Ident(name) => name,
Token::Eof => return Ok(()),
tok => {
return Err(GbnfParseError::UnexpectedChar {
line: 0,
col: 0,
ch: token_to_char(&tok),
});
}
};
self.skip_newlines();
match self.advance().clone() {
Token::Assign => {}
tok => {
return Err(GbnfParseError::UnexpectedChar {
line: 0,
col: 0,
ch: token_to_char(&tok),
});
}
}
let lhs_id = self.resolve_nt(&lhs_name)?;
self.defined.insert(lhs_id);
self.parse_body_into(lhs_id, 0)?;
Ok(())
}
fn parse_body_into(
&mut self,
lhs_id: NonTerminalId,
depth: usize,
) -> Result<(), GbnfParseError> {
if depth > MAX_DEPTH {
return Err(GbnfParseError::RecursionLimit);
}
loop {
let rhs = self.parse_alternative(depth)?;
self.grammar.add_rule(Rule::new(lhs_id, rhs));
if matches!(self.peek(), Token::Pipe) {
self.advance(); self.skip_newlines();
continue;
}
break;
}
Ok(())
}
fn parse_alternative(&mut self, depth: usize) -> Result<Vec<Symbol>, GbnfParseError> {
let mut sequence: Vec<Symbol> = Vec::new();
loop {
match self.peek() {
Token::Eof | Token::RParen => break,
Token::Pipe => break,
Token::Newline => {
let mut look = self.pos + 1;
while look < self.tokens.len() {
match &self.tokens[look] {
Token::Newline => look += 1,
Token::Pipe => {
while matches!(self.peek(), Token::Newline) {
self.advance();
}
return Ok(sequence);
}
_ => break,
}
}
break;
}
Token::Ident(_) if self.is_new_rule_start() => break,
_ => {
let atom_sym = self.parse_atom(depth)?;
let sym = self.apply_quantifier(atom_sym, depth)?;
sequence.extend(sym);
}
}
}
Ok(sequence)
}
fn is_new_rule_start(&self) -> bool {
if !matches!(self.peek(), Token::Ident(_)) {
return false;
}
let mut j = self.pos + 1;
while j < self.tokens.len() {
match &self.tokens[j] {
Token::Newline => j += 1,
Token::Assign => return true,
_ => return false,
}
}
false
}
fn parse_atom(&mut self, depth: usize) -> Result<AtomResult, GbnfParseError> {
if depth > MAX_DEPTH {
return Err(GbnfParseError::RecursionLimit);
}
match self.peek().clone() {
Token::StringLit(bytes) => {
self.advance();
Ok(AtomResult::Inline(bytes))
}
Token::CharClass(bytes) => {
self.advance();
let class_nt = self.alloc_synthetic("gbnf_class");
for b in &bytes {
self.grammar
.add_rule(Rule::new(class_nt, vec![Symbol::Terminal(vec![*b])]));
}
Ok(AtomResult::Single(Symbol::NonTerminal(class_nt)))
}
Token::Ident(name) => {
self.advance();
let nt_id = self.resolve_nt(&name)?;
Ok(AtomResult::Single(Symbol::NonTerminal(nt_id)))
}
Token::LParen => {
self.advance(); let group_nt = self.alloc_synthetic("gbnf_group");
self.parse_body_into(group_nt, depth + 1)?;
match self.advance().clone() {
Token::RParen => {}
tok => {
return Err(GbnfParseError::UnexpectedChar {
line: 0,
col: 0,
ch: token_to_char(&tok),
});
}
}
Ok(AtomResult::Single(Symbol::NonTerminal(group_nt)))
}
tok => Err(GbnfParseError::UnexpectedChar {
line: 0,
col: 0,
ch: token_to_char(&tok),
}),
}
}
fn apply_quantifier(
&mut self,
atom: AtomResult,
depth: usize,
) -> Result<Vec<Symbol>, GbnfParseError> {
if depth > MAX_DEPTH {
return Err(GbnfParseError::RecursionLimit);
}
let quantifier = match self.peek() {
Token::Star => {
self.advance();
Some('*')
}
Token::Plus => {
self.advance();
Some('+')
}
Token::Question => {
self.advance();
Some('?')
}
_ => None,
};
match quantifier {
None => {
Ok(atom_to_symbols(atom))
}
Some('?') => {
let opt_nt = self.alloc_synthetic("gbnf_opt");
self.grammar.add_rule(Rule::new(opt_nt, vec![]));
let rhs = atom_to_symbols(atom);
self.grammar.add_rule(Rule::new(opt_nt, rhs));
Ok(vec![Symbol::NonTerminal(opt_nt)])
}
Some('*') => {
let star_nt = self.alloc_synthetic("gbnf_star");
let rhs = atom_to_symbols(atom);
self.grammar.add_rule(Rule::new(star_nt, vec![]));
let mut star_rhs = rhs;
star_rhs.push(Symbol::NonTerminal(star_nt));
self.grammar.add_rule(Rule::new(star_nt, star_rhs));
Ok(vec![Symbol::NonTerminal(star_nt)])
}
Some('+') => {
let star_nt = self.alloc_synthetic("gbnf_star");
let atom_rhs = atom_to_symbols(atom);
self.grammar.add_rule(Rule::new(star_nt, vec![]));
let mut star_rhs = atom_rhs.clone();
star_rhs.push(Symbol::NonTerminal(star_nt));
self.grammar.add_rule(Rule::new(star_nt, star_rhs));
let plus_nt = self.alloc_synthetic("gbnf_plus");
let mut plus_rhs = atom_rhs;
plus_rhs.push(Symbol::NonTerminal(star_nt));
self.grammar.add_rule(Rule::new(plus_nt, plus_rhs));
Ok(vec![Symbol::NonTerminal(plus_nt)])
}
_ => unreachable!(),
}
}
}
enum AtomResult {
Inline(Vec<u8>),
Single(Symbol),
}
fn atom_to_symbols(atom: AtomResult) -> Vec<Symbol> {
match atom {
AtomResult::Inline(bytes) => bytes
.into_iter()
.map(|b| Symbol::Terminal(vec![b]))
.collect(),
AtomResult::Single(sym) => vec![sym],
}
}
fn token_to_char(tok: &Token) -> char {
match tok {
Token::Assign => ':',
Token::Pipe => '|',
Token::Star => '*',
Token::Plus => '+',
Token::Question => '?',
Token::LParen => '(',
Token::RParen => ')',
Token::Newline => '\n',
Token::Eof => '\0',
Token::Ident(_) => 'i',
Token::StringLit(_) => '"',
Token::CharClass(_) => '[',
}
}
pub fn parse_gbnf(src: &str) -> Result<Grammar, GbnfParseError> {
let mut lexer = Lexer::new(src);
let tokens = lexer.tokenise()?;
let has_content = tokens
.iter()
.any(|t| !matches!(t, Token::Newline | Token::Eof));
if !has_content {
return Err(GbnfParseError::EmptyGrammar);
}
let rule_names = collect_rule_names(&tokens);
if rule_names.is_empty() {
return Err(GbnfParseError::EmptyGrammar);
}
if !rule_names.iter().any(|n| n == "root") {
return Err(GbnfParseError::MissingRootRule);
}
let mut grammar = Grammar::new(0);
let mut nt_map: HashMap<String, NonTerminalId> = HashMap::new();
for name in &rule_names {
let id = grammar.alloc_nt(name);
nt_map.insert(name.clone(), id);
}
let root_id = *nt_map.get("root").expect("root was in rule_names");
grammar.start = root_id;
let mut parser = Parser::new(tokens, nt_map.clone(), &mut grammar);
parser.parse_all()?;
let defined = parser.defined;
let violations: Vec<String> = grammar
.rules
.iter()
.flat_map(|rule| rule.rhs.iter())
.filter_map(|sym| {
if let Symbol::NonTerminal(id) = sym {
let name = grammar.nt_names.get(id).cloned().unwrap_or_default();
if !name.starts_with("__") && !defined.contains(id) {
return Some(name);
}
}
None
})
.collect();
if let Some(unknown) = violations.into_iter().next() {
return Err(GbnfParseError::UnknownRule(unknown));
}
Ok(grammar)
}
#[cfg(test)]
mod tests {
use super::*;
fn parse_ok(src: &str) -> Grammar {
parse_gbnf(src).unwrap_or_else(|e| panic!("parse_gbnf failed: {e}"))
}
#[test]
fn parse_simple_literal() {
let g = parse_ok(r#"root ::= "hello""#);
let root_rules: Vec<_> = g.rules_for(g.start()).collect();
assert!(!root_rules.is_empty());
}
#[test]
fn parse_alternation_internal() {
let g = parse_ok(r#"root ::= "cat" | "dog""#);
let root_rules: Vec<_> = g.rules_for(g.start()).collect();
assert_eq!(root_rules.len(), 2);
}
#[test]
fn error_empty_grammar_internal() {
assert!(matches!(parse_gbnf(""), Err(GbnfParseError::EmptyGrammar)));
}
#[test]
fn error_missing_root_internal() {
let err = parse_gbnf("word ::= \"x\"");
assert!(matches!(err, Err(GbnfParseError::MissingRootRule)));
}
#[test]
fn parse_star_quantifier_internal() {
let g = parse_ok(r#"root ::= "a"*"#);
assert!(!g.rules.is_empty());
let start_has_rule = g.rules.iter().any(|r| r.lhs == g.start);
assert!(start_has_rule, "root NT must have at least one rule");
}
#[test]
fn error_display_non_empty() {
let e = GbnfParseError::EmptyGrammar;
assert!(!e.to_string().is_empty());
let e2 = GbnfParseError::UnknownRule("foo".to_string());
assert!(e2.to_string().contains("foo"));
}
}