#![forbid(unsafe_code)]
#![cfg_attr(feature = "strict_docs", deny(missing_docs))]
#![cfg_attr(not(feature = "strict_docs"), warn(missing_docs))]
use indexmap::IndexMap;
use serde::{Deserialize, Serialize};
use std::fmt;
pub mod error;
pub use error::{IrError, Result as IrResult};
pub mod optimizer;
pub use optimizer::{GrammarOptimizer, OptimizationStats, optimize_grammar};
pub mod validation;
pub use validation::{GrammarValidator, ValidationError, ValidationResult, ValidationWarning};
pub mod debug_macros;
pub mod symbol_registry;
pub use symbol_registry::{SymbolInfo, SymbolRegistry};
pub mod builder;
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct Grammar {
pub name: String,
pub rules: IndexMap<SymbolId, Vec<Rule>>,
pub tokens: IndexMap<SymbolId, Token>,
pub precedences: Vec<Precedence>,
pub conflicts: Vec<ConflictDeclaration>,
pub externals: Vec<ExternalToken>,
pub extras: Vec<SymbolId>,
pub fields: IndexMap<FieldId, String>,
pub supertypes: Vec<SymbolId>,
pub inline_rules: Vec<SymbolId>,
pub alias_sequences: IndexMap<ProductionId, AliasSequence>,
pub production_ids: IndexMap<RuleId, ProductionId>,
pub max_alias_sequence_length: usize,
pub rule_names: IndexMap<SymbolId, String>,
pub symbol_registry: Option<SymbolRegistry>,
}
impl Grammar {
pub fn add_rule(&mut self, rule: Rule) {
self.rules.entry(rule.lhs).or_default().push(rule);
}
pub fn get_rules_for_symbol(&self, symbol: SymbolId) -> Option<&Vec<Rule>> {
self.rules.get(&symbol)
}
pub fn all_rules(&self) -> impl Iterator<Item = &Rule> {
self.rules.values().flat_map(|rules| rules.iter())
}
pub fn start_symbol(&self) -> Option<SymbolId> {
if let Some(source_file_id) = self.find_symbol_by_name("source_file")
&& self.rules.contains_key(&source_file_id)
{
return Some(source_file_id);
}
for name in &["Expression", "Statement", "Program", "Module"] {
if let Some(symbol_id) = self.find_symbol_by_name(name)
&& self.rules.contains_key(&symbol_id)
{
return Some(symbol_id);
}
}
for (symbol_id, rules) in &self.rules {
if let Some(name) = self.rule_names.get(symbol_id)
&& !name.contains('_')
&& !rules.is_empty()
{
return Some(*symbol_id);
}
}
self.rules.keys().next().copied()
}
pub fn find_symbol_by_name(&self, name: &str) -> Option<SymbolId> {
for (symbol_id, symbol_name) in &self.rule_names {
if symbol_name == name {
return Some(*symbol_id);
}
}
None
}
pub fn get_or_build_registry(&mut self) -> &SymbolRegistry {
if self.symbol_registry.is_none() {
self.symbol_registry = Some(self.build_registry());
}
self.symbol_registry
.as_ref()
.expect("symbol_registry was just initialized above")
}
#[must_use = "validation result must be checked"]
pub fn check_empty_terminals(&self) -> Result<(), Vec<String>> {
let mut errors = Vec::new();
for (id, token) in &self.tokens {
match &token.pattern {
TokenPattern::String(s) if s.is_empty() => {
errors.push(format!(
"Token '{}' (id={}) has empty string pattern",
token.name, id.0
));
}
TokenPattern::Regex(r) if r.is_empty() => {
errors.push(format!(
"Token '{}' (id={}) has empty regex pattern",
token.name, id.0
));
}
_ => {}
}
}
if errors.is_empty() {
Ok(())
} else {
Err(errors)
}
}
pub fn build_registry(&self) -> SymbolRegistry {
let mut registry = SymbolRegistry::new();
let mut token_entries: Vec<_> = self.tokens.iter().collect();
token_entries.sort_by_key(|(_id, token)| {
let name = &token.name;
(name.starts_with('_'), name.clone())
});
for (symbol_id, token) in token_entries {
let metadata = SymbolMetadata {
visible: !token.name.starts_with('_'),
named: false,
hidden: self.extras.contains(symbol_id),
terminal: true,
};
registry.register(&token.name, metadata);
}
let mut rule_entries: Vec<_> = self.rule_names.iter().collect();
rule_entries.sort_by_key(|(_, name)| (*name).clone());
for (symbol_id, name) in rule_entries {
if !self.tokens.contains_key(symbol_id) {
let metadata = SymbolMetadata {
visible: !name.starts_with('_'),
named: true,
hidden: name.starts_with('_'),
terminal: false,
};
registry.register(name, metadata);
}
}
for external in &self.externals {
let metadata = SymbolMetadata {
visible: true,
named: false,
hidden: false,
terminal: true,
};
registry.register(&external.name, metadata);
}
registry
}
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Rule {
pub lhs: SymbolId,
pub rhs: Vec<Symbol>,
pub precedence: Option<PrecedenceKind>,
pub associativity: Option<Associativity>,
pub fields: Vec<(FieldId, usize)>,
pub production_id: ProductionId,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum PrecedenceKind {
Static(i16),
Dynamic(i16),
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
pub name: String,
pub pattern: TokenPattern,
pub fragile: bool,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum TokenPattern {
String(String),
Regex(String),
}
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub enum Symbol {
Terminal(SymbolId),
NonTerminal(SymbolId),
External(SymbolId),
Optional(Box<Symbol>),
Repeat(Box<Symbol>),
RepeatOne(Box<Symbol>),
Choice(Vec<Symbol>),
Sequence(Vec<Symbol>),
Epsilon,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AliasSequence {
pub aliases: Vec<Option<String>>,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Precedence {
pub level: i16,
pub associativity: Associativity,
pub symbols: Vec<SymbolId>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Associativity {
Left,
Right,
None,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ConflictDeclaration {
pub symbols: Vec<SymbolId>,
pub resolution: ConflictResolution,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ConflictResolution {
Precedence(PrecedenceKind),
Associativity(Associativity),
GLR,
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ExternalToken {
pub name: String,
pub symbol_id: SymbolId,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub struct SymbolId(pub u16);
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub struct RuleId(pub u16);
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub struct StateId(pub u16);
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct FieldId(pub u16);
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub struct ProductionId(pub u16);
impl fmt::Display for SymbolId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Symbol({})", self.0)
}
}
impl fmt::Display for RuleId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Rule({})", self.0)
}
}
impl fmt::Display for StateId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "State({})", self.0)
}
}
impl fmt::Display for FieldId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Field({})", self.0)
}
}
impl fmt::Display for ProductionId {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "Production({})", self.0)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct SymbolMetadata {
pub visible: bool,
pub named: bool,
pub hidden: bool,
pub terminal: bool,
}
impl Grammar {
pub fn new(name: String) -> Self {
Self {
name,
rules: IndexMap::new(),
tokens: IndexMap::new(),
precedences: Vec::new(),
conflicts: Vec::new(),
externals: Vec::new(),
extras: Vec::new(),
fields: IndexMap::new(),
supertypes: Vec::new(),
inline_rules: Vec::new(),
alias_sequences: IndexMap::new(),
production_ids: IndexMap::new(),
max_alias_sequence_length: 0,
rule_names: IndexMap::new(),
symbol_registry: None,
}
}
#[must_use = "parsing result must be checked"]
pub fn from_macro_output(data: &str) -> Result<Self, GrammarError> {
serde_json::from_str(data).map_err(GrammarError::ParseError)
}
fn validate_symbol(&self, symbol: &Symbol) -> Result<(), GrammarError> {
match symbol {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => {
if !self.rules.contains_key(id) && !self.tokens.contains_key(id) {
return Err(GrammarError::UnresolvedSymbol(*id));
}
}
Symbol::External(id) => {
if !self.externals.iter().any(|ext| ext.symbol_id == *id) {
return Err(GrammarError::UnresolvedExternalSymbol(*id));
}
}
Symbol::Optional(inner) | Symbol::Repeat(inner) | Symbol::RepeatOne(inner) => {
self.validate_symbol(inner)?;
}
Symbol::Choice(choices) => {
for s in choices {
self.validate_symbol(s)?;
}
}
Symbol::Sequence(seq) => {
for s in seq {
self.validate_symbol(s)?;
}
}
Symbol::Epsilon => {}
}
Ok(())
}
#[must_use = "validation result must be checked"]
pub fn validate(&self) -> Result<(), GrammarError> {
let mut field_names: Vec<_> = self.fields.values().collect();
field_names.sort();
let expected_order: Vec<_> = self.fields.values().collect();
if field_names != expected_order {
return Err(GrammarError::InvalidFieldOrdering);
}
for rule in self.all_rules() {
for symbol in &rule.rhs {
self.validate_symbol(symbol)?;
}
}
Ok(())
}
pub fn optimize(&mut self) {
}
pub fn normalize(&mut self) -> Vec<Rule> {
let max_id = self.rules.keys().map(|id| id.0).max().unwrap_or(0);
let mut aux_counter = max_id + 1000;
loop {
let mut found_complex = false;
let mut all_rules = Vec::new();
for (_lhs, rules) in &self.rules {
for rule in rules {
all_rules.push(rule.clone());
}
}
self.rules.clear();
for mut rule in all_rules {
let mut new_rhs = Vec::new();
for symbol in rule.rhs {
match symbol {
Symbol::Optional(inner) => {
found_complex = true;
let aux_id = SymbolId(aux_counter);
aux_counter += 1;
let inner_rule = Rule {
lhs: aux_id,
rhs: vec![*inner.clone()],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
};
self.add_rule(inner_rule);
self.add_rule(Rule {
lhs: aux_id,
rhs: vec![Symbol::Epsilon],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
new_rhs.push(Symbol::NonTerminal(aux_id));
}
Symbol::Repeat(inner) => {
found_complex = true;
let aux_id = SymbolId(aux_counter);
aux_counter += 1;
self.add_rule(Rule {
lhs: aux_id,
rhs: vec![Symbol::NonTerminal(aux_id), *inner.clone()],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
self.add_rule(Rule {
lhs: aux_id,
rhs: vec![Symbol::Epsilon],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
new_rhs.push(Symbol::NonTerminal(aux_id));
}
Symbol::RepeatOne(inner) => {
found_complex = true;
let aux_id = SymbolId(aux_counter);
aux_counter += 1;
self.add_rule(Rule {
lhs: aux_id,
rhs: vec![Symbol::NonTerminal(aux_id), *inner.clone()],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
self.add_rule(Rule {
lhs: aux_id,
rhs: vec![*inner],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
new_rhs.push(Symbol::NonTerminal(aux_id));
}
Symbol::Choice(choices) => {
found_complex = true;
let aux_id = SymbolId(aux_counter);
aux_counter += 1;
for choice in choices {
self.add_rule(Rule {
lhs: aux_id,
rhs: vec![choice],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
}
new_rhs.push(Symbol::NonTerminal(aux_id));
}
Symbol::Sequence(seq) => {
found_complex = true;
new_rhs.extend(seq);
}
other => new_rhs.push(other),
}
}
rule.rhs = new_rhs;
self.add_rule(rule);
}
if !found_complex {
break;
}
}
self.rules.values().flatten().cloned().collect()
}
}
#[derive(Debug, thiserror::Error)]
pub enum GrammarError {
#[error("Failed to parse grammar: {0}")]
ParseError(#[from] serde_json::Error),
#[error("Invalid field ordering - fields must be in lexicographic order")]
InvalidFieldOrdering,
#[error("Unresolved symbol reference: {0}")]
UnresolvedSymbol(SymbolId),
#[error("Unresolved external symbol reference: {0}")]
UnresolvedExternalSymbol(SymbolId),
#[error("Conflict in grammar: {0}")]
ConflictError(String),
#[error("Invalid precedence declaration: {0}")]
InvalidPrecedence(String),
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_grammar_creation() {
let grammar = Grammar::new("test".to_string());
assert_eq!(grammar.name, "test");
assert!(grammar.rules.is_empty());
assert!(grammar.tokens.is_empty());
assert!(grammar.precedences.is_empty());
assert!(grammar.conflicts.is_empty());
assert!(grammar.externals.is_empty());
assert!(grammar.fields.is_empty());
assert!(grammar.supertypes.is_empty());
assert!(grammar.inline_rules.is_empty());
assert!(grammar.alias_sequences.is_empty());
assert!(grammar.production_ids.is_empty());
assert_eq!(grammar.max_alias_sequence_length, 0);
}
#[test]
fn test_field_ordering_validation() {
let mut grammar = Grammar::new("test".to_string());
grammar.fields.insert(FieldId(1), "zebra".to_string());
grammar.fields.insert(FieldId(0), "alpha".to_string());
assert!(grammar.validate().is_err());
grammar.fields.clear();
grammar.fields.insert(FieldId(0), "alpha".to_string());
grammar.fields.insert(FieldId(1), "zebra".to_string());
assert!(grammar.validate().is_ok());
}
#[test]
fn test_symbol_id_display() {
let symbol_id = SymbolId(42);
assert_eq!(format!("{}", symbol_id), "Symbol(42)");
let rule_id = RuleId(10);
assert_eq!(format!("{}", rule_id), "Rule(10)");
let state_id = StateId(5);
assert_eq!(format!("{}", state_id), "State(5)");
let field_id = FieldId(3);
assert_eq!(format!("{}", field_id), "Field(3)");
let production_id = ProductionId(7);
assert_eq!(format!("{}", production_id), "Production(7)");
}
#[test]
fn test_precedence_kinds() {
let static_prec = PrecedenceKind::Static(5);
let dynamic_prec = PrecedenceKind::Dynamic(10);
match static_prec {
PrecedenceKind::Static(level) => assert_eq!(level, 5),
_ => panic!("Expected static precedence"),
}
match dynamic_prec {
PrecedenceKind::Dynamic(level) => assert_eq!(level, 10),
_ => panic!("Expected dynamic precedence"),
}
}
#[test]
fn test_symbol_types() {
let terminal = Symbol::Terminal(SymbolId(1));
let non_terminal = Symbol::NonTerminal(SymbolId(2));
let external = Symbol::External(SymbolId(3));
match terminal {
Symbol::Terminal(SymbolId(1)) => {}
_ => panic!("Expected terminal symbol"),
}
match non_terminal {
Symbol::NonTerminal(SymbolId(2)) => {}
_ => panic!("Expected non-terminal symbol"),
}
match external {
Symbol::External(SymbolId(3)) => {}
_ => panic!("Expected external symbol"),
}
assert_eq!(terminal, Symbol::Terminal(SymbolId(1)));
assert_ne!(terminal, non_terminal);
let mut set = std::collections::HashSet::new();
set.insert(terminal.clone());
assert!(set.contains(&terminal));
assert!(!set.contains(&non_terminal));
}
#[test]
fn test_token_patterns() {
let string_pattern = TokenPattern::String("hello".to_string());
let regex_pattern = TokenPattern::Regex(r"\d+".to_string());
match string_pattern {
TokenPattern::String(s) => assert_eq!(s, "hello"),
_ => panic!("Expected string pattern"),
}
match regex_pattern {
TokenPattern::Regex(r) => assert_eq!(r, r"\d+"),
_ => panic!("Expected regex pattern"),
}
}
#[test]
fn test_associativity() {
let left = Associativity::Left;
let right = Associativity::Right;
let none = Associativity::None;
assert_eq!(left, Associativity::Left);
assert_eq!(right, Associativity::Right);
assert_eq!(none, Associativity::None);
assert_ne!(left, right);
assert_ne!(left, none);
assert_ne!(right, none);
}
#[test]
fn test_conflict_resolution() {
let precedence_resolution = ConflictResolution::Precedence(PrecedenceKind::Static(5));
let associativity_resolution = ConflictResolution::Associativity(Associativity::Left);
let glr_resolution = ConflictResolution::GLR;
match precedence_resolution {
ConflictResolution::Precedence(PrecedenceKind::Static(5)) => {}
_ => panic!("Expected precedence resolution"),
}
match associativity_resolution {
ConflictResolution::Associativity(Associativity::Left) => {}
_ => panic!("Expected associativity resolution"),
}
match glr_resolution {
ConflictResolution::GLR => {}
_ => panic!("Expected GLR resolution"),
}
}
#[test]
fn test_grammar_with_rules_and_tokens() {
let mut grammar = Grammar::new("test_grammar".to_string());
let rule = Rule {
lhs: SymbolId(0), rhs: vec![Symbol::Terminal(SymbolId(1))], precedence: Some(PrecedenceKind::Static(1)),
associativity: Some(Associativity::Left),
fields: vec![(FieldId(0), 0)],
production_id: ProductionId(0),
};
grammar.add_rule(rule);
let token = Token {
name: "NUMBER".to_string(),
pattern: TokenPattern::Regex(r"\d+".to_string()),
fragile: false,
};
grammar.tokens.insert(SymbolId(1), token);
grammar.fields.insert(FieldId(0), "left".to_string());
grammar.fields.insert(FieldId(1), "right".to_string());
match grammar.validate() {
Ok(_) => {}
Err(e) => panic!("Grammar validation failed: {:?}", e),
}
assert_eq!(grammar.rules.len(), 1);
assert_eq!(grammar.tokens.len(), 1);
assert_eq!(grammar.fields.len(), 2);
}
#[test]
fn test_grammar_validation_unresolved_symbol() {
let mut grammar = Grammar::new("test".to_string());
let rule = Rule {
lhs: SymbolId(0),
rhs: vec![Symbol::Terminal(SymbolId(999))], precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
};
grammar.add_rule(rule);
assert!(grammar.validate().is_err());
match grammar.validate() {
Err(GrammarError::UnresolvedSymbol(SymbolId(999))) => {}
_ => panic!("Expected unresolved symbol error"),
}
}
#[test]
fn test_grammar_validation_unresolved_external() {
let mut grammar = Grammar::new("test".to_string());
let rule = Rule {
lhs: SymbolId(0),
rhs: vec![Symbol::External(SymbolId(999))], precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
};
grammar.add_rule(rule);
assert!(grammar.validate().is_err());
match grammar.validate() {
Err(GrammarError::UnresolvedExternalSymbol(SymbolId(999))) => {}
_ => panic!("Expected unresolved external symbol error"),
}
}
#[test]
fn test_alias_sequence() {
let alias_seq = AliasSequence {
aliases: vec![Some("alias1".to_string()), None, Some("alias2".to_string())],
};
assert_eq!(alias_seq.aliases.len(), 3);
assert_eq!(alias_seq.aliases[0], Some("alias1".to_string()));
assert_eq!(alias_seq.aliases[1], None);
assert_eq!(alias_seq.aliases[2], Some("alias2".to_string()));
}
#[test]
fn test_external_token() {
let external_token = ExternalToken {
name: "HERE_STRING".to_string(),
symbol_id: SymbolId(42),
};
assert_eq!(external_token.name, "HERE_STRING");
assert_eq!(external_token.symbol_id, SymbolId(42));
}
#[test]
fn test_precedence() {
let precedence = Precedence {
level: 10,
associativity: Associativity::Right,
symbols: vec![SymbolId(1), SymbolId(2), SymbolId(3)],
};
assert_eq!(precedence.level, 10);
assert_eq!(precedence.associativity, Associativity::Right);
assert_eq!(precedence.symbols.len(), 3);
assert!(precedence.symbols.contains(&SymbolId(2)));
}
#[test]
fn test_conflict_declaration() {
let conflict = ConflictDeclaration {
symbols: vec![SymbolId(1), SymbolId(2)],
resolution: ConflictResolution::GLR,
};
assert_eq!(conflict.symbols.len(), 2);
assert!(conflict.symbols.contains(&SymbolId(1)));
assert!(conflict.symbols.contains(&SymbolId(2)));
match conflict.resolution {
ConflictResolution::GLR => {}
_ => panic!("Expected GLR resolution"),
}
}
}