use crate::{FieldId, Grammar, Symbol, SymbolId};
use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt;
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationError {
UndefinedSymbol {
symbol: SymbolId,
location: String,
},
UnreachableSymbol {
symbol: SymbolId,
name: String,
},
NonProductiveSymbol {
symbol: SymbolId,
name: String,
},
CyclicRule {
symbols: Vec<SymbolId>,
},
DuplicateRule {
symbol: SymbolId,
existing_count: usize,
},
InvalidField {
field_id: FieldId,
rule_symbol: SymbolId,
},
EmptyGrammar,
NoExplicitStartRule,
ConflictingPrecedence {
symbol: SymbolId,
precedences: Vec<i16>,
},
InvalidRegex {
token: SymbolId,
pattern: String,
error: String,
},
ExternalTokenConflict {
token1: String,
token2: String,
},
}
impl fmt::Display for ValidationError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ValidationError::UndefinedSymbol { symbol, location } => {
write!(
f,
"Undefined symbol {:?} referenced in {}",
symbol, location
)
}
ValidationError::UnreachableSymbol { symbol, name } => {
write!(
f,
"Symbol '{}' ({:?}) is unreachable from start symbol",
name, symbol
)
}
ValidationError::NonProductiveSymbol { symbol, name } => {
write!(
f,
"Symbol '{}' ({:?}) cannot derive any terminal strings",
name, symbol
)
}
ValidationError::CyclicRule { symbols } => {
write!(f, "Cyclic dependency detected: {:?}", symbols)
}
ValidationError::DuplicateRule {
symbol,
existing_count,
} => {
write!(
f,
"Symbol {:?} has {} rule definitions (expected 1)",
symbol, existing_count
)
}
ValidationError::InvalidField {
field_id,
rule_symbol,
} => {
write!(
f,
"Invalid field {:?} in rule for symbol {:?}",
field_id, rule_symbol
)
}
ValidationError::EmptyGrammar => {
write!(f, "Grammar has no rules defined")
}
ValidationError::NoExplicitStartRule => {
write!(
f,
"No explicit start rule defined (first rule will be used)"
)
}
ValidationError::ConflictingPrecedence {
symbol,
precedences,
} => {
write!(
f,
"Symbol {:?} has conflicting precedences: {:?}",
symbol, precedences
)
}
ValidationError::InvalidRegex {
token,
pattern,
error,
} => {
write!(
f,
"Invalid regex pattern for token {:?}: '{}' - {}",
token, pattern, error
)
}
ValidationError::ExternalTokenConflict { token1, token2 } => {
write!(f, "External tokens '{}' and '{}' conflict", token1, token2)
}
}
}
}
pub struct GrammarValidator {
errors: Vec<ValidationError>,
warnings: Vec<ValidationWarning>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationWarning {
UnusedToken {
token: SymbolId,
name: String,
},
DuplicateTokenPattern {
tokens: Vec<SymbolId>,
pattern: String,
},
AmbiguousGrammar {
message: String,
},
MissingFieldNames {
rule_symbol: SymbolId,
},
InefficientRule {
symbol: SymbolId,
suggestion: String,
},
}
impl fmt::Display for ValidationWarning {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
ValidationWarning::UnusedToken { token, name } => {
write!(
f,
"Token '{}' ({:?}) is defined but never used",
name, token
)
}
ValidationWarning::DuplicateTokenPattern { tokens, pattern } => {
write!(
f,
"Multiple tokens have the same pattern '{}': {:?}",
pattern, tokens
)
}
ValidationWarning::AmbiguousGrammar { message } => {
write!(f, "Grammar ambiguity detected: {}", message)
}
ValidationWarning::MissingFieldNames { rule_symbol } => {
write!(f, "Rule for symbol {:?} has no field names", rule_symbol)
}
ValidationWarning::InefficientRule { symbol, suggestion } => {
write!(
f,
"Inefficient rule for symbol {:?}: {}",
symbol, suggestion
)
}
}
}
}
pub struct ValidationResult {
pub errors: Vec<ValidationError>,
pub warnings: Vec<ValidationWarning>,
pub stats: ValidationStats,
}
#[derive(Debug, Clone, Default)]
pub struct ValidationStats {
pub total_symbols: usize,
pub total_tokens: usize,
pub total_rules: usize,
pub reachable_symbols: usize,
pub productive_symbols: usize,
pub external_tokens: usize,
pub max_rule_length: usize,
pub avg_rule_length: f64,
}
impl Default for GrammarValidator {
fn default() -> Self {
Self::new()
}
}
impl GrammarValidator {
pub fn new() -> Self {
Self {
errors: Vec::new(),
warnings: Vec::new(),
}
}
pub fn validate(&mut self, grammar: &Grammar) -> ValidationResult {
self.errors.clear();
self.warnings.clear();
let mut stats = ValidationStats::default();
self.check_empty_grammar(grammar);
let defined_symbols = self.collect_defined_symbols(grammar);
let used_symbols = self.collect_used_symbols(grammar);
self.check_undefined_symbols(&defined_symbols, &used_symbols, grammar);
let reachable = self.find_reachable_symbols(grammar);
self.check_unreachable_symbols(&reachable, &defined_symbols, grammar);
let productive = self.find_productive_symbols(grammar);
self.check_non_productive_symbols(&productive, &defined_symbols, grammar);
self.validate_tokens(grammar);
self.validate_fields(grammar);
self.validate_precedences(grammar);
self.validate_external_tokens(grammar);
self.check_cycles(grammar);
self.check_inefficiencies(grammar);
stats.total_symbols = defined_symbols.len();
stats.total_tokens = grammar.tokens.len();
stats.total_rules = grammar.rules.values().map(|v| v.len()).sum();
stats.reachable_symbols = reachable.len();
stats.productive_symbols = productive.len();
stats.external_tokens = grammar.externals.len();
if !grammar.rules.is_empty() {
let all_rules: Vec<_> = grammar.all_rules().collect();
let total_length: usize = all_rules.iter().map(|r| r.rhs.len()).sum();
stats.max_rule_length = all_rules.iter().map(|r| r.rhs.len()).max().unwrap_or(0);
stats.avg_rule_length = total_length as f64 / all_rules.len() as f64;
}
ValidationResult {
errors: self.errors.clone(),
warnings: self.warnings.clone(),
stats,
}
}
fn check_empty_grammar(&mut self, grammar: &Grammar) {
if grammar.rules.is_empty() {
self.errors.push(ValidationError::EmptyGrammar);
}
}
fn collect_defined_symbols(&self, grammar: &Grammar) -> HashSet<SymbolId> {
let mut defined = HashSet::new();
defined.extend(grammar.tokens.keys());
defined.extend(grammar.rules.keys());
for external in &grammar.externals {
defined.insert(external.symbol_id);
}
defined
}
fn collect_used_in_symbol(symbol: &Symbol, used: &mut HashSet<SymbolId>) {
match symbol {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => {
used.insert(*id);
}
Symbol::External(id) => {
used.insert(SymbolId(id.0));
}
Symbol::Optional(inner) | Symbol::Repeat(inner) | Symbol::RepeatOne(inner) => {
Self::collect_used_in_symbol(inner, used);
}
Symbol::Choice(choices) => {
for s in choices {
Self::collect_used_in_symbol(s, used);
}
}
Symbol::Sequence(seq) => {
for s in seq {
Self::collect_used_in_symbol(s, used);
}
}
Symbol::Epsilon => {}
}
}
fn collect_used_symbols(&self, grammar: &Grammar) -> HashSet<SymbolId> {
let mut used = HashSet::new();
if let Some(start_symbol) = grammar.start_symbol() {
used.insert(start_symbol);
}
for rule in grammar.all_rules() {
for symbol in &rule.rhs {
Self::collect_used_in_symbol(symbol, &mut used);
}
}
used
}
fn check_undefined_symbols(
&mut self,
defined: &HashSet<SymbolId>,
used: &HashSet<SymbolId>,
grammar: &Grammar,
) {
for symbol in used {
if !defined.contains(symbol) {
let mut location = String::from("unknown");
for (rule_sym, rules) in &grammar.rules {
for rule in rules {
for rhs_sym in &rule.rhs {
match rhs_sym {
Symbol::Terminal(id) | Symbol::NonTerminal(id) if id == symbol => {
location = format!("rule for {:?}", rule_sym);
break;
}
_ => {}
}
}
}
}
self.errors.push(ValidationError::UndefinedSymbol {
symbol: *symbol,
location,
});
}
}
}
fn add_reachable_from_symbol(
symbol: &Symbol,
reachable: &mut HashSet<SymbolId>,
queue: &mut VecDeque<SymbolId>,
) {
match symbol {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => {
if reachable.insert(*id) {
queue.push_back(*id);
}
}
Symbol::External(ext_id) => {
let id = SymbolId(ext_id.0);
if reachable.insert(id) {
queue.push_back(id);
}
}
Symbol::Optional(inner) | Symbol::Repeat(inner) | Symbol::RepeatOne(inner) => {
Self::add_reachable_from_symbol(inner, reachable, queue);
}
Symbol::Choice(choices) => {
for s in choices {
Self::add_reachable_from_symbol(s, reachable, queue);
}
}
Symbol::Sequence(seq) => {
for s in seq {
Self::add_reachable_from_symbol(s, reachable, queue);
}
}
Symbol::Epsilon => {}
}
}
fn find_reachable_symbols(&self, grammar: &Grammar) -> HashSet<SymbolId> {
let mut reachable = HashSet::new();
let mut queue = VecDeque::new();
if let Some(start) = grammar.start_symbol() {
queue.push_back(start);
reachable.insert(start);
}
while let Some(symbol) = queue.pop_front() {
if let Some(rules) = grammar.rules.get(&symbol) {
for rule in rules {
for rhs_symbol in &rule.rhs {
Self::add_reachable_from_symbol(rhs_symbol, &mut reachable, &mut queue);
}
}
}
}
reachable
}
fn check_unreachable_symbols(
&mut self,
reachable: &HashSet<SymbolId>,
defined: &HashSet<SymbolId>,
grammar: &Grammar,
) {
let start_symbol = grammar.start_symbol();
for symbol in defined {
if !reachable.contains(symbol) && Some(*symbol) != start_symbol {
let name = self.get_symbol_name(*symbol, grammar);
self.warnings.push(ValidationWarning::UnusedToken {
token: *symbol,
name,
});
}
}
}
fn is_symbol_productive(symbol: &Symbol, productive: &HashSet<SymbolId>) -> bool {
match symbol {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => productive.contains(id),
Symbol::External(ext_id) => productive.contains(&SymbolId(ext_id.0)),
Symbol::Epsilon => true, Symbol::Optional(_) => true, Symbol::Repeat(_) => true, Symbol::RepeatOne(inner) => Self::is_symbol_productive(inner, productive),
Symbol::Choice(choices) => choices
.iter()
.any(|s| Self::is_symbol_productive(s, productive)),
Symbol::Sequence(seq) => seq
.iter()
.all(|s| Self::is_symbol_productive(s, productive)),
}
}
fn find_productive_symbols(&self, grammar: &Grammar) -> HashSet<SymbolId> {
let mut productive = HashSet::new();
let mut changed = true;
productive.extend(grammar.tokens.keys());
for external in &grammar.externals {
productive.insert(external.symbol_id);
}
while changed {
changed = false;
for (symbol, rules) in &grammar.rules {
if !productive.contains(symbol) {
let any_productive = rules.iter().any(|rule| {
rule.rhs
.iter()
.all(|rhs_sym| Self::is_symbol_productive(rhs_sym, &productive))
});
if any_productive {
productive.insert(*symbol);
changed = true;
}
}
}
}
productive
}
fn check_non_productive_symbols(
&mut self,
productive: &HashSet<SymbolId>,
defined: &HashSet<SymbolId>,
grammar: &Grammar,
) {
for symbol in defined {
if !productive.contains(symbol) {
let name = self.get_symbol_name(*symbol, grammar);
self.errors.push(ValidationError::NonProductiveSymbol {
symbol: *symbol,
name,
});
}
}
}
fn validate_tokens(&mut self, grammar: &Grammar) {
let mut pattern_map: HashMap<String, Vec<SymbolId>> = HashMap::new();
for (symbol, token) in &grammar.tokens {
if let crate::TokenPattern::Regex(pattern) = &token.pattern {
if pattern.is_empty() {
self.errors.push(ValidationError::InvalidRegex {
token: *symbol,
pattern: pattern.clone(),
error: "Empty regex pattern".to_string(),
});
}
}
let pattern_str = match &token.pattern {
crate::TokenPattern::String(s) => s.clone(),
crate::TokenPattern::Regex(r) => r.clone(),
};
pattern_map
.entry(pattern_str.clone())
.or_default()
.push(*symbol);
}
for (pattern, symbols) in pattern_map {
if symbols.len() > 1 {
self.warnings
.push(ValidationWarning::DuplicateTokenPattern {
tokens: symbols,
pattern,
});
}
}
}
fn validate_fields(&mut self, grammar: &Grammar) {
for (symbol, rules) in &grammar.rules {
for rule in rules {
for (field_id, index) in &rule.fields {
if *index >= rule.rhs.len() {
self.errors.push(ValidationError::InvalidField {
field_id: *field_id,
rule_symbol: *symbol,
});
}
}
if rule.fields.is_empty() && rule.rhs.len() > 1 {
self.warnings.push(ValidationWarning::MissingFieldNames {
rule_symbol: *symbol,
});
}
}
}
}
fn validate_precedences(&mut self, grammar: &Grammar) {
let mut symbol_precedences: HashMap<SymbolId, Vec<i16>> = HashMap::new();
for prec in &grammar.precedences {
for symbol in &prec.symbols {
symbol_precedences
.entry(*symbol)
.or_default()
.push(prec.level);
}
}
for (symbol, precedences) in symbol_precedences {
let unique_precs: HashSet<_> = precedences.iter().cloned().collect();
if unique_precs.len() > 1 {
self.errors.push(ValidationError::ConflictingPrecedence {
symbol,
precedences: unique_precs.into_iter().collect(),
});
}
}
}
fn validate_external_tokens(&mut self, grammar: &Grammar) {
let mut names = HashSet::new();
for external in &grammar.externals {
if !names.insert(&external.name) {
self.errors.push(ValidationError::ExternalTokenConflict {
token1: external.name.clone(),
token2: external.name.clone(),
});
}
}
}
fn check_cycles(&mut self, grammar: &Grammar) {
let mut visited = HashSet::new();
let mut rec_stack = HashSet::new();
let mut path = Vec::new();
for symbol in grammar.rules.keys() {
if !visited.contains(symbol)
&& self.has_cycle(*symbol, grammar, &mut visited, &mut rec_stack, &mut path)
{
self.errors.push(ValidationError::CyclicRule {
symbols: path.clone(),
});
}
}
}
#[allow(clippy::only_used_in_recursion)]
fn has_cycle(
&self,
symbol: SymbolId,
grammar: &Grammar,
visited: &mut HashSet<SymbolId>,
rec_stack: &mut HashSet<SymbolId>,
path: &mut Vec<SymbolId>,
) -> bool {
visited.insert(symbol);
rec_stack.insert(symbol);
path.push(symbol);
if let Some(rules) = grammar.rules.get(&symbol) {
for rule in rules {
for rhs_symbol in &rule.rhs {
if let Symbol::NonTerminal(id) = rhs_symbol {
if !visited.contains(id) {
if self.has_cycle(*id, grammar, visited, rec_stack, path) {
return true;
}
} else if rec_stack.contains(id) {
return true;
}
}
}
}
}
path.pop();
rec_stack.remove(&symbol);
false
}
fn check_inefficiencies(&mut self, grammar: &Grammar) {
for (symbol, rules) in &grammar.rules {
for rule in rules {
if rule.rhs.len() == 1
&& let Symbol::NonTerminal(_) = &rule.rhs[0]
{
self.warnings.push(ValidationWarning::InefficientRule {
symbol: *symbol,
suggestion: "Consider inlining trivial rules".to_string(),
});
}
if rule.rhs.len() > 10 {
self.warnings.push(ValidationWarning::InefficientRule {
symbol: *symbol,
suggestion: format!(
"Rule has {} symbols, consider breaking it down",
rule.rhs.len()
),
});
}
}
}
}
fn get_symbol_name(&self, symbol: SymbolId, grammar: &Grammar) -> String {
if let Some(token) = grammar.tokens.get(&symbol) {
return token.name.clone();
}
if let Some(_rule) = grammar.rules.get(&symbol) {
return format!("rule_{}", symbol.0);
}
for external in &grammar.externals {
if external.symbol_id == symbol {
return external.name.clone();
}
}
format!("symbol_{}", symbol.0)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{
FieldId, Grammar, Precedence, ProductionId, Rule, Symbol, SymbolId, Token, TokenPattern,
};
#[test]
fn test_empty_grammar_validation() {
let grammar = Grammar::new("test".to_string());
let mut validator = GrammarValidator::new();
let result = validator.validate(&grammar);
assert!(
result
.errors
.iter()
.any(|e| matches!(e, ValidationError::EmptyGrammar))
);
}
#[test]
fn test_undefined_symbol() {
let mut grammar = Grammar::new("test".to_string());
let expr = SymbolId(1);
let undefined = SymbolId(99);
grammar.add_rule(Rule {
lhs: expr,
rhs: vec![Symbol::NonTerminal(undefined)],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
let mut validator = GrammarValidator::new();
let result = validator.validate(&grammar);
assert!(result.errors.iter().any(|e| {
matches!(e, ValidationError::UndefinedSymbol { symbol, .. } if *symbol == undefined)
}));
}
#[test]
fn test_non_productive_symbol() {
let mut grammar = Grammar::new("test".to_string());
let a = SymbolId(1);
let b = SymbolId(2);
grammar.add_rule(Rule {
lhs: a,
rhs: vec![Symbol::NonTerminal(b)],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
grammar.add_rule(Rule {
lhs: b,
rhs: vec![Symbol::NonTerminal(a)],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(1),
});
let mut validator = GrammarValidator::new();
let result = validator.validate(&grammar);
assert!(
result
.errors
.iter()
.any(|e| { matches!(e, ValidationError::NonProductiveSymbol { .. }) })
);
}
#[test]
fn test_valid_grammar() {
let mut grammar = Grammar::new("test".to_string());
let expr = SymbolId(1);
let num = SymbolId(2);
grammar.tokens.insert(
num,
Token {
name: "number".to_string(),
pattern: TokenPattern::Regex(r"\d+".to_string()),
fragile: false,
},
);
grammar.add_rule(Rule {
lhs: expr,
rhs: vec![Symbol::Terminal(num)],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
let mut validator = GrammarValidator::new();
let result = validator.validate(&grammar);
assert!(result.errors.is_empty());
assert_eq!(result.stats.total_symbols, 2);
assert_eq!(result.stats.productive_symbols, 2);
}
#[test]
fn test_duplicate_token_patterns() {
let mut grammar = Grammar::new("test".to_string());
grammar.tokens.insert(
SymbolId(1),
Token {
name: "plus1".to_string(),
pattern: TokenPattern::String("+".to_string()),
fragile: false,
},
);
grammar.tokens.insert(
SymbolId(2),
Token {
name: "plus2".to_string(),
pattern: TokenPattern::String("+".to_string()),
fragile: false,
},
);
let mut validator = GrammarValidator::new();
let result = validator.validate(&grammar);
assert!(result.warnings.iter().any(|w| {
matches!(w, ValidationWarning::DuplicateTokenPattern { pattern, .. } if pattern == "+")
}));
}
#[test]
fn test_invalid_field_index() {
let mut grammar = Grammar::new("test".to_string());
let expr = SymbolId(1);
let num = SymbolId(2);
grammar.tokens.insert(
num,
Token {
name: "number".to_string(),
pattern: TokenPattern::Regex(r"\d+".to_string()),
fragile: false,
},
);
grammar.add_rule(Rule {
lhs: expr,
rhs: vec![Symbol::Terminal(num)],
precedence: None,
associativity: None,
fields: vec![(FieldId(0), 5)], production_id: ProductionId(0),
});
let mut validator = GrammarValidator::new();
let result = validator.validate(&grammar);
assert!(
result
.errors
.iter()
.any(|e| { matches!(e, ValidationError::InvalidField { .. }) })
);
}
#[test]
fn test_cyclic_rules() {
let mut grammar = Grammar::new("test".to_string());
let a = SymbolId(1);
let b = SymbolId(2);
let c = SymbolId(3);
grammar.add_rule(Rule {
lhs: a,
rhs: vec![Symbol::NonTerminal(b)],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
});
grammar.add_rule(Rule {
lhs: b,
rhs: vec![Symbol::NonTerminal(c)],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(1),
});
grammar.add_rule(Rule {
lhs: c,
rhs: vec![Symbol::NonTerminal(a)],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(2),
});
let mut validator = GrammarValidator::new();
let result = validator.validate(&grammar);
assert!(
result
.errors
.iter()
.any(|e| { matches!(e, ValidationError::CyclicRule { .. }) })
);
}
#[test]
fn test_conflicting_precedence() {
let mut grammar = Grammar::new("test".to_string());
let plus = SymbolId(1);
grammar.precedences.push(Precedence {
level: 1,
associativity: crate::Associativity::Left,
symbols: vec![plus],
});
grammar.precedences.push(Precedence {
level: 2,
associativity: crate::Associativity::Right,
symbols: vec![plus],
});
let mut validator = GrammarValidator::new();
let result = validator.validate(&grammar);
assert!(result.errors.iter().any(|e| {
matches!(e, ValidationError::ConflictingPrecedence { symbol, .. } if *symbol == plus)
}));
}
}