#![cfg_attr(feature = "strict_docs", allow(missing_docs))]
#![allow(dead_code, clippy::needless_range_loop, clippy::only_used_in_recursion)]
use adze_ir::{Grammar, Rule, Symbol, SymbolId, TokenPattern};
use std::collections::{HashMap, HashSet, VecDeque};
use std::fmt;
#[derive(Debug, Clone)]
pub struct ValidationError {
pub kind: ErrorKind,
pub message: String,
pub location: ErrorLocation,
pub suggestion: Option<String>,
pub related: Vec<RelatedInfo>,
}
#[derive(Debug, Clone)]
pub struct ErrorLocation {
pub symbol: Option<SymbolId>,
pub rule_index: Option<usize>,
pub position: Option<usize>,
pub description: String,
}
#[derive(Debug, Clone)]
pub struct RelatedInfo {
pub location: String,
pub message: String,
}
#[derive(Debug, Clone, PartialEq)]
pub enum ErrorKind {
EmptyGrammar,
NoStartSymbol,
UndefinedSymbol,
UnreachableSymbol,
NonProductiveSymbol,
LeftRecursion,
AmbiguousGrammar,
InvalidToken,
DuplicateRule,
InvalidField,
ConflictingPrecedence,
MissingRequiredSymbol,
CyclicDependency,
}
impl fmt::Display for ValidationError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "Error: {}", self.message)?;
writeln!(f, "Location: {}", self.location.description)?;
if let Some(suggestion) = &self.suggestion {
writeln!(f, "Suggestion: {}", suggestion)?;
}
if !self.related.is_empty() {
writeln!(f, "Related information:")?;
for info in &self.related {
writeln!(f, " - {} at {}", info.message, info.location)?;
}
}
Ok(())
}
}
pub struct GLRGrammarValidator {
errors: Vec<ValidationError>,
warnings: Vec<ValidationWarning>,
symbol_names: HashMap<SymbolId, String>,
}
#[derive(Debug, Clone)]
pub struct ValidationWarning {
pub message: String,
pub location: String,
pub suggestion: Option<String>,
}
pub struct ValidationResult {
pub is_valid: bool,
pub errors: Vec<ValidationError>,
pub warnings: Vec<ValidationWarning>,
pub stats: GrammarStats,
pub suggestions: Vec<String>,
}
#[derive(Debug, Default)]
pub struct GrammarStats {
pub total_symbols: usize,
pub terminal_count: usize,
pub nonterminal_count: usize,
pub rule_count: usize,
pub max_rule_length: usize,
pub has_left_recursion: bool,
pub is_ll1: bool,
pub is_lr1: bool,
pub requires_glr: bool,
}
impl Default for GLRGrammarValidator {
fn default() -> Self {
Self::new()
}
}
impl GLRGrammarValidator {
pub fn new() -> Self {
Self {
errors: Vec::new(),
warnings: Vec::new(),
symbol_names: HashMap::new(),
}
}
pub fn validate(&mut self, grammar: &Grammar) -> ValidationResult {
self.errors.clear();
self.warnings.clear();
self.build_symbol_names(grammar);
let mut stats = GrammarStats::default();
self.validate_basic_structure(grammar);
self.validate_symbols(grammar);
let reachable = self.find_reachable_symbols(grammar);
let productive = self.find_productive_symbols(grammar);
self.validate_reachability(&reachable, grammar);
self.validate_productivity(&productive, grammar);
self.check_left_recursion(grammar, &mut stats);
self.check_ambiguity(grammar, &mut stats);
self.validate_tokens(grammar);
self.validate_precedence(grammar);
self.collect_statistics(grammar, &mut stats);
let suggestions = self.generate_suggestions(grammar, &stats);
ValidationResult {
is_valid: self.errors.is_empty(),
errors: self.errors.clone(),
warnings: self.warnings.clone(),
stats,
suggestions,
}
}
fn build_symbol_names(&mut self, grammar: &Grammar) {
for (id, token) in &grammar.tokens {
self.symbol_names.insert(*id, token.name.clone());
}
for (id, name) in &grammar.rule_names {
self.symbol_names.insert(*id, name.clone());
}
for external in &grammar.externals {
self.symbol_names
.insert(external.symbol_id, external.name.clone());
}
}
fn get_symbol_name(&self, id: SymbolId) -> String {
self.symbol_names
.get(&id)
.cloned()
.unwrap_or_else(|| format!("symbol_{}", id.0))
}
fn validate_basic_structure(&mut self, grammar: &Grammar) {
if grammar.rules.is_empty() {
self.errors.push(ValidationError {
kind: ErrorKind::EmptyGrammar,
message: "Grammar has no rules defined".to_string(),
location: ErrorLocation {
symbol: None,
rule_index: None,
position: None,
description: "Grammar definition".to_string(),
},
suggestion: Some(
"Add at least one rule to define the grammar structure".to_string(),
),
related: vec![],
});
}
let has_explicit_start = grammar.rules.keys().any(|id| {
self.get_symbol_name(*id).starts_with("_start") || self.get_symbol_name(*id) == "start"
});
if !has_explicit_start && !grammar.rules.is_empty() {
let Some(first_rule) = grammar.rules.keys().next() else {
return;
};
self.warnings.push(ValidationWarning {
message: format!(
"No explicit start rule found, using '{}' as start symbol",
self.get_symbol_name(*first_rule)
),
location: "Grammar root".to_string(),
suggestion: Some(
"Consider adding an explicit '_start' rule for clarity".to_string(),
),
});
}
}
fn validate_symbols(&mut self, grammar: &Grammar) {
let mut defined_symbols = HashSet::new();
let mut used_symbols = HashSet::new();
defined_symbols.extend(grammar.tokens.keys());
for rules in grammar.rules.values() {
for rule in rules {
defined_symbols.insert(rule.lhs);
}
}
for external in &grammar.externals {
defined_symbols.insert(external.symbol_id);
}
for (_symbol_id, rules) in &grammar.rules {
for rule in rules {
for (pos, symbol) in rule.rhs.iter().enumerate() {
let symbol_id = match symbol {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => *id,
Symbol::External(ext) => SymbolId(ext.0),
Symbol::Optional(_)
| Symbol::Repeat(_)
| Symbol::RepeatOne(_)
| Symbol::Choice(_)
| Symbol::Sequence(_)
| Symbol::Epsilon => {
continue; }
};
used_symbols.insert(symbol_id);
if !defined_symbols.contains(&symbol_id) {
let mut related = vec![];
let similar =
self.find_similar_symbols(&symbol_id, &defined_symbols, grammar);
for sim in similar {
related.push(RelatedInfo {
location: "Symbol definition".to_string(),
message: format!("Did you mean '{}'?", self.get_symbol_name(sim)),
});
}
self.errors.push(ValidationError {
kind: ErrorKind::UndefinedSymbol,
message: format!(
"Symbol '{}' is not defined",
self.get_symbol_name(symbol_id)
),
location: ErrorLocation {
symbol: Some(rule.lhs),
rule_index: None,
position: Some(pos),
description: format!(
"In rule '{}' at position {}",
self.get_symbol_name(rule.lhs),
pos
),
},
suggestion: Some(
"Define the symbol as a token or rule before using it".to_string(),
),
related,
});
}
}
}
}
let mut rule_counts: HashMap<SymbolId, usize> = HashMap::new();
for rule_id in grammar.rules.keys() {
*rule_counts.entry(*rule_id).or_insert(0) += 1;
}
for (symbol, count) in rule_counts {
if count > 1 {
self.errors.push(ValidationError {
kind: ErrorKind::DuplicateRule,
message: format!(
"Symbol '{}' has {} rule definitions",
self.get_symbol_name(symbol),
count
),
location: ErrorLocation {
symbol: Some(symbol),
rule_index: None,
position: None,
description: format!("Symbol '{}'", self.get_symbol_name(symbol)),
},
suggestion: Some(
"Each symbol should have exactly one rule definition".to_string(),
),
related: vec![],
});
}
}
}
fn find_similar_symbols(
&self,
target: &SymbolId,
defined: &HashSet<SymbolId>,
_grammar: &Grammar,
) -> Vec<SymbolId> {
let target_name = self.get_symbol_name(*target);
let mut similar = vec![];
for symbol in defined {
let name = self.get_symbol_name(*symbol);
if self.is_similar(&target_name, &name) {
similar.push(*symbol);
}
}
similar.sort_by_key(|s| {
let name = self.get_symbol_name(*s);
self.edit_distance(&target_name, &name)
});
similar.truncate(3);
similar
}
fn is_similar(&self, s1: &str, s2: &str) -> bool {
if s1 == s2 {
return false;
}
let dist = self.edit_distance(s1, s2);
let max_len = s1.len().max(s2.len());
(dist as f64) / (max_len as f64) < 0.3
}
fn edit_distance(&self, s1: &str, s2: &str) -> usize {
let len1 = s1.len();
let len2 = s2.len();
let mut dp = vec![vec![0; len2 + 1]; len1 + 1];
for i in 0..=len1 {
dp[i][0] = i;
}
for j in 0..=len2 {
dp[0][j] = j;
}
for i in 1..=len1 {
for j in 1..=len2 {
if s1.chars().nth(i - 1) == s2.chars().nth(j - 1) {
dp[i][j] = dp[i - 1][j - 1];
} else {
dp[i][j] = 1 + dp[i - 1][j].min(dp[i][j - 1]).min(dp[i - 1][j - 1]);
}
}
}
dp[len1][len2]
}
fn find_reachable_symbols(&self, grammar: &Grammar) -> HashSet<SymbolId> {
let mut reachable = HashSet::new();
let mut queue = VecDeque::new();
if let Some(start_rules) = grammar.rules.values().next()
&& let Some(start_rule) = start_rules.first()
{
let start_symbol = start_rule.lhs;
queue.push_back(start_symbol);
reachable.insert(start_symbol);
}
while let Some(symbol) = queue.pop_front() {
for rules in grammar.rules.values() {
for rule in rules {
if rule.lhs == symbol {
for rhs_symbol in &rule.rhs {
let id = match rhs_symbol {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => *id,
Symbol::External(ext) => SymbolId(ext.0),
Symbol::Optional(_)
| Symbol::Repeat(_)
| Symbol::RepeatOne(_)
| Symbol::Choice(_)
| Symbol::Sequence(_)
| Symbol::Epsilon => {
continue; }
};
if reachable.insert(id) {
queue.push_back(id);
}
}
}
}
}
}
reachable
}
fn is_symbol_productive(symbol: &Symbol, productive: &HashSet<SymbolId>) -> bool {
match symbol {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => productive.contains(id),
Symbol::External(ext) => productive.contains(&SymbolId(ext.0)),
Symbol::Optional(_) | Symbol::Repeat(_) => true, Symbol::RepeatOne(inner) => Self::is_symbol_productive(inner, productive),
Symbol::Choice(choices) => choices
.iter()
.any(|s| Self::is_symbol_productive(s, productive)),
Symbol::Sequence(seq) => seq
.iter()
.all(|s| Self::is_symbol_productive(s, productive)),
Symbol::Epsilon => true,
}
}
fn find_productive_symbols(&self, grammar: &Grammar) -> HashSet<SymbolId> {
let mut productive = HashSet::new();
let mut changed = true;
productive.extend(grammar.tokens.keys());
for external in &grammar.externals {
productive.insert(external.symbol_id);
}
while changed {
changed = false;
for rules in grammar.rules.values() {
for rule in rules {
let symbol = rule.lhs;
if !productive.contains(&symbol) {
let all_productive = rule.rhs.iter().all(|sym| {
match sym {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => {
productive.contains(id)
}
Symbol::External(ext) => productive.contains(&SymbolId(ext.0)),
Symbol::Optional(_) | Symbol::Repeat(_) => true, Symbol::RepeatOne(inner) => {
Self::is_symbol_productive(inner, &productive)
}
Symbol::Choice(choices) => choices
.iter()
.any(|s| Self::is_symbol_productive(s, &productive)),
Symbol::Sequence(seq) => seq
.iter()
.all(|s| Self::is_symbol_productive(s, &productive)),
Symbol::Epsilon => true,
}
});
if all_productive {
productive.insert(symbol);
changed = true;
}
}
}
}
}
productive
}
fn validate_reachability(&mut self, reachable: &HashSet<SymbolId>, grammar: &Grammar) {
let mut non_terminals = HashSet::new();
for rules in grammar.rules.values() {
for rule in rules {
non_terminals.insert(rule.lhs);
}
}
for symbol in non_terminals {
if !reachable.contains(&symbol) {
self.warnings.push(ValidationWarning {
message: format!(
"Symbol '{}' is defined but not reachable from start symbol",
self.get_symbol_name(symbol)
),
location: format!("Rule definition for '{}'", self.get_symbol_name(symbol)),
suggestion: Some(
"Remove unused rules or ensure they are referenced".to_string(),
),
});
}
}
for (symbol, _token) in &grammar.tokens {
if !reachable.contains(symbol) {
self.warnings.push(ValidationWarning {
message: format!(
"Token '{}' is defined but never used",
self.get_symbol_name(*symbol)
),
location: format!("Token definition for '{}'", self.get_symbol_name(*symbol)),
suggestion: Some("Remove unused tokens or use them in rules".to_string()),
});
}
}
}
fn validate_productivity(&mut self, productive: &HashSet<SymbolId>, grammar: &Grammar) {
let mut non_terminals = HashSet::new();
for rules in grammar.rules.values() {
for rule in rules {
non_terminals.insert(rule.lhs);
}
}
for symbol in non_terminals {
if !productive.contains(&symbol) {
let mut cycle_symbols = vec![];
self.find_non_productive_cycle(
symbol,
grammar,
&mut cycle_symbols,
&mut HashSet::new(),
);
let mut related = vec![];
for sym in &cycle_symbols {
related.push(RelatedInfo {
location: format!("Symbol '{}'", self.get_symbol_name(*sym)),
message: "Part of non-productive cycle".to_string(),
});
}
self.errors.push(ValidationError {
kind: ErrorKind::NonProductiveSymbol,
message: format!(
"Symbol '{}' cannot derive any terminal strings",
self.get_symbol_name(symbol)
),
location: ErrorLocation {
symbol: Some(symbol),
rule_index: None,
position: None,
description: format!("Rule for '{}'", self.get_symbol_name(symbol)),
},
suggestion: Some(
"Add a rule that derives terminals or break the cycle".to_string(),
),
related,
});
}
}
}
fn find_non_productive_cycle(
&self,
start: SymbolId,
grammar: &Grammar,
path: &mut Vec<SymbolId>,
visited: &mut HashSet<SymbolId>,
) -> bool {
if visited.contains(&start) {
return path.contains(&start);
}
visited.insert(start);
path.push(start);
if let Some(rules) = grammar.rules.get(&start) {
for rule in rules {
for symbol in &rule.rhs {
if let Symbol::NonTerminal(id) = symbol
&& self.find_non_productive_cycle(*id, grammar, path, visited)
{
return true;
}
}
}
}
path.pop();
false
}
fn check_left_recursion(&mut self, grammar: &Grammar, stats: &mut GrammarStats) {
for (symbol, rules) in &grammar.rules {
for rule in rules {
if let Some(Symbol::NonTerminal(first)) = rule.rhs.first()
&& first == symbol
{
stats.has_left_recursion = true;
self.warnings.push(ValidationWarning {
message: format!(
"Direct left recursion in rule for '{}'",
self.get_symbol_name(*symbol)
),
location: format!(
"Rule: {} → {} ...",
self.get_symbol_name(*symbol),
self.get_symbol_name(*first)
),
suggestion: Some(
"Consider rewriting to eliminate left recursion or use GLR parsing"
.to_string(),
),
});
}
}
if self.has_indirect_left_recursion(*symbol, grammar) {
stats.has_left_recursion = true;
self.warnings.push(ValidationWarning {
message: format!(
"Indirect left recursion detected for '{}'",
self.get_symbol_name(*symbol)
),
location: format!("Starting from rule for '{}'", self.get_symbol_name(*symbol)),
suggestion: Some(
"GLR parsing can handle left recursion efficiently".to_string(),
),
});
}
}
}
fn has_indirect_left_recursion(&self, symbol: SymbolId, grammar: &Grammar) -> bool {
let mut visited = HashSet::new();
self.can_derive_left(symbol, symbol, grammar, &mut visited)
}
fn can_derive_left(
&self,
from: SymbolId,
target: SymbolId,
grammar: &Grammar,
visited: &mut HashSet<SymbolId>,
) -> bool {
if visited.contains(&from) {
return false;
}
visited.insert(from);
if let Some(rules) = grammar.rules.get(&from) {
for rule in rules {
if let Some(Symbol::NonTerminal(first)) = rule.rhs.first() {
if *first == target {
return true;
}
if self.can_derive_left(*first, target, grammar, visited) {
return true;
}
}
}
}
false
}
fn check_ambiguity(&mut self, grammar: &Grammar, stats: &mut GrammarStats) {
self.check_direct_ambiguity(grammar, stats);
self.check_lr_conflicts(grammar, stats);
self.check_ambiguous_patterns(grammar, stats);
}
fn check_direct_ambiguity(&mut self, grammar: &Grammar, stats: &mut GrammarStats) {
let mut rhs_map: HashMap<Vec<Symbol>, Vec<SymbolId>> = HashMap::new();
for (symbol, rules) in &grammar.rules {
for rule in rules {
let rhs = rule.rhs.clone();
rhs_map.entry(rhs).or_default().push(*symbol);
}
}
for (rhs, symbols) in rhs_map {
if symbols.len() > 1 {
let rhs_str = rhs
.iter()
.map(|s| match s {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => self.get_symbol_name(*id),
Symbol::External(ext) => format!("external_{}", ext.0),
Symbol::Optional(_)
| Symbol::Repeat(_)
| Symbol::RepeatOne(_)
| Symbol::Choice(_)
| Symbol::Sequence(_)
| Symbol::Epsilon => "<complex>".to_string(),
})
.collect::<Vec<_>>()
.join(" ");
self.warnings.push(ValidationWarning {
message: format!(
"Direct ambiguity: multiple non-terminals produce '{}'",
rhs_str
),
location: format!(
"Non-terminals: {}",
symbols
.iter()
.map(|s| self.get_symbol_name(*s))
.collect::<Vec<_>>()
.join(", ")
),
suggestion: Some(
"Consider merging these rules or adding precedence".to_string(),
),
});
stats.requires_glr = true;
}
}
}
fn check_lr_conflicts(&mut self, grammar: &Grammar, stats: &mut GrammarStats) {
for (symbol, rules) in &grammar.rules {
for rule in rules {
let has_left_rec = !rule.rhs.is_empty()
&& match &rule.rhs[0] {
Symbol::NonTerminal(id) => id == symbol,
_ => false,
};
let has_right_rec = rule.rhs.len() > 1
&& match rule.rhs.last() {
Some(Symbol::NonTerminal(id)) => id == symbol,
_ => false,
};
if has_left_rec || has_right_rec {
let other_rules: Vec<_> = grammar
.rules
.iter()
.filter(|(s, _)| *s == symbol)
.flat_map(|(_, rules)| rules.iter())
.filter(|r| {
let other_left_rec = !r.rhs.is_empty()
&& match &r.rhs[0] {
Symbol::NonTerminal(id) => id == symbol,
_ => false,
};
let other_right_rec = r.rhs.len() > 1
&& match r.rhs.last() {
Some(Symbol::NonTerminal(id)) => id == symbol,
_ => false,
};
(has_left_rec && other_right_rec) || (has_right_rec && other_left_rec)
})
.collect();
if !other_rules.is_empty() {
self.warnings.push(ValidationWarning {
message: format!(
"Mixed left/right recursion for '{}'",
self.get_symbol_name(*symbol)
),
location: format!("Non-terminal '{}'", self.get_symbol_name(*symbol)),
suggestion: Some(
"This creates shift/reduce conflicts - GLR will handle it"
.to_string(),
),
});
stats.requires_glr = true;
}
}
}
}
let mut prefix_map: HashMap<Vec<SymbolId>, Vec<(SymbolId, usize)>> = HashMap::new();
for (symbol, rules) in &grammar.rules {
for rule in rules {
for prefix_len in 1..=rule.rhs.len().min(3) {
let mut prefix = Vec::new();
let mut unsupported_prefix = false;
for sym in rule.rhs.iter().take(prefix_len) {
match sym {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => prefix.push(*id),
Symbol::External(ext) => prefix.push(SymbolId(ext.0)),
Symbol::Optional(_)
| Symbol::Repeat(_)
| Symbol::RepeatOne(_)
| Symbol::Choice(_)
| Symbol::Sequence(_)
| Symbol::Epsilon => {
unsupported_prefix = true;
break;
}
}
}
if unsupported_prefix {
continue;
}
prefix_map
.entry(prefix)
.or_default()
.push((*symbol, rule.rhs.len()));
}
}
}
for (prefix, occurrences) in prefix_map {
if occurrences.len() > 1 {
let lengths: HashSet<_> = occurrences.iter().map(|(_, len)| *len).collect();
if lengths.len() > 1 && lengths.iter().min().is_some_and(|min| prefix.len() < *min)
{
let prefix_str = prefix
.iter()
.map(|s| self.get_symbol_name(*s))
.collect::<Vec<_>>()
.join(" ");
self.warnings.push(ValidationWarning {
message: format!(
"Shift/reduce conflict: rules with common prefix '{}'",
prefix_str
),
location: format!(
"Rules: {}",
occurrences
.iter()
.map(|(s, _)| self.get_symbol_name(*s))
.collect::<Vec<_>>()
.join(", ")
),
suggestion: Some("GLR parsing will explore both possibilities".to_string()),
});
stats.requires_glr = true;
}
}
}
}
fn check_ambiguous_patterns(&mut self, grammar: &Grammar, stats: &mut GrammarStats) {
for (_key, rules) in &grammar.rules {
for rule in rules {
if rule.rhs.len() == 2
&& let (Symbol::NonTerminal(id1), Symbol::NonTerminal(id2)) =
(&rule.rhs[0], &rule.rhs[1])
&& *id1 == rule.lhs
&& *id2 == rule.lhs
{
self.warnings.push(ValidationWarning {
message: format!(
"Highly ambiguous pattern: '{}' → '{}' '{}'",
self.get_symbol_name(rule.lhs),
self.get_symbol_name(rule.lhs),
self.get_symbol_name(rule.lhs)
),
location: format!("Non-terminal '{}'", self.get_symbol_name(rule.lhs)),
suggestion: Some(
"This creates extreme ambiguity - GLR required".to_string(),
),
});
stats.requires_glr = true;
}
}
}
for (_symbol, rules) in &grammar.rules {
for rule in rules {
let rule_str = rule
.rhs
.iter()
.map(|s| match s {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => self.get_symbol_name(*id),
Symbol::External(ext) => format!("external_{}", ext.0),
Symbol::Optional(_)
| Symbol::Repeat(_)
| Symbol::RepeatOne(_)
| Symbol::Choice(_)
| Symbol::Sequence(_)
| Symbol::Epsilon => "<complex>".to_string(),
})
.collect::<Vec<_>>();
if rule_str.len() >= 4 {
let has_if_pattern = rule_str
.windows(4)
.any(|w| w[0].contains("if") && w[2].contains("then"));
if has_if_pattern {
let has_optional_else = grammar.rules.values().any(|rules| {
rules.iter().any(|r| {
let other_str = r
.rhs
.iter()
.map(|s| match s {
Symbol::Terminal(id) | Symbol::NonTerminal(id) => {
self.get_symbol_name(*id)
}
Symbol::External(ext) => format!("external_{}", ext.0),
Symbol::Optional(_)
| Symbol::Repeat(_)
| Symbol::RepeatOne(_)
| Symbol::Choice(_)
| Symbol::Sequence(_)
| Symbol::Epsilon => "<complex>".to_string(),
})
.collect::<Vec<_>>();
other_str.len() > rule_str.len()
&& other_str.contains(&"else".to_string())
})
});
if has_optional_else {
self.warnings.push(ValidationWarning {
message: "Potential 'dangling else' ambiguity detected".to_string(),
location: "Conditional statement rules".to_string(),
suggestion: Some(
"Use precedence or GLR parsing to resolve".to_string(),
),
});
stats.requires_glr = true;
}
}
}
}
}
let mut symbol_binary_ops: HashMap<SymbolId, Vec<&Rule>> = HashMap::new();
for (_key, rules) in &grammar.rules {
for rule in rules {
if rule.rhs.len() == 3
&& let (Symbol::NonTerminal(id1), _, Symbol::NonTerminal(id2)) =
(&rule.rhs[0], &rule.rhs[1], &rule.rhs[2])
&& *id1 == rule.lhs
&& *id2 == rule.lhs
{
symbol_binary_ops.entry(rule.lhs).or_default().push(rule);
}
}
}
for (symbol, binary_ops) in symbol_binary_ops {
if binary_ops.len() > 1 {
let symbol_name = self.get_symbol_name(symbol);
self.warnings.push(ValidationWarning {
message: format!(
"Expression ambiguity: '{}' has multiple binary operators",
symbol_name
),
location: format!("Non-terminal '{}'", symbol_name),
suggestion: Some("Define precedence levels or use GLR parsing".to_string()),
});
stats.requires_glr = true;
}
}
}
fn validate_tokens(&mut self, grammar: &Grammar) {
for (symbol, token) in &grammar.tokens {
match &token.pattern {
TokenPattern::String(s) if s.is_empty() => {
self.errors.push(ValidationError {
kind: ErrorKind::InvalidToken,
message: format!(
"Token '{}' has empty string pattern",
self.get_symbol_name(*symbol)
),
location: ErrorLocation {
symbol: Some(*symbol),
rule_index: None,
position: None,
description: format!("Token '{}'", self.get_symbol_name(*symbol)),
},
suggestion: Some("Provide a non-empty pattern for the token".to_string()),
related: vec![],
});
}
TokenPattern::Regex(r) => {
match regex::Regex::new(r) {
Err(e) => {
self.errors.push(ValidationError {
kind: ErrorKind::InvalidToken,
message: format!(
"Invalid regex pattern for token '{}'",
self.get_symbol_name(*symbol)
),
location: ErrorLocation {
symbol: Some(*symbol),
rule_index: None,
position: None,
description: format!(
"Token '{}' pattern: {}",
self.get_symbol_name(*symbol),
r
),
},
suggestion: Some(format!("Fix regex error: {}", e)),
related: vec![],
});
}
Ok(re) => {
if r.contains("*+") || r.contains("++") || r.contains("?+") {
self.warnings.push(ValidationWarning {
message: format!(
"Possessive quantifiers in token '{}'",
self.get_symbol_name(*symbol)
),
location: format!("Pattern: {}", r),
suggestion: Some(
"Consider if possessive quantifiers are intended"
.to_string(),
),
});
}
if !r.starts_with('^') && re.find_at("test", 1).is_some() {
self.warnings.push(ValidationWarning {
message: format!(
"Token '{}' pattern may match at non-boundary",
self.get_symbol_name(*symbol)
),
location: format!("Pattern: {}", r),
suggestion: Some(
"Consider anchoring with ^ for consistent tokenization"
.to_string(),
),
});
}
}
}
}
_ => {}
}
}
let mut patterns: Vec<(SymbolId, String)> = vec![];
for (symbol, token) in &grammar.tokens {
let pattern_str = match &token.pattern {
TokenPattern::String(s) => regex::escape(s),
TokenPattern::Regex(r) => r.clone(),
};
patterns.push((*symbol, pattern_str));
}
for i in 0..patterns.len() {
for j in i + 1..patterns.len() {
if self.patterns_overlap(&patterns[i].1, &patterns[j].1) {
self.warnings.push(ValidationWarning {
message: format!(
"Tokens '{}' and '{}' have overlapping patterns",
self.get_symbol_name(patterns[i].0),
self.get_symbol_name(patterns[j].0)
),
location: "Token definitions".to_string(),
suggestion: Some(
"Ensure token precedence is correctly defined".to_string(),
),
});
}
}
}
}
fn patterns_overlap(&self, p1: &str, p2: &str) -> bool {
if p1 == p2 {
return true;
}
if let (Ok(re1), Ok(re2)) = (regex::Regex::new(p1), regex::Regex::new(p2)) {
let test_strings = vec!["a", "1", " ", "test", "123", "abc123"];
for test in test_strings {
if re1.is_match(test) && re2.is_match(test) {
return true;
}
}
}
false
}
fn validate_precedence(&mut self, grammar: &Grammar) {
let mut symbol_precedences: HashMap<SymbolId, Vec<(i32, String)>> = HashMap::new();
for (i, prec) in grammar.precedences.iter().enumerate() {
for symbol in &prec.symbols {
symbol_precedences
.entry(*symbol)
.or_default()
.push((prec.level as i32, format!("precedence group {}", i)));
}
}
for (symbol, precs) in symbol_precedences {
if precs.len() > 1 {
let levels: HashSet<_> = precs.iter().map(|(l, _)| *l).collect();
if levels.len() > 1 {
self.errors.push(ValidationError {
kind: ErrorKind::ConflictingPrecedence,
message: format!(
"Symbol '{}' has conflicting precedence levels",
self.get_symbol_name(symbol)
),
location: ErrorLocation {
symbol: Some(symbol),
rule_index: None,
position: None,
description: "Precedence declarations".to_string(),
},
suggestion: Some(
"Each symbol should have at most one precedence level".to_string(),
),
related: precs
.iter()
.map(|(level, loc)| RelatedInfo {
location: loc.clone(),
message: format!("Precedence level {}", level),
})
.collect(),
});
}
}
}
}
fn collect_statistics(&self, grammar: &Grammar, stats: &mut GrammarStats) {
stats.total_symbols = grammar.tokens.len() + grammar.rules.len() + grammar.externals.len();
stats.terminal_count = grammar.tokens.len();
stats.nonterminal_count = grammar.rules.len();
stats.rule_count = grammar.rules.values().map(|rules| rules.len()).sum();
stats.max_rule_length = grammar
.rules
.values()
.flat_map(|rules| rules.iter())
.map(|r| r.rhs.len())
.max()
.unwrap_or(0);
stats.is_ll1 = !stats.has_left_recursion && !stats.requires_glr;
stats.is_lr1 = !stats.requires_glr;
}
fn generate_suggestions(&self, grammar: &Grammar, stats: &GrammarStats) -> Vec<String> {
let mut suggestions = vec![];
if stats.has_left_recursion {
suggestions
.push("Grammar contains left recursion - GLR parsing is recommended".to_string());
}
if stats.requires_glr {
suggestions.push(
"Grammar has ambiguities that require GLR parsing for correct handling".to_string(),
);
}
if stats.max_rule_length > 10 {
suggestions.push(format!("Some rules are very long (max: {} symbols). Consider breaking them down for readability",
stats.max_rule_length));
}
if grammar.precedences.is_empty() && stats.requires_glr {
suggestions
.push("Consider adding precedence declarations to resolve ambiguities".to_string());
}
if grammar.fields.is_empty() && grammar.rules.len() > 5 {
suggestions
.push("Consider adding field names to rules for better AST access".to_string());
}
suggestions
}
}
#[cfg(test)]
mod tests {
use super::*;
use adze_ir::ProductionId;
#[test]
fn test_empty_grammar_error() {
let grammar = Grammar::new("test".to_string());
let mut validator = GLRGrammarValidator::new();
let result = validator.validate(&grammar);
assert!(!result.is_valid);
assert!(
result
.errors
.iter()
.any(|e| e.kind == ErrorKind::EmptyGrammar)
);
}
#[test]
fn test_undefined_symbol_with_suggestions() {
let mut grammar = Grammar::new("test".to_string());
grammar.tokens.insert(
SymbolId(1),
adze_ir::Token {
name: "number".to_string(),
pattern: TokenPattern::Regex(r"\d+".to_string()),
fragile: false,
},
);
grammar.rule_names.insert(SymbolId(1), "number".to_string());
grammar.rules.insert(
SymbolId(2),
vec![Rule {
lhs: SymbolId(2),
rhs: vec![Symbol::Terminal(SymbolId(99))], precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
}],
);
grammar.rule_names.insert(SymbolId(2), "expr".to_string());
grammar.rule_names.insert(SymbolId(99), "numbr".to_string());
let mut validator = GLRGrammarValidator::new();
let result = validator.validate(&grammar);
assert!(!result.is_valid);
let undefined_error = result
.errors
.iter()
.find(|e| e.kind == ErrorKind::UndefinedSymbol)
.expect("Should have undefined symbol error");
assert!(!undefined_error.related.is_empty());
}
#[test]
fn test_left_recursion_detection() {
let mut grammar = Grammar::new("test".to_string());
let expr_id = SymbolId(1);
let plus_id = SymbolId(2);
let number_id = SymbolId(3);
grammar.tokens.insert(
plus_id,
adze_ir::Token {
name: "plus".to_string(),
pattern: TokenPattern::String("+".to_string()),
fragile: false,
},
);
grammar.tokens.insert(
number_id,
adze_ir::Token {
name: "number".to_string(),
pattern: TokenPattern::Regex(r"\d+".to_string()),
fragile: false,
},
);
grammar.rules.insert(
expr_id,
vec![Rule {
lhs: expr_id,
rhs: vec![
Symbol::NonTerminal(expr_id),
Symbol::Terminal(plus_id),
Symbol::Terminal(number_id),
],
precedence: None,
associativity: None,
fields: vec![],
production_id: ProductionId(0),
}],
);
grammar.rule_names.insert(expr_id, "expr".to_string());
let mut validator = GLRGrammarValidator::new();
let result = validator.validate(&grammar);
assert!(result.stats.has_left_recursion);
assert!(
result
.warnings
.iter()
.any(|w| w.message.contains("left recursion"))
);
}
}