use super::ParseError;
use perl_ast::Node;
/// Categories of parse failure recognised by the error classifier.
///
/// Each variant maps to a human-readable diagnostic message and, for most
/// variants, a fix suggestion produced by `ErrorClassifier`.
#[derive(Debug, Clone, PartialEq)]
pub enum ParseErrorKind {
    /// A token other than the expected one was encountered.
    UnexpectedToken {
        /// Description of what the parser expected to see.
        expected: String,
        /// Description of what was actually present.
        found: String,
    },
    /// A string literal is missing its closing quote.
    UnclosedString,
    /// A regular expression is missing its closing delimiter.
    UnclosedRegex,
    /// A code block is missing its closing `}`.
    UnclosedBlock,
    /// A statement is missing its terminating `;`.
    MissingSemicolon,
    /// Fallback category when no more specific kind applies.
    InvalidSyntax,
    /// A `(` has no matching `)`.
    UnclosedParenthesis,
    /// A `[` has no matching `]`.
    UnclosedBracket,
    /// A `{` has no matching `}`.
    UnclosedBrace,
    /// A heredoc has no terminator marker.
    UnterminatedHeredoc,
    /// A variable name does not follow identifier rules.
    InvalidVariableName,
    /// A subroutine name does not follow identifier rules.
    InvalidSubroutineName,
    /// Two operands appear without an operator between them.
    MissingOperator,
    /// An operator is missing the value or expression it applies to.
    MissingOperand,
    /// The input ended before the construct was complete.
    UnexpectedEof,
}
/// Stateless helper that maps parser error nodes to a [`ParseErrorKind`] and
/// produces the matching diagnostic message, suggestion and explanation text.
///
/// The type is a unit struct: it carries no configuration or state.
pub struct ErrorClassifier;
impl Default for ErrorClassifier {
fn default() -> Self {
Self::new()
}
}
impl ErrorClassifier {
pub fn new() -> Self {
ErrorClassifier
}
pub fn classify(&self, error_node: &Node, source: &str) -> ParseErrorKind {
let error_text = {
let start = error_node.location.start;
let end = (start + 10).min(source.len()); if start < source.len() && end <= source.len() && start <= end {
&source[start..end]
} else {
""
}
};
let quote_count = source.matches('"').count();
let single_quote_count = source.matches('\'').count();
if !quote_count.is_multiple_of(2) {
return ParseErrorKind::UnclosedString;
}
if !single_quote_count.is_multiple_of(2) {
return ParseErrorKind::UnclosedString;
}
if error_text.starts_with('"') && !error_text.ends_with('"') {
return ParseErrorKind::UnclosedString;
}
if error_text.starts_with('\'') && !error_text.ends_with('\'') {
return ParseErrorKind::UnclosedString;
}
if error_text.starts_with('/') && !error_text.contains("//") {
if !error_text[1..].contains('/') {
return ParseErrorKind::UnclosedRegex;
}
}
{
let pos = error_node.location.start;
let line_start = source[..pos].rfind('\n').map(|i| i + 1).unwrap_or(0);
let line_end = source[pos..].find('\n').map(|i| pos + i).unwrap_or(source.len());
let line = &source[line_start..line_end];
if !line.trim().is_empty()
&& !line.trim().ends_with(';')
&& !line.trim().ends_with('{')
&& !line.trim().ends_with('}')
{
if line.contains("my ")
|| line.contains("our ")
|| line.contains("local ")
|| line.contains("print ")
|| line.contains("say ")
|| line.contains("return ")
{
return ParseErrorKind::MissingSemicolon;
}
}
let open_parens = line.matches('(').count();
let close_parens = line.matches(')').count();
if open_parens > close_parens {
return ParseErrorKind::UnclosedParenthesis;
}
let open_brackets = line.matches('[').count();
let close_brackets = line.matches(']').count();
if open_brackets > close_brackets {
return ParseErrorKind::UnclosedBracket;
}
let open_braces = line.matches('{').count();
let close_braces = line.matches('}').count();
if open_braces > close_braces {
return ParseErrorKind::UnclosedBrace;
}
}
if error_node.location.start >= source.len() - 1 {
return ParseErrorKind::UnexpectedEof;
}
ParseErrorKind::InvalidSyntax
}
pub fn get_diagnostic_message(&self, kind: &ParseErrorKind) -> String {
match kind {
ParseErrorKind::UnexpectedToken { expected, found } => {
format!("Expected {} but found {}", expected, found)
}
ParseErrorKind::UnclosedString => "Unclosed string literal".to_string(),
ParseErrorKind::UnclosedRegex => "Unclosed regular expression".to_string(),
ParseErrorKind::UnclosedBlock => "Unclosed code block - missing '}'".to_string(),
ParseErrorKind::MissingSemicolon => "Missing semicolon at end of statement".to_string(),
ParseErrorKind::InvalidSyntax => "Invalid syntax".to_string(),
ParseErrorKind::UnclosedParenthesis => "Unclosed parenthesis - missing ')'".to_string(),
ParseErrorKind::UnclosedBracket => "Unclosed bracket - missing ']'".to_string(),
ParseErrorKind::UnclosedBrace => "Unclosed brace - missing '}'".to_string(),
ParseErrorKind::UnterminatedHeredoc => "Unterminated heredoc".to_string(),
ParseErrorKind::InvalidVariableName => "Invalid variable name".to_string(),
ParseErrorKind::InvalidSubroutineName => "Invalid subroutine name".to_string(),
ParseErrorKind::MissingOperator => "Missing operator".to_string(),
ParseErrorKind::MissingOperand => "Missing operand".to_string(),
ParseErrorKind::UnexpectedEof => "Unexpected end of file".to_string(),
}
}
pub fn get_suggestion(&self, kind: &ParseErrorKind) -> Option<String> {
match kind {
ParseErrorKind::MissingSemicolon => {
Some("Add a semicolon ';' at the end of the statement".to_string())
}
ParseErrorKind::UnclosedString => {
Some("Add a closing quote to terminate the string".to_string())
}
ParseErrorKind::UnclosedParenthesis => {
Some("Add a closing parenthesis ')' to match the opening '('".to_string())
}
ParseErrorKind::UnclosedBracket => {
Some("Add a closing bracket ']' to match the opening '['".to_string())
}
ParseErrorKind::UnclosedBrace => {
Some("Add a closing brace '}' to match the opening '{'".to_string())
}
ParseErrorKind::UnclosedBlock => {
Some("Add a closing brace '}' to complete the code block".to_string())
}
ParseErrorKind::UnclosedRegex => {
Some("Add a closing delimiter to terminate the regex pattern".to_string())
}
ParseErrorKind::UnterminatedHeredoc => {
Some("Add the heredoc terminator marker on its own line".to_string())
}
ParseErrorKind::InvalidVariableName => {
Some("Variable names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
}
ParseErrorKind::InvalidSubroutineName => {
Some("Subroutine names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
}
ParseErrorKind::MissingOperator => {
Some("Add an operator between operands (e.g., +, -, *, /, ., ==, !=)".to_string())
}
ParseErrorKind::MissingOperand => {
Some("Add a value or expression after the operator".to_string())
}
ParseErrorKind::UnexpectedEof => {
Some("The file ended unexpectedly - check for unclosed blocks, strings, or parentheses".to_string())
}
ParseErrorKind::UnexpectedToken { expected, found: _ } => {
Some(format!("Expected {} at this location", expected))
}
ParseErrorKind::InvalidSyntax => None,
}
}
pub fn get_explanation(&self, kind: &ParseErrorKind) -> Option<String> {
match kind {
ParseErrorKind::MissingSemicolon => {
Some("In Perl, most statements must end with a semicolon. The only exceptions are the last statement in a block and statements that end with a block (like if, while, sub, etc.).".to_string())
}
ParseErrorKind::UnclosedString => {
Some("String literals must be properly terminated with a matching quote. Use double quotes (\") for interpolated strings or single quotes (') for literal strings.".to_string())
}
ParseErrorKind::UnclosedRegex => {
Some("Regular expressions must be properly delimited. Common forms include /pattern/, m/pattern/, s/old/new/, and qr/pattern/.".to_string())
}
ParseErrorKind::UnterminatedHeredoc => {
Some("Heredoc blocks must have their terminator marker appear on a line by itself with no leading or trailing whitespace (unless using <<~MARKER for indented heredocs).".to_string())
}
ParseErrorKind::InvalidVariableName => {
Some("Perl variable names (after the sigil) must follow identifier rules: start with a letter (a-z, A-Z) or underscore (_), followed by any combination of letters, digits, or underscores.".to_string())
}
ParseErrorKind::UnclosedBlock => {
Some("Code blocks must have matching braces. Each opening '{' needs a corresponding closing '}'.".to_string())
}
_ => None,
}
}
}
/// Counters describing how much of a parse needed error recovery.
///
/// Values are produced by `classify_recovery_salvage`; the default value
/// (all zeros, no message) represents a clean parse.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RecoverySalvageMetrics {
    /// Number of diagnostics that are `ParseError::Recovered`.
    pub recovered_node_count: usize,
    /// Number of diagnostics that are not `ParseError::Recovered`.
    pub unrecovered_diagnostic_count: usize,
    /// Number of `Error` nodes found in the AST.
    pub error_node_count: usize,
    /// Message of the `Error` node with the smallest start offset, if any.
    pub first_unrecovered_error_node: Option<String>,
}

impl RecoverySalvageMetrics {
    /// Returns `true` when any of the three counters is non-zero.
    pub fn is_dirty(&self) -> bool {
        let counters = [
            self.error_node_count,
            self.recovered_node_count,
            self.unrecovered_diagnostic_count,
        ];
        counters.iter().any(|&count| count > 0)
    }

    /// Returns `true` when at least one recovered diagnostic exists and there
    /// are neither `Error` nodes nor unrecovered diagnostics.
    pub fn is_structured_recovery_only(&self) -> bool {
        if self.error_node_count != 0 || self.unrecovered_diagnostic_count != 0 {
            return false;
        }
        self.recovered_node_count != 0
    }
}
/// Summarises how much error recovery a parse required.
///
/// Walks `ast` (the node itself, then its children via `for_each_child`),
/// counting `Error` nodes and remembering the message of the one with the
/// smallest start offset; on equal offsets the node visited first is kept.
/// `diagnostics` is split into `ParseError::Recovered` entries and all
/// others to fill the two diagnostic counters.
pub fn classify_recovery_salvage(ast: &Node, diagnostics: &[ParseError]) -> RecoverySalvageMetrics {
    // Running totals gathered while traversing the tree.
    struct ErrorScan {
        count: usize,
        earliest_start: usize,
        earliest_message: Option<String>,
    }

    impl ErrorScan {
        // Records `node` if it is an `Error` node, then recurses into its children.
        fn visit(&mut self, node: &Node) {
            if let perl_ast::NodeKind::Error { message, .. } = &node.kind {
                self.count = self.count.saturating_add(1);
                let start = node.location.start;
                // Strict comparison: the first node seen at a given offset wins.
                if start < self.earliest_start {
                    self.earliest_start = start;
                    self.earliest_message = Some(message.clone());
                }
            }
            node.for_each_child(|child| self.visit(child));
        }
    }

    let mut scan = ErrorScan { count: 0, earliest_start: usize::MAX, earliest_message: None };
    scan.visit(ast);

    let recovered_node_count = diagnostics
        .iter()
        .filter(|diagnostic| matches!(diagnostic, ParseError::Recovered { .. }))
        .count();

    RecoverySalvageMetrics {
        recovered_node_count,
        unrecovered_diagnostic_count: diagnostics.len().saturating_sub(recovered_node_count),
        error_node_count: scan.count,
        first_unrecovered_error_node: scan.earliest_message,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use perl_ast::{Node, NodeKind, SourceLocation};

    // Builds an `Error` node with the given message and byte span; all other
    // fields are left empty.
    fn make_error_node(message: &str, start: usize, end: usize) -> Node {
        let kind = NodeKind::Error {
            message: message.to_string(),
            expected: vec![],
            found: None,
            partial: None,
        };
        Node::new(kind, SourceLocation { start, end })
    }

    // Wraps `children` in a `Program` root node with a fixed 0..100 span.
    fn make_program_node(children: Vec<Node>) -> Node {
        let span = SourceLocation { start: 0, end: 100 };
        Node::new(NodeKind::Program { statements: children }, span)
    }

    #[test]
    fn test_classify_unclosed_string() {
        // The source contains a single double quote, so quote parity is odd.
        let source = r#"my $x = "hello"#;
        let error_node = make_error_node("Unclosed string", 9, 15);

        let kind = ErrorClassifier::new().classify(&error_node, source);

        assert_eq!(kind, ParseErrorKind::UnclosedString);
    }

    #[test]
    fn test_classify_missing_semicolon() {
        // Offset 10 is the newline that ends the first, unterminated statement.
        let source = "my $x = 42\nmy $y = 10";
        let error = make_error_node("Unexpected token", 10, 11);

        let kind = ErrorClassifier::new().classify(&error, source);

        assert_eq!(kind, ParseErrorKind::MissingSemicolon);
    }

    #[test]
    fn clean_parse_produces_zero_metrics() {
        let root = make_program_node(vec![]);

        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert_eq!(metrics.error_node_count, 0);
        assert!(metrics.first_unrecovered_error_node.is_none());
        assert!(!metrics.is_dirty());
        assert!(!metrics.is_structured_recovery_only());
    }

    #[test]
    fn error_node_without_diagnostics_is_dirty_but_not_structured_recovery() {
        let root = make_program_node(vec![make_error_node("unexpected token", 5, 10)]);

        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.error_node_count, 1);
        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert!(metrics.is_dirty(), "error node alone makes result dirty");
        assert!(
            !metrics.is_structured_recovery_only(),
            "no recovery diagnostics — not structured-recovery-only"
        );
        assert_eq!(metrics.first_unrecovered_error_node.as_deref(), Some("unexpected token"));
    }

    #[test]
    fn multiple_error_nodes_reports_earliest_by_start_offset() {
        // Children are deliberately listed out of source order.
        let later = make_error_node("later error", 50, 60);
        let earlier = make_error_node("earlier error", 10, 20);
        let root = make_program_node(vec![later, earlier]);

        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.error_node_count, 2);
        assert_eq!(
            metrics.first_unrecovered_error_node.as_deref(),
            Some("earlier error"),
            "earliest by start offset must win"
        );
    }
}