perl-parser-core 0.13.3

//! Error classification and diagnostic generation for Perl parsing workflows
//!
//! This module provides intelligent error classification for parsing failures in Perl scripts,
//! offering specific error types and recovery suggestions for LSP workflow operations.
//!
//! # LSP Workflow Integration
//!
//! Error classification supports robust Perl parsing across LSP workflow stages:
//! - **Parse**: Classify syntax errors during parser construction
//! - **Index**: Provide error context for symbol extraction and indexing
//! - **Navigate**: Surface recovery hints for definition and reference resolution
//! - **Complete**: Enable error-tolerant completion and quick fixes
//! - **Analyze**: Drive diagnostics and remediation guidance
//!
//! # Usage Examples
//!
//! ```ignore
//! use perl_parser::error_classifier::{ErrorClassifier, ParseErrorKind};
//! use perl_parser::{Parser, ast::Node};
//!
//! let classifier = ErrorClassifier::new();
//! let source = "my $value = \"unclosed string...";
//! let mut parser = Parser::new(source);
//! let _result = parser.parse(); // This will fail due to unclosed string
//!
//! // Classify parsing errors for better user feedback
//! // let error_kind = classifier.classify(&error_node, source);
//! // let message = classifier.get_diagnostic_message(&error_kind);
//! // let suggestion = classifier.get_suggestion(&error_kind);
//! ```

use super::ParseError;
use perl_ast::Node;

/// Category assigned to a Perl parse failure.
///
/// Each variant names one kind of problem so that callers can choose a
/// diagnostic message, a fix suggestion, or a recovery strategy per category
/// instead of handling one opaque "syntax error".
#[derive(Debug, Clone, PartialEq)]
pub enum ParseErrorKind {
    /// The parser found a token other than the one it required.
    UnexpectedToken {
        /// Description of the token the parser expected at this position
        expected: String,
        /// Description of the token that was actually present
        found: String,
    },
    /// A quoted string literal has no closing quote
    UnclosedString,
    /// A regular expression has no closing delimiter
    UnclosedRegex,
    /// A code block is missing its closing `}`
    UnclosedBlock,
    /// A statement is missing its terminating `;`
    MissingSemicolon,
    /// Generic syntax error; used when no more specific category applies
    InvalidSyntax,
    /// A `(` has no matching `)`
    UnclosedParenthesis,
    /// A `[` has no matching `]`
    UnclosedBracket,
    /// A `{` has no matching `}`
    UnclosedBrace,
    /// A heredoc is missing its terminator line
    UnterminatedHeredoc,
    /// A variable name does not follow Perl identifier rules
    InvalidVariableName,
    /// A subroutine name does not follow Perl identifier rules
    InvalidSubroutineName,
    /// An operator is missing between two operands
    MissingOperator,
    /// An operand is missing next to an operator
    MissingOperand,
    /// The input ended while the parser still expected more tokens
    UnexpectedEof,
}

/// Heuristic classifier that maps Perl parse-error nodes to a [`ParseErrorKind`].
///
/// Also turns a classified kind into a diagnostic message, a fix suggestion,
/// and a longer explanation for display by LSP tooling.
///
/// The type is a field-less unit struct, so it carries no state: copying it
/// is free and every instance behaves identically.
#[derive(Debug, Clone, Copy)]
pub struct ErrorClassifier;

impl Default for ErrorClassifier {
    fn default() -> Self {
        Self::new()
    }
}

impl ErrorClassifier {
    /// Create new error classifier for Perl script analysis
    ///
    /// # Returns
    ///
    /// Configured classifier ready for LSP workflow error analysis
    pub fn new() -> Self {
        ErrorClassifier
    }

    /// Classify parsing error based on AST node and source context
    ///
    /// Analyzes error patterns in Perl script content to provide specific
    /// error types for targeted recovery strategies during LSP workflow.
    ///
    /// # Arguments
    ///
    /// * `error_node` - AST node where error occurred
    /// * `source` - Complete Perl script source code for context analysis
    ///
    /// # Returns
    ///
    /// Specific error type for targeted recovery during Perl parsing
    pub fn classify(&self, error_node: &Node, source: &str) -> ParseErrorKind {
        // Get the error text if available based on location
        let error_text = {
            let start = error_node.location.start;
            let end = (start + 10).min(source.len()); // Look at next 10 chars
            if start < source.len() && end <= source.len() && start <= end {
                &source[start..end]
            } else {
                ""
            }
        };

        // Check for common patterns - check the entire source for unclosed quotes
        let quote_count = source.matches('"').count();
        let single_quote_count = source.matches('\'').count();

        // Check if we have unclosed quotes
        if !quote_count.is_multiple_of(2) {
            return ParseErrorKind::UnclosedString;
        }
        if !single_quote_count.is_multiple_of(2) {
            return ParseErrorKind::UnclosedString;
        }

        // Also check the error text itself
        if error_text.starts_with('"') && !error_text.ends_with('"') {
            return ParseErrorKind::UnclosedString;
        }

        if error_text.starts_with('\'') && !error_text.ends_with('\'') {
            return ParseErrorKind::UnclosedString;
        }

        if error_text.starts_with('/') && !error_text.contains("//") {
            // Could be unclosed regex
            if !error_text[1..].contains('/') {
                return ParseErrorKind::UnclosedRegex;
            }
        }

        // Check context around error
        {
            let pos = error_node.location.start;
            let line_start = source[..pos].rfind('\n').map(|i| i + 1).unwrap_or(0);
            let line_end = source[pos..].find('\n').map(|i| pos + i).unwrap_or(source.len());

            let line = &source[line_start..line_end];

            // Check for missing semicolon
            if !line.trim().is_empty()
                && !line.trim().ends_with(';')
                && !line.trim().ends_with('{')
                && !line.trim().ends_with('}')
            {
                // Look for common statement patterns
                if line.contains("my ")
                    || line.contains("our ")
                    || line.contains("local ")
                    || line.contains("print ")
                    || line.contains("say ")
                    || line.contains("return ")
                {
                    return ParseErrorKind::MissingSemicolon;
                }
            }

            // Check for unclosed delimiters
            let open_parens = line.matches('(').count();
            let close_parens = line.matches(')').count();
            if open_parens > close_parens {
                return ParseErrorKind::UnclosedParenthesis;
            }

            let open_brackets = line.matches('[').count();
            let close_brackets = line.matches(']').count();
            if open_brackets > close_brackets {
                return ParseErrorKind::UnclosedBracket;
            }

            let open_braces = line.matches('{').count();
            let close_braces = line.matches('}').count();
            if open_braces > close_braces {
                return ParseErrorKind::UnclosedBrace;
            }
        }

        // Check if we're at EOF
        if error_node.location.start >= source.len() - 1 {
            return ParseErrorKind::UnexpectedEof;
        }

        // Default to invalid syntax
        ParseErrorKind::InvalidSyntax
    }

    /// Generate user-friendly diagnostic message for classified error
    ///
    /// Converts error classification into readable message for Perl script developers
    /// during LSP workflow processing and debugging operations.
    ///
    /// # Arguments
    ///
    /// * `kind` - Classified error type from Perl script analysis
    ///
    /// # Returns
    ///
    /// Human-readable error message describing the parsing issue
    pub fn get_diagnostic_message(&self, kind: &ParseErrorKind) -> String {
        match kind {
            ParseErrorKind::UnexpectedToken { expected, found } => {
                format!("Expected {} but found {}", expected, found)
            }
            ParseErrorKind::UnclosedString => "Unclosed string literal".to_string(),
            ParseErrorKind::UnclosedRegex => "Unclosed regular expression".to_string(),
            ParseErrorKind::UnclosedBlock => "Unclosed code block - missing '}'".to_string(),
            ParseErrorKind::MissingSemicolon => "Missing semicolon at end of statement".to_string(),
            ParseErrorKind::InvalidSyntax => "Invalid syntax".to_string(),
            ParseErrorKind::UnclosedParenthesis => "Unclosed parenthesis - missing ')'".to_string(),
            ParseErrorKind::UnclosedBracket => "Unclosed bracket - missing ']'".to_string(),
            ParseErrorKind::UnclosedBrace => "Unclosed brace - missing '}'".to_string(),
            ParseErrorKind::UnterminatedHeredoc => "Unterminated heredoc".to_string(),
            ParseErrorKind::InvalidVariableName => "Invalid variable name".to_string(),
            ParseErrorKind::InvalidSubroutineName => "Invalid subroutine name".to_string(),
            ParseErrorKind::MissingOperator => "Missing operator".to_string(),
            ParseErrorKind::MissingOperand => "Missing operand".to_string(),
            ParseErrorKind::UnexpectedEof => "Unexpected end of file".to_string(),
        }
    }

    /// Generate recovery suggestion for classified parsing error
    ///
    /// Provides actionable recovery suggestions for Perl script developers
    /// to resolve parsing issues during LSP workflow development.
    ///
    /// # Arguments
    ///
    /// * `kind` - Classified error type requiring recovery suggestion
    ///
    /// # Returns
    ///
    /// Optional recovery suggestion or None if no specific suggestion available
    pub fn get_suggestion(&self, kind: &ParseErrorKind) -> Option<String> {
        match kind {
            ParseErrorKind::MissingSemicolon => {
                Some("Add a semicolon ';' at the end of the statement".to_string())
            }
            ParseErrorKind::UnclosedString => {
                Some("Add a closing quote to terminate the string".to_string())
            }
            ParseErrorKind::UnclosedParenthesis => {
                Some("Add a closing parenthesis ')' to match the opening '('".to_string())
            }
            ParseErrorKind::UnclosedBracket => {
                Some("Add a closing bracket ']' to match the opening '['".to_string())
            }
            ParseErrorKind::UnclosedBrace => {
                Some("Add a closing brace '}' to match the opening '{'".to_string())
            }
            ParseErrorKind::UnclosedBlock => {
                Some("Add a closing brace '}' to complete the code block".to_string())
            }
            ParseErrorKind::UnclosedRegex => {
                Some("Add a closing delimiter to terminate the regex pattern".to_string())
            }
            ParseErrorKind::UnterminatedHeredoc => {
                Some("Add the heredoc terminator marker on its own line".to_string())
            }
            ParseErrorKind::InvalidVariableName => {
                Some("Variable names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
            }
            ParseErrorKind::InvalidSubroutineName => {
                Some("Subroutine names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
            }
            ParseErrorKind::MissingOperator => {
                Some("Add an operator between operands (e.g., +, -, *, /, ., ==, !=)".to_string())
            }
            ParseErrorKind::MissingOperand => {
                Some("Add a value or expression after the operator".to_string())
            }
            ParseErrorKind::UnexpectedEof => {
                Some("The file ended unexpectedly - check for unclosed blocks, strings, or parentheses".to_string())
            }
            ParseErrorKind::UnexpectedToken { expected, found: _ } => {
                Some(format!("Expected {} at this location", expected))
            }
            ParseErrorKind::InvalidSyntax => None,
        }
    }

    /// Get a detailed explanation for the error kind
    ///
    /// Provides additional context and explanation beyond the basic diagnostic message
    /// to help developers understand the root cause of the error.
    ///
    /// # Arguments
    ///
    /// * `kind` - Classified error type
    ///
    /// # Returns
    ///
    /// Optional detailed explanation
    pub fn get_explanation(&self, kind: &ParseErrorKind) -> Option<String> {
        match kind {
            ParseErrorKind::MissingSemicolon => {
                Some("In Perl, most statements must end with a semicolon. The only exceptions are the last statement in a block and statements that end with a block (like if, while, sub, etc.).".to_string())
            }
            ParseErrorKind::UnclosedString => {
                Some("String literals must be properly terminated with a matching quote. Use double quotes (\") for interpolated strings or single quotes (') for literal strings.".to_string())
            }
            ParseErrorKind::UnclosedRegex => {
                Some("Regular expressions must be properly delimited. Common forms include /pattern/, m/pattern/, s/old/new/, and qr/pattern/.".to_string())
            }
            ParseErrorKind::UnterminatedHeredoc => {
                Some("Heredoc blocks must have their terminator marker appear on a line by itself with no leading or trailing whitespace (unless using <<~MARKER for indented heredocs).".to_string())
            }
            ParseErrorKind::InvalidVariableName => {
                Some("Perl variable names (after the sigil) must follow identifier rules: start with a letter (a-z, A-Z) or underscore (_), followed by any combination of letters, digits, or underscores.".to_string())
            }
            ParseErrorKind::UnclosedBlock => {
                Some("Code blocks must have matching braces. Each opening '{' needs a corresponding closing '}'.".to_string())
            }
            _ => None,
        }
    }
}

/// Recovery-salvage metrics computed for a single parsed file.
///
/// Used by accuracy closeout reporting to distinguish salvageable structured
/// recovery from unrecovered parser damage.
///
/// Distinct from [`crate::RecoverySalvageProfile`] in two ways: this type
/// carries `unrecovered_diagnostic_count` (non-recovery diagnostics) for finer
/// classification, and exposes `is_dirty()`/`is_structured_recovery_only()`
/// helpers used by the corpus closeout reports.
///
/// The `Default` value (all counts zero, no message) describes a clean parse.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RecoverySalvageMetrics {
    /// Number of [`ParseError::Recovered`] diagnostics observed.
    pub recovered_node_count: usize,
    /// Number of diagnostics that are not [`ParseError::Recovered`], i.e.
    /// `diagnostics.len() - recovered_node_count`.
    pub unrecovered_diagnostic_count: usize,
    /// Number of `NodeKind::Error` nodes observed in the AST.
    pub error_node_count: usize,
    /// Message of the `NodeKind::Error` node with the smallest start offset,
    /// or `None` when the AST contains no such node.
    pub first_unrecovered_error_node: Option<String>,
}

impl RecoverySalvageMetrics {
    /// Reports whether the parse left any trace of trouble: at least one
    /// error node, recovered diagnostic, or unrecovered diagnostic.
    pub fn is_dirty(&self) -> bool {
        let counters = [
            self.error_node_count,
            self.recovered_node_count,
            self.unrecovered_diagnostic_count,
        ];
        counters.iter().any(|&count| count > 0)
    }

    /// Reports whether structured recovery is the *only* thing that happened:
    /// one or more recovered diagnostics, zero `ERROR` AST nodes, and zero
    /// diagnostics of any other kind.
    pub fn is_structured_recovery_only(&self) -> bool {
        let observed = (
            self.recovered_node_count,
            self.error_node_count,
            self.unrecovered_diagnostic_count,
        );
        matches!(observed, (recovered, 0, 0) if recovered > 0)
    }
}

/// Build the [`RecoverySalvageMetrics`] for one parse result.
///
/// The AST rooted at `ast` is traversed depth-first (node before children).
/// Every `NodeKind::Error` node is counted, and the message of the one with
/// the smallest start offset is kept; on equal offsets the node visited first
/// wins. Independently, `diagnostics` is split into
/// [`ParseError::Recovered`] entries and everything else.
pub fn classify_recovery_salvage(ast: &Node, diagnostics: &[ParseError]) -> RecoverySalvageMetrics {
    /// Running totals gathered while traversing the tree.
    struct ErrorTally {
        count: usize,
        // Smallest start offset seen so far; `usize::MAX` means "none yet".
        earliest_start: usize,
        earliest_message: Option<String>,
    }

    /// Record `node` if it is an error node, then descend into its children.
    fn visit(node: &Node, tally: &mut ErrorTally) {
        if let perl_ast::NodeKind::Error { message, .. } = &node.kind {
            tally.count = tally.count.saturating_add(1);
            let start = node.location.start;
            if start < tally.earliest_start {
                tally.earliest_start = start;
                tally.earliest_message = Some(message.clone());
            }
        }
        node.for_each_child(|child| visit(child, tally));
    }

    let mut tally = ErrorTally { count: 0, earliest_start: usize::MAX, earliest_message: None };
    visit(ast, &mut tally);

    let mut recovered_node_count = 0usize;
    for diagnostic in diagnostics {
        if let ParseError::Recovered { .. } = diagnostic {
            recovered_node_count += 1;
        }
    }

    RecoverySalvageMetrics {
        recovered_node_count,
        unrecovered_diagnostic_count: diagnostics.len().saturating_sub(recovered_node_count),
        error_node_count: tally.count,
        first_unrecovered_error_node: tally.earliest_message,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use perl_ast::{Node, NodeKind, SourceLocation};

    #[test]
    fn test_classify_unclosed_string() {
        let classifier = ErrorClassifier::new();
        let source = r#"my $x = "hello"#;

        // Byte offsets within `source` (14 bytes long):
        //   m y _ $ x _ = _ " h e  l  l  o
        //   0 1 2 3 4 5 6 7 8 9 10 11 12 13
        // The unterminated literal `"hello` therefore spans 8..14, which is
        // the location given to the hand-built error node below.
        let error_node = Node::new(
            NodeKind::Error {
                message: "Unclosed string".to_string(),
                expected: vec![],
                found: None,
                partial: None,
            },
            SourceLocation { start: 8, end: 14 }, // "hello
        );

        let kind = classifier.classify(&error_node, source);
        assert_eq!(kind, ParseErrorKind::UnclosedString);
    }

    #[test]
    fn test_classify_missing_semicolon() {
        // The first statement has no terminating `;`.
        let source = "my $x = 42\nmy $y = 10";

        // Byte 10 is the newline that ends the first line; the error node is
        // placed there to mimic a parser stumbling at the end of the line.
        let newline_span = SourceLocation { start: 10, end: 11 };
        let error_kind = NodeKind::Error {
            message: "Unexpected token".to_string(),
            expected: vec![],
            found: None,
            partial: None,
        };
        let error = Node::new(error_kind, newline_span);

        let classified = ErrorClassifier::new().classify(&error, source);
        assert_eq!(classified, ParseErrorKind::MissingSemicolon);
    }

    // ── classify_recovery_salvage unit tests ─────────────────────────────────

    fn make_error_node(message: &str, start: usize, end: usize) -> Node {
        Node::new(
            NodeKind::Error {
                message: message.to_string(),
                expected: vec![],
                found: None,
                partial: None,
            },
            SourceLocation { start, end },
        )
    }

    /// Wraps `children` in a `NodeKind::Program` root with a fixed 0..100 span.
    fn make_program_node(children: Vec<Node>) -> Node {
        let span = SourceLocation { start: 0, end: 100 };
        let kind = NodeKind::Program { statements: children };
        Node::new(kind, span)
    }

    #[test]
    fn clean_parse_produces_zero_metrics() {
        // No Error nodes in the tree and an empty diagnostics list: every
        // counter stays at zero and neither status flag is raised.
        let empty_program = make_program_node(Vec::new());
        let metrics = classify_recovery_salvage(&empty_program, &[]);

        assert!(metrics.first_unrecovered_error_node.is_none());
        assert_eq!(metrics.error_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert_eq!(metrics.recovered_node_count, 0);
        assert!(!metrics.is_structured_recovery_only());
        assert!(!metrics.is_dirty());
    }

    #[test]
    fn error_node_without_diagnostics_is_dirty_but_not_structured_recovery() {
        // Edge case: the tree holds an Error node although the diagnostics
        // list is empty. The result must count as dirty, yet it cannot be
        // "structured recovery only" because nothing was recovered.
        let root = make_program_node(vec![make_error_node("unexpected token", 5, 10)]);
        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.first_unrecovered_error_node.as_deref(), Some("unexpected token"));
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.error_node_count, 1);
        assert!(
            !metrics.is_structured_recovery_only(),
            "no recovery diagnostics — not structured-recovery-only"
        );
        assert!(metrics.is_dirty(), "error node alone makes result dirty");
    }

    #[test]
    fn multiple_error_nodes_reports_earliest_by_start_offset() {
        // The children are deliberately listed out of source order: the node
        // with the smaller start offset must be reported even though it is
        // visited second.
        let children = vec![
            make_error_node("later error", 50, 60),
            make_error_node("earlier error", 10, 20),
        ];
        let metrics = classify_recovery_salvage(&make_program_node(children), &[]);

        assert_eq!(
            metrics.first_unrecovered_error_node.as_deref(),
            Some("earlier error"),
            "earliest by start offset must win"
        );
        assert_eq!(metrics.error_node_count, 2);
    }
}