Skip to main content

perl_parser_core/syntax/error/
classifier.rs

1//! Error classification and diagnostic generation for Perl parsing workflows
2//!
3//! This module provides intelligent error classification for parsing failures in Perl scripts,
4//! offering specific error types and recovery suggestions for LSP workflow operations.
5//!
6//! # LSP Workflow Integration
7//!
8//! Error classification supports robust Perl parsing across LSP workflow stages:
9//! - **Parse**: Classify syntax errors during parser construction
10//! - **Index**: Provide error context for symbol extraction and indexing
11//! - **Navigate**: Surface recovery hints for definition and reference resolution
12//! - **Complete**: Enable error-tolerant completion and quick fixes
13//! - **Analyze**: Drive diagnostics and remediation guidance
14//!
15//! # Usage Examples
16//!
17//! ```ignore
18//! use perl_parser::error_classifier::{ErrorClassifier, ParseErrorKind};
19//! use perl_parser::{Parser, ast::Node};
20//!
21//! let classifier = ErrorClassifier::new();
22//! let source = "my $value = \"unclosed string...";
23//! let mut parser = Parser::new(source);
24//! let _result = parser.parse(); // This will fail due to unclosed string
25//!
26//! // Classify parsing errors for better user feedback
27//! // let error_kind = classifier.classify(&error_node, source);
28//! // let message = classifier.get_diagnostic_message(&error_kind);
29//! // let suggestion = classifier.get_suggestion(&error_kind);
30//! ```
31
32use super::ParseError;
33use perl_ast::Node;
34
/// Specific categories of parse errors found in Perl source.
///
/// Each variant names one kind of parsing failure so that callers can attach
/// a targeted message, suggestion, or recovery strategy during LSP workflows.
///
/// Every payload is a plain `String`, so equality is total and the type
/// implements `Eq` in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseErrorKind {
    /// The parser met a token other than the one it required.
    UnexpectedToken {
        /// Description of the token that was expected.
        expected: String,
        /// Description of the token that was actually found.
        found: String,
    },
    /// String literal not properly closed.
    UnclosedString,
    /// Regular expression pattern not properly closed.
    UnclosedRegex,
    /// Code block (braces) not properly closed.
    UnclosedBlock,
    /// Required semicolon missing at the end of a statement.
    MissingSemicolon,
    /// General syntax error that fits no more specific category.
    InvalidSyntax,
    /// Parenthesis not properly closed in an expression.
    UnclosedParenthesis,
    /// Array or hash-slice bracket not properly closed.
    UnclosedBracket,
    /// Hash or block brace not properly closed.
    UnclosedBrace,
    /// Heredoc block not properly terminated.
    UnterminatedHeredoc,
    /// Variable name does not follow Perl naming rules.
    InvalidVariableName,
    /// Subroutine name does not follow Perl naming rules.
    InvalidSubroutineName,
    /// Required operator missing in an expression.
    MissingOperator,
    /// Required operand missing in an expression.
    MissingOperand,
    /// Unexpected end of file during parsing.
    UnexpectedEof,
}
77
/// Perl parse-error classification engine for LSP workflow operations.
///
/// Maps parsing errors to specific [`ParseErrorKind`] values and supplies
/// messages, suggestions, and explanations for them.
///
/// The type is a unit struct and carries no state, so it is `Copy` and can be
/// freely duplicated or embedded in other `Debug`/`Clone` types.
#[derive(Debug, Clone, Copy)]
pub struct ErrorClassifier;
83
84impl Default for ErrorClassifier {
85    fn default() -> Self {
86        Self::new()
87    }
88}
89
90impl ErrorClassifier {
91    /// Create new error classifier for Perl script analysis
92    ///
93    /// # Returns
94    ///
95    /// Configured classifier ready for LSP workflow error analysis
96    pub fn new() -> Self {
97        ErrorClassifier
98    }
99
100    /// Classify parsing error based on AST node and source context
101    ///
102    /// Analyzes error patterns in Perl script content to provide specific
103    /// error types for targeted recovery strategies during LSP workflow.
104    ///
105    /// # Arguments
106    ///
107    /// * `error_node` - AST node where error occurred
108    /// * `source` - Complete Perl script source code for context analysis
109    ///
110    /// # Returns
111    ///
112    /// Specific error type for targeted recovery during Perl parsing
113    pub fn classify(&self, error_node: &Node, source: &str) -> ParseErrorKind {
114        // Get the error text if available based on location.
115        //
116        // Both `start` and `end` must sit on char boundaries before we can index
117        // into a `&str`.  Parser locations are stored as byte offsets; if a
118        // multibyte character straddles the window boundary, a raw `source[s..e]`
119        // would panic.  We snap both bounds using `str::is_char_boundary`.
120        let error_text = {
121            let raw_start = error_node.location.start;
122            // Clamp start to [0, source.len()] and snap forward to the next
123            // char boundary so we never begin inside a multibyte sequence.
124            let start = {
125                let s = raw_start.min(source.len());
126                // Walk forward until we land on a char boundary (or reach the end).
127                let mut s = s;
128                while s < source.len() && !source.is_char_boundary(s) {
129                    s += 1;
130                }
131                s
132            };
133            // Look at up to 10 bytes ahead, but snap end *down* to the nearest
134            // char boundary so we never split a multibyte sequence at the tail.
135            let end = {
136                let mut e = (start + 10).min(source.len());
137                while e > start && !source.is_char_boundary(e) {
138                    e -= 1;
139                }
140                e
141            };
142            if start < source.len() && start <= end {
143                source.get(start..end).unwrap_or("")
144            } else {
145                ""
146            }
147        };
148
149        // Check for common patterns - check the entire source for unclosed quotes
150        let quote_count = source.matches('"').count();
151        let single_quote_count = source.matches('\'').count();
152
153        // Check if we have unclosed quotes
154        if !quote_count.is_multiple_of(2) {
155            return ParseErrorKind::UnclosedString;
156        }
157        if !single_quote_count.is_multiple_of(2) {
158            return ParseErrorKind::UnclosedString;
159        }
160
161        // Also check the error text itself
162        if error_text.starts_with('"') && !error_text.ends_with('"') {
163            return ParseErrorKind::UnclosedString;
164        }
165
166        if error_text.starts_with('\'') && !error_text.ends_with('\'') {
167            return ParseErrorKind::UnclosedString;
168        }
169
170        if error_text.starts_with('/') && !error_text.contains("//") {
171            // Could be unclosed regex
172            if !error_text[1..].contains('/') {
173                return ParseErrorKind::UnclosedRegex;
174            }
175        }
176
177        // Check context around error.
178        //
179        // `pos` is a byte offset from the parser and may not lie on a char
180        // boundary when the source contains multibyte characters.  We must snap
181        // it before slicing.  `rfind('\n')` and `find('\n')` are safe because
182        // '\n' (0x0A) is always a single-byte sequence and its position is
183        // therefore always a valid char boundary.
184        {
185            let raw_pos = error_node.location.start;
186            // Snap pos forward to the nearest char boundary (or source.len()).
187            let pos = {
188                let p = raw_pos.min(source.len());
189                let mut p = p;
190                while p < source.len() && !source.is_char_boundary(p) {
191                    p += 1;
192                }
193                p
194            };
195            // source[..pos] is now safe because pos is on a boundary.
196            let line_start = source[..pos].rfind('\n').map(|i| i + 1).unwrap_or(0);
197            let line_end = source[pos..].find('\n').map(|i| pos + i).unwrap_or(source.len());
198
199            let line = &source[line_start..line_end];
200
201            // Check for missing semicolon
202            if !line.trim().is_empty()
203                && !line.trim().ends_with(';')
204                && !line.trim().ends_with('{')
205                && !line.trim().ends_with('}')
206            {
207                // Look for common statement patterns
208                if line.contains("my ")
209                    || line.contains("our ")
210                    || line.contains("local ")
211                    || line.contains("print ")
212                    || line.contains("say ")
213                    || line.contains("return ")
214                {
215                    return ParseErrorKind::MissingSemicolon;
216                }
217            }
218
219            // Check for unclosed delimiters
220            let open_parens = line.matches('(').count();
221            let close_parens = line.matches(')').count();
222            if open_parens > close_parens {
223                return ParseErrorKind::UnclosedParenthesis;
224            }
225
226            let open_brackets = line.matches('[').count();
227            let close_brackets = line.matches(']').count();
228            if open_brackets > close_brackets {
229                return ParseErrorKind::UnclosedBracket;
230            }
231
232            let open_braces = line.matches('{').count();
233            let close_braces = line.matches('}').count();
234            if open_braces > close_braces {
235                return ParseErrorKind::UnclosedBrace;
236            }
237        }
238
239        // Check if we're at EOF; use saturating_sub to avoid underflow on empty source
240        if source.is_empty() || error_node.location.start >= source.len().saturating_sub(1) {
241            return ParseErrorKind::UnexpectedEof;
242        }
243
244        // Default to invalid syntax
245        ParseErrorKind::InvalidSyntax
246    }
247
248    /// Generate user-friendly diagnostic message for classified error
249    ///
250    /// Converts error classification into readable message for Perl script developers
251    /// during LSP workflow processing and debugging operations.
252    ///
253    /// # Arguments
254    ///
255    /// * `kind` - Classified error type from Perl script analysis
256    ///
257    /// # Returns
258    ///
259    /// Human-readable error message describing the parsing issue
260    pub fn get_diagnostic_message(&self, kind: &ParseErrorKind) -> String {
261        match kind {
262            ParseErrorKind::UnexpectedToken { expected, found } => {
263                format!("Expected {} but found {}", expected, found)
264            }
265            ParseErrorKind::UnclosedString => "Unclosed string literal".to_string(),
266            ParseErrorKind::UnclosedRegex => "Unclosed regular expression".to_string(),
267            ParseErrorKind::UnclosedBlock => "Unclosed code block - missing '}'".to_string(),
268            ParseErrorKind::MissingSemicolon => "Missing semicolon at end of statement".to_string(),
269            ParseErrorKind::InvalidSyntax => "Invalid syntax".to_string(),
270            ParseErrorKind::UnclosedParenthesis => "Unclosed parenthesis - missing ')'".to_string(),
271            ParseErrorKind::UnclosedBracket => "Unclosed bracket - missing ']'".to_string(),
272            ParseErrorKind::UnclosedBrace => "Unclosed brace - missing '}'".to_string(),
273            ParseErrorKind::UnterminatedHeredoc => "Unterminated heredoc".to_string(),
274            ParseErrorKind::InvalidVariableName => "Invalid variable name".to_string(),
275            ParseErrorKind::InvalidSubroutineName => "Invalid subroutine name".to_string(),
276            ParseErrorKind::MissingOperator => "Missing operator".to_string(),
277            ParseErrorKind::MissingOperand => "Missing operand".to_string(),
278            ParseErrorKind::UnexpectedEof => "Unexpected end of file".to_string(),
279        }
280    }
281
282    /// Generate recovery suggestion for classified parsing error
283    ///
284    /// Provides actionable recovery suggestions for Perl script developers
285    /// to resolve parsing issues during LSP workflow development.
286    ///
287    /// # Arguments
288    ///
289    /// * `kind` - Classified error type requiring recovery suggestion
290    ///
291    /// # Returns
292    ///
293    /// Optional recovery suggestion or None if no specific suggestion available
294    pub fn get_suggestion(&self, kind: &ParseErrorKind) -> Option<String> {
295        match kind {
296            ParseErrorKind::MissingSemicolon => {
297                Some("Add a semicolon ';' at the end of the statement".to_string())
298            }
299            ParseErrorKind::UnclosedString => {
300                Some("Add a closing quote to terminate the string".to_string())
301            }
302            ParseErrorKind::UnclosedParenthesis => {
303                Some("Add a closing parenthesis ')' to match the opening '('".to_string())
304            }
305            ParseErrorKind::UnclosedBracket => {
306                Some("Add a closing bracket ']' to match the opening '['".to_string())
307            }
308            ParseErrorKind::UnclosedBrace => {
309                Some("Add a closing brace '}' to match the opening '{'".to_string())
310            }
311            ParseErrorKind::UnclosedBlock => {
312                Some("Add a closing brace '}' to complete the code block".to_string())
313            }
314            ParseErrorKind::UnclosedRegex => {
315                Some("Add a closing delimiter to terminate the regex pattern".to_string())
316            }
317            ParseErrorKind::UnterminatedHeredoc => {
318                Some("Add the heredoc terminator marker on its own line".to_string())
319            }
320            ParseErrorKind::InvalidVariableName => {
321                Some("Variable names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
322            }
323            ParseErrorKind::InvalidSubroutineName => {
324                Some("Subroutine names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
325            }
326            ParseErrorKind::MissingOperator => {
327                Some("Add an operator between operands (e.g., +, -, *, /, ., ==, !=)".to_string())
328            }
329            ParseErrorKind::MissingOperand => {
330                Some("Add a value or expression after the operator".to_string())
331            }
332            ParseErrorKind::UnexpectedEof => {
333                Some("The file ended unexpectedly - check for unclosed blocks, strings, or parentheses".to_string())
334            }
335            ParseErrorKind::UnexpectedToken { expected, found: _ } => {
336                Some(format!("Expected {} at this location", expected))
337            }
338            ParseErrorKind::InvalidSyntax => None,
339        }
340    }
341
342    /// Get a detailed explanation for the error kind
343    ///
344    /// Provides additional context and explanation beyond the basic diagnostic message
345    /// to help developers understand the root cause of the error.
346    ///
347    /// # Arguments
348    ///
349    /// * `kind` - Classified error type
350    ///
351    /// # Returns
352    ///
353    /// Optional detailed explanation
354    pub fn get_explanation(&self, kind: &ParseErrorKind) -> Option<String> {
355        match kind {
356            ParseErrorKind::MissingSemicolon => {
357                Some("In Perl, most statements must end with a semicolon. The only exceptions are the last statement in a block and statements that end with a block (like if, while, sub, etc.).".to_string())
358            }
359            ParseErrorKind::UnclosedString => {
360                Some("String literals must be properly terminated with a matching quote. Use double quotes (\") for interpolated strings or single quotes (') for literal strings.".to_string())
361            }
362            ParseErrorKind::UnclosedRegex => {
363                Some("Regular expressions must be properly delimited. Common forms include /pattern/, m/pattern/, s/old/new/, and qr/pattern/.".to_string())
364            }
365            ParseErrorKind::UnterminatedHeredoc => {
366                Some("Heredoc blocks must have their terminator marker appear on a line by itself with no leading or trailing whitespace (unless using <<~MARKER for indented heredocs).".to_string())
367            }
368            ParseErrorKind::InvalidVariableName => {
369                Some("Perl variable names (after the sigil) must follow identifier rules: start with a letter (a-z, A-Z) or underscore (_), followed by any combination of letters, digits, or underscores.".to_string())
370            }
371            ParseErrorKind::UnclosedBlock => {
372                Some("Code blocks must have matching braces. Each opening '{' needs a corresponding closing '}'.".to_string())
373            }
374            _ => None,
375        }
376    }
377}
378
/// Per-file recovery-salvage counters.
///
/// Accuracy closeout reporting uses these numbers to tell salvageable
/// structured recovery apart from unrecovered parser damage.
///
/// Compared with [`crate::RecoverySalvageProfile`], this type additionally
/// tracks `unrecovered_diagnostic_count` (diagnostics that are not recovery
/// diagnostics) and offers the `is_dirty()` / `is_structured_recovery_only()`
/// predicates consumed by the corpus closeout reports.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RecoverySalvageMetrics {
    /// How many [`ParseError::Recovered`] diagnostics were seen.
    pub recovered_node_count: usize,
    /// How many diagnostics were *not* recovery diagnostics
    /// (`diagnostics.len() - recovered_node_count`).
    pub unrecovered_diagnostic_count: usize,
    /// How many `NodeKind::Error` nodes the AST contains.
    pub error_node_count: usize,
    /// Message of the `ERROR` node with the smallest start offset, when the
    /// AST has at least one.
    pub first_unrecovered_error_node: Option<String>,
}

impl RecoverySalvageMetrics {
    /// Reports whether anything at all went wrong: an error node, a recovered
    /// diagnostic, or an unrecovered diagnostic.
    pub fn is_dirty(&self) -> bool {
        let counters = [
            self.error_node_count,
            self.recovered_node_count,
            self.unrecovered_diagnostic_count,
        ];
        counters.iter().any(|&count| count != 0)
    }

    /// Reports whether the only trouble was structured recovery: at least one
    /// recovered diagnostic, no `ERROR` AST nodes, and no other diagnostics.
    pub fn is_structured_recovery_only(&self) -> bool {
        if self.recovered_node_count == 0 {
            return false;
        }
        self.error_node_count == 0 && self.unrecovered_diagnostic_count == 0
    }
}
420
421/// Compute [`RecoverySalvageMetrics`] for a parsed AST and its diagnostics.
422///
423/// Walks the AST counting `NodeKind::Error` nodes, recording the earliest
424/// error message by start offset, and partitions `diagnostics` into recovered
425/// vs unrecovered counts.
426pub fn classify_recovery_salvage(ast: &Node, diagnostics: &[ParseError]) -> RecoverySalvageMetrics {
427    let mut error_node_count = 0usize;
428    let mut first_start = usize::MAX;
429    let mut first_unrecovered_error_node: Option<String> = None;
430
431    fn walk(
432        node: &Node,
433        error_node_count: &mut usize,
434        first_start: &mut usize,
435        first_unrecovered_error_node: &mut Option<String>,
436    ) {
437        if let perl_ast::NodeKind::Error { message, .. } = &node.kind {
438            *error_node_count = error_node_count.saturating_add(1);
439            if node.location.start < *first_start {
440                *first_start = node.location.start;
441                *first_unrecovered_error_node = Some(message.clone());
442            }
443        }
444        node.for_each_child(|child| {
445            walk(child, error_node_count, first_start, first_unrecovered_error_node);
446        });
447    }
448    walk(ast, &mut error_node_count, &mut first_start, &mut first_unrecovered_error_node);
449
450    let recovered_node_count =
451        diagnostics.iter().filter(|e| matches!(e, ParseError::Recovered { .. })).count();
452    let unrecovered_diagnostic_count = diagnostics.len().saturating_sub(recovered_node_count);
453
454    RecoverySalvageMetrics {
455        recovered_node_count,
456        unrecovered_diagnostic_count,
457        error_node_count,
458        first_unrecovered_error_node,
459    }
460}
461
#[cfg(test)]
mod tests {
    use super::*;
    use perl_ast::{Node, NodeKind, SourceLocation};

    #[test]
    fn test_classify_unclosed_string() {
        let classifier = ErrorClassifier::new();
        let source = r#"my $x = "hello"#;

        // Manually construct the error node so that it covers the unterminated
        // literal. The opening quote sits at byte offset 8 and the source is
        // 14 bytes long:
        //
        //   m y _ $ x _ = _ " h e  l  l  o
        //   0 1 2 3 4 5 6 7 8 9 10 11 12 13
        let error_node = Node::new(
            NodeKind::Error {
                message: "Unclosed string".to_string(),
                expected: vec![],
                found: None,
                partial: None,
            },
            SourceLocation { start: 8, end: 14 }, // "hello
        );

        let kind = classifier.classify(&error_node, source);
        assert_eq!(kind, ParseErrorKind::UnclosedString);
    }

    #[test]
    fn test_classify_missing_semicolon() {
        let classifier = ErrorClassifier::new();
        let source = "my $x = 42\nmy $y = 10";

        // Simulate an error node at the end of the first line.
        let error = Node::new(
            NodeKind::Error {
                message: "Unexpected token".to_string(),
                expected: vec![],
                found: None,
                partial: None,
            },
            SourceLocation { start: 10, end: 11 }, // newline char
        );
        let kind = classifier.classify(&error, source);
        assert_eq!(kind, ParseErrorKind::MissingSemicolon);
    }

    // ── classify_recovery_salvage unit tests ─────────────────────────────────

    /// Builds an `Error` node with the given message and byte span.
    fn make_error_node(message: &str, start: usize, end: usize) -> Node {
        Node::new(
            NodeKind::Error {
                message: message.to_string(),
                expected: vec![],
                found: None,
                partial: None,
            },
            SourceLocation { start, end },
        )
    }

    /// Builds a `Program` root node holding `children` as its statements.
    fn make_program_node(children: Vec<Node>) -> Node {
        Node::new(NodeKind::Program { statements: children }, SourceLocation { start: 0, end: 100 })
    }

    #[test]
    fn clean_parse_produces_zero_metrics() {
        // A clean AST with no Error nodes and no diagnostics is not dirty.
        let root = make_program_node(vec![]);
        let metrics = classify_recovery_salvage(&root, &[]);
        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert_eq!(metrics.error_node_count, 0);
        assert!(metrics.first_unrecovered_error_node.is_none());
        assert!(!metrics.is_dirty());
        assert!(!metrics.is_structured_recovery_only());
    }

    #[test]
    fn error_node_without_diagnostics_is_dirty_but_not_structured_recovery() {
        // Edge case: parser inserts an Error node but emits no diagnostic.
        // is_dirty() must return true; is_structured_recovery_only() must be false.
        let error = make_error_node("unexpected token", 5, 10);
        let root = make_program_node(vec![error]);
        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.error_node_count, 1);
        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert!(metrics.is_dirty(), "error node alone makes result dirty");
        assert!(
            !metrics.is_structured_recovery_only(),
            "no recovery diagnostics — not structured-recovery-only"
        );
        assert_eq!(metrics.first_unrecovered_error_node.as_deref(), Some("unexpected token"));
    }

    #[test]
    fn multiple_error_nodes_reports_earliest_by_start_offset() {
        // When multiple Error nodes are present, the first one by start offset
        // should be captured as first_unrecovered_error_node.
        let later = make_error_node("later error", 50, 60);
        let earlier = make_error_node("earlier error", 10, 20);
        let root = make_program_node(vec![later, earlier]);
        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.error_node_count, 2);
        assert_eq!(
            metrics.first_unrecovered_error_node.as_deref(),
            Some("earlier error"),
            "earliest by start offset must win"
        );
    }
}