Skip to main content

perl_parser_core/syntax/error/
classifier.rs

1//! Error classification and diagnostic generation for Perl parsing workflows
2//!
3//! This module provides intelligent error classification for parsing failures in Perl scripts,
4//! offering specific error types and recovery suggestions for LSP workflow operations.
5//!
6//! # LSP Workflow Integration
7//!
8//! Error classification supports robust Perl parsing across LSP workflow stages:
9//! - **Parse**: Classify syntax errors during parser construction
10//! - **Index**: Provide error context for symbol extraction and indexing
11//! - **Navigate**: Surface recovery hints for definition and reference resolution
12//! - **Complete**: Enable error-tolerant completion and quick fixes
13//! - **Analyze**: Drive diagnostics and remediation guidance
14//!
15//! # Usage Examples
16//!
17//! ```ignore
18//! use perl_parser::error_classifier::{ErrorClassifier, ParseErrorKind};
19//! use perl_parser::{Parser, ast::Node};
20//!
21//! let classifier = ErrorClassifier::new();
22//! let source = "my $value = \"unclosed string...";
23//! let mut parser = Parser::new(source);
24//! let _result = parser.parse(); // This will fail due to unclosed string
25//!
26//! // Classify parsing errors for better user feedback
27//! // let error_kind = classifier.classify(&error_node, source);
28//! // let message = classifier.get_diagnostic_message(&error_kind);
29//! // let suggestion = classifier.get_suggestion(&error_kind);
30//! ```
31
32use super::ParseError;
33use perl_ast::Node;
34
/// Specific types of parse errors found in Perl script content
///
/// Provides detailed categorization of parsing failures to enable targeted
/// error recovery strategies during LSP workflows.
///
/// Every payload is a plain `String`, so the type supports total equality
/// (`Eq`) in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseErrorKind {
    /// Parser encountered unexpected token during Perl script analysis
    UnexpectedToken {
        /// Token type that was expected during parsing
        expected: String,
        /// Actual token found in Perl script content
        found: String,
    },
    /// String literal not properly closed in Perl script
    UnclosedString,
    /// Regular expression pattern not properly closed
    UnclosedRegex,
    /// Code block (braces) not properly closed
    UnclosedBlock,
    /// Required semicolon missing in Perl script
    MissingSemicolon,
    /// General syntax error that matched no more specific category
    InvalidSyntax,
    /// Parenthesis not properly closed in expression
    UnclosedParenthesis,
    /// Array or hash bracket not properly closed
    UnclosedBracket,
    /// Hash or block brace not properly closed
    UnclosedBrace,
    /// Heredoc block not properly terminated
    UnterminatedHeredoc,
    /// Variable name does not follow Perl naming rules
    InvalidVariableName,
    /// Subroutine name does not follow Perl naming rules
    InvalidSubroutineName,
    /// Required operator missing in expression
    MissingOperator,
    /// Required operand missing in expression
    MissingOperand,
    /// Unexpected end of file during parsing
    UnexpectedEof,
}
77
/// Perl script error classification engine for LSP workflow operations
///
/// Analyzes parsing errors and provides specific error types with recovery suggestions
/// for robust Perl parsing workflows within enterprise LSP environments.
///
/// This is a unit struct with no fields, so it is trivially `Copy` and its
/// derived `Default` produces the same value as `ErrorClassifier::new()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct ErrorClassifier;
89
90impl ErrorClassifier {
91    /// Create new error classifier for Perl script analysis
92    ///
93    /// # Returns
94    ///
95    /// Configured classifier ready for LSP workflow error analysis
96    pub fn new() -> Self {
97        ErrorClassifier
98    }
99
100    /// Classify parsing error based on AST node and source context
101    ///
102    /// Analyzes error patterns in Perl script content to provide specific
103    /// error types for targeted recovery strategies during LSP workflow.
104    ///
105    /// # Arguments
106    ///
107    /// * `error_node` - AST node where error occurred
108    /// * `source` - Complete Perl script source code for context analysis
109    ///
110    /// # Returns
111    ///
112    /// Specific error type for targeted recovery during Perl parsing
113    pub fn classify(&self, error_node: &Node, source: &str) -> ParseErrorKind {
114        // Get the error text if available based on location
115        let error_text = {
116            let start = error_node.location.start;
117            let end = (start + 10).min(source.len()); // Look at next 10 chars
118            if start < source.len() && end <= source.len() && start <= end {
119                &source[start..end]
120            } else {
121                ""
122            }
123        };
124
125        // Check for common patterns - check the entire source for unclosed quotes
126        let quote_count = source.matches('"').count();
127        let single_quote_count = source.matches('\'').count();
128
129        // Check if we have unclosed quotes
130        if !quote_count.is_multiple_of(2) {
131            return ParseErrorKind::UnclosedString;
132        }
133        if !single_quote_count.is_multiple_of(2) {
134            return ParseErrorKind::UnclosedString;
135        }
136
137        // Also check the error text itself
138        if error_text.starts_with('"') && !error_text.ends_with('"') {
139            return ParseErrorKind::UnclosedString;
140        }
141
142        if error_text.starts_with('\'') && !error_text.ends_with('\'') {
143            return ParseErrorKind::UnclosedString;
144        }
145
146        if error_text.starts_with('/') && !error_text.contains("//") {
147            // Could be unclosed regex
148            if !error_text[1..].contains('/') {
149                return ParseErrorKind::UnclosedRegex;
150            }
151        }
152
153        // Check context around error
154        {
155            let pos = error_node.location.start;
156            let line_start = source[..pos].rfind('\n').map(|i| i + 1).unwrap_or(0);
157            let line_end = source[pos..].find('\n').map(|i| pos + i).unwrap_or(source.len());
158
159            let line = &source[line_start..line_end];
160
161            // Check for missing semicolon
162            if !line.trim().is_empty()
163                && !line.trim().ends_with(';')
164                && !line.trim().ends_with('{')
165                && !line.trim().ends_with('}')
166            {
167                // Look for common statement patterns
168                if line.contains("my ")
169                    || line.contains("our ")
170                    || line.contains("local ")
171                    || line.contains("print ")
172                    || line.contains("say ")
173                    || line.contains("return ")
174                {
175                    return ParseErrorKind::MissingSemicolon;
176                }
177            }
178
179            // Check for unclosed delimiters
180            let open_parens = line.matches('(').count();
181            let close_parens = line.matches(')').count();
182            if open_parens > close_parens {
183                return ParseErrorKind::UnclosedParenthesis;
184            }
185
186            let open_brackets = line.matches('[').count();
187            let close_brackets = line.matches(']').count();
188            if open_brackets > close_brackets {
189                return ParseErrorKind::UnclosedBracket;
190            }
191
192            let open_braces = line.matches('{').count();
193            let close_braces = line.matches('}').count();
194            if open_braces > close_braces {
195                return ParseErrorKind::UnclosedBrace;
196            }
197        }
198
199        // Check if we're at EOF
200        if error_node.location.start >= source.len() - 1 {
201            return ParseErrorKind::UnexpectedEof;
202        }
203
204        // Default to invalid syntax
205        ParseErrorKind::InvalidSyntax
206    }
207
208    /// Generate user-friendly diagnostic message for classified error
209    ///
210    /// Converts error classification into readable message for Perl script developers
211    /// during LSP workflow processing and debugging operations.
212    ///
213    /// # Arguments
214    ///
215    /// * `kind` - Classified error type from Perl script analysis
216    ///
217    /// # Returns
218    ///
219    /// Human-readable error message describing the parsing issue
220    pub fn get_diagnostic_message(&self, kind: &ParseErrorKind) -> String {
221        match kind {
222            ParseErrorKind::UnexpectedToken { expected, found } => {
223                format!("Expected {} but found {}", expected, found)
224            }
225            ParseErrorKind::UnclosedString => "Unclosed string literal".to_string(),
226            ParseErrorKind::UnclosedRegex => "Unclosed regular expression".to_string(),
227            ParseErrorKind::UnclosedBlock => "Unclosed code block - missing '}'".to_string(),
228            ParseErrorKind::MissingSemicolon => "Missing semicolon at end of statement".to_string(),
229            ParseErrorKind::InvalidSyntax => "Invalid syntax".to_string(),
230            ParseErrorKind::UnclosedParenthesis => "Unclosed parenthesis - missing ')'".to_string(),
231            ParseErrorKind::UnclosedBracket => "Unclosed bracket - missing ']'".to_string(),
232            ParseErrorKind::UnclosedBrace => "Unclosed brace - missing '}'".to_string(),
233            ParseErrorKind::UnterminatedHeredoc => "Unterminated heredoc".to_string(),
234            ParseErrorKind::InvalidVariableName => "Invalid variable name".to_string(),
235            ParseErrorKind::InvalidSubroutineName => "Invalid subroutine name".to_string(),
236            ParseErrorKind::MissingOperator => "Missing operator".to_string(),
237            ParseErrorKind::MissingOperand => "Missing operand".to_string(),
238            ParseErrorKind::UnexpectedEof => "Unexpected end of file".to_string(),
239        }
240    }
241
242    /// Generate recovery suggestion for classified parsing error
243    ///
244    /// Provides actionable recovery suggestions for Perl script developers
245    /// to resolve parsing issues during LSP workflow development.
246    ///
247    /// # Arguments
248    ///
249    /// * `kind` - Classified error type requiring recovery suggestion
250    ///
251    /// # Returns
252    ///
253    /// Optional recovery suggestion or None if no specific suggestion available
254    pub fn get_suggestion(&self, kind: &ParseErrorKind) -> Option<String> {
255        match kind {
256            ParseErrorKind::MissingSemicolon => {
257                Some("Add a semicolon ';' at the end of the statement".to_string())
258            }
259            ParseErrorKind::UnclosedString => {
260                Some("Add a closing quote to terminate the string".to_string())
261            }
262            ParseErrorKind::UnclosedParenthesis => {
263                Some("Add a closing parenthesis ')' to match the opening '('".to_string())
264            }
265            ParseErrorKind::UnclosedBracket => {
266                Some("Add a closing bracket ']' to match the opening '['".to_string())
267            }
268            ParseErrorKind::UnclosedBrace => {
269                Some("Add a closing brace '}' to match the opening '{'".to_string())
270            }
271            ParseErrorKind::UnclosedBlock => {
272                Some("Add a closing brace '}' to complete the code block".to_string())
273            }
274            ParseErrorKind::UnclosedRegex => {
275                Some("Add a closing delimiter to terminate the regex pattern".to_string())
276            }
277            ParseErrorKind::UnterminatedHeredoc => {
278                Some("Add the heredoc terminator marker on its own line".to_string())
279            }
280            ParseErrorKind::InvalidVariableName => {
281                Some("Variable names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
282            }
283            ParseErrorKind::InvalidSubroutineName => {
284                Some("Subroutine names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
285            }
286            ParseErrorKind::MissingOperator => {
287                Some("Add an operator between operands (e.g., +, -, *, /, ., ==, !=)".to_string())
288            }
289            ParseErrorKind::MissingOperand => {
290                Some("Add a value or expression after the operator".to_string())
291            }
292            ParseErrorKind::UnexpectedEof => {
293                Some("The file ended unexpectedly - check for unclosed blocks, strings, or parentheses".to_string())
294            }
295            ParseErrorKind::UnexpectedToken { expected, found: _ } => {
296                Some(format!("Expected {} at this location", expected))
297            }
298            ParseErrorKind::InvalidSyntax => None,
299        }
300    }
301
302    /// Get a detailed explanation for the error kind
303    ///
304    /// Provides additional context and explanation beyond the basic diagnostic message
305    /// to help developers understand the root cause of the error.
306    ///
307    /// # Arguments
308    ///
309    /// * `kind` - Classified error type
310    ///
311    /// # Returns
312    ///
313    /// Optional detailed explanation
314    pub fn get_explanation(&self, kind: &ParseErrorKind) -> Option<String> {
315        match kind {
316            ParseErrorKind::MissingSemicolon => {
317                Some("In Perl, most statements must end with a semicolon. The only exceptions are the last statement in a block and statements that end with a block (like if, while, sub, etc.).".to_string())
318            }
319            ParseErrorKind::UnclosedString => {
320                Some("String literals must be properly terminated with a matching quote. Use double quotes (\") for interpolated strings or single quotes (') for literal strings.".to_string())
321            }
322            ParseErrorKind::UnclosedRegex => {
323                Some("Regular expressions must be properly delimited. Common forms include /pattern/, m/pattern/, s/old/new/, and qr/pattern/.".to_string())
324            }
325            ParseErrorKind::UnterminatedHeredoc => {
326                Some("Heredoc blocks must have their terminator marker appear on a line by itself with no leading or trailing whitespace (unless using <<~MARKER for indented heredocs).".to_string())
327            }
328            ParseErrorKind::InvalidVariableName => {
329                Some("Perl variable names (after the sigil) must follow identifier rules: start with a letter (a-z, A-Z) or underscore (_), followed by any combination of letters, digits, or underscores.".to_string())
330            }
331            ParseErrorKind::UnclosedBlock => {
332                Some("Code blocks must have matching braces. Each opening '{' needs a corresponding closing '}'.".to_string())
333            }
334            _ => None,
335        }
336    }
337}
338
/// Per-file recovery-salvage metrics.
///
/// Used by accuracy closeout reporting to tell salvageable structured
/// recovery apart from unrecovered parser damage.
///
/// Differs from [`crate::RecoverySalvageProfile`] in two respects: it carries
/// `unrecovered_diagnostic_count` (non-recovery diagnostics) for finer
/// classification, and it offers the `is_dirty()` /
/// `is_structured_recovery_only()` helpers used by the corpus closeout reports.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RecoverySalvageMetrics {
    /// How many [`ParseError::Recovered`] diagnostics were seen.
    pub recovered_node_count: usize,
    /// How many diagnostics were not recoveries (`diagnostics.len() -
    /// recovered_node_count`).
    pub unrecovered_diagnostic_count: usize,
    /// How many `NodeKind::Error` nodes the AST contains.
    pub error_node_count: usize,
    /// Message of the `ERROR` node with the smallest start offset, when at
    /// least one exists.
    pub first_unrecovered_error_node: Option<String>,
}

impl RecoverySalvageMetrics {
    /// Reports whether any of the three counters is non-zero, i.e. the parse
    /// produced an error node, a recovered diagnostic, or an unrecovered
    /// diagnostic.
    pub fn is_dirty(&self) -> bool {
        let counters = [
            self.error_node_count,
            self.recovered_node_count,
            self.unrecovered_diagnostic_count,
        ];
        counters.iter().any(|&count| count != 0)
    }

    /// Reports whether the parse yielded structured recovery and nothing
    /// else: at least one recovered diagnostic, no `ERROR` AST nodes, and no
    /// other diagnostics.
    pub fn is_structured_recovery_only(&self) -> bool {
        let has_recoveries = self.recovered_node_count != 0;
        let damage_free = self.error_node_count == 0 && self.unrecovered_diagnostic_count == 0;
        has_recoveries && damage_free
    }
}
380
381/// Compute [`RecoverySalvageMetrics`] for a parsed AST and its diagnostics.
382///
383/// Walks the AST counting `NodeKind::Error` nodes, recording the earliest
384/// error message by start offset, and partitions `diagnostics` into recovered
385/// vs unrecovered counts.
386pub fn classify_recovery_salvage(ast: &Node, diagnostics: &[ParseError]) -> RecoverySalvageMetrics {
387    let mut error_node_count = 0usize;
388    let mut first_start = usize::MAX;
389    let mut first_unrecovered_error_node: Option<String> = None;
390
391    fn walk(
392        node: &Node,
393        error_node_count: &mut usize,
394        first_start: &mut usize,
395        first_unrecovered_error_node: &mut Option<String>,
396    ) {
397        if let perl_ast::NodeKind::Error { message, .. } = &node.kind {
398            *error_node_count = error_node_count.saturating_add(1);
399            if node.location.start < *first_start {
400                *first_start = node.location.start;
401                *first_unrecovered_error_node = Some(message.clone());
402            }
403        }
404        node.for_each_child(|child| {
405            walk(child, error_node_count, first_start, first_unrecovered_error_node);
406        });
407    }
408    walk(ast, &mut error_node_count, &mut first_start, &mut first_unrecovered_error_node);
409
410    let recovered_node_count =
411        diagnostics.iter().filter(|e| matches!(e, ParseError::Recovered { .. })).count();
412    let unrecovered_diagnostic_count = diagnostics.len().saturating_sub(recovered_node_count);
413
414    RecoverySalvageMetrics {
415        recovered_node_count,
416        unrecovered_diagnostic_count,
417        error_node_count,
418        first_unrecovered_error_node,
419    }
420}
421
#[cfg(test)]
mod tests {
    use super::*;
    use perl_ast::{Node, NodeKind, SourceLocation};

    // ── shared fixtures ──────────────────────────────────────────────────────

    /// Build a `NodeKind::Error` node with the given message and byte span and
    /// no expected/found/partial details.
    fn make_error_node(message: &str, start: usize, end: usize) -> Node {
        Node::new(
            NodeKind::Error {
                message: message.to_string(),
                expected: vec![],
                found: None,
                partial: None,
            },
            SourceLocation { start, end },
        )
    }

    /// Build a `NodeKind::Program` root spanning `0..100` with the given statements.
    fn make_program_node(children: Vec<Node>) -> Node {
        Node::new(NodeKind::Program { statements: children }, SourceLocation { start: 0, end: 100 })
    }

    // ── ErrorClassifier::classify unit tests ─────────────────────────────────

    #[test]
    fn test_classify_unclosed_string() {
        let classifier = ErrorClassifier::new();
        let source = r#"my $x = "hello"#;

        // Byte offsets in the source:
        //   m y _ $ x _ = _ " h e  l  l  o
        //   0 1 2 3 4 5 6 7 8 9 10 11 12 13
        // so the unterminated literal `"hello` spans 8..14.
        assert_eq!(&source[8..14], "\"hello");
        let error_node = make_error_node("Unclosed string", 8, 14);

        let kind = classifier.classify(&error_node, source);
        assert_eq!(kind, ParseErrorKind::UnclosedString);
    }

    #[test]
    fn test_classify_missing_semicolon() {
        let classifier = ErrorClassifier::new();
        let source = "my $x = 42\nmy $y = 10";

        // Simulate an error node on the newline that ends the first line.
        assert_eq!(&source[10..11], "\n");
        let error = make_error_node("Unexpected token", 10, 11);

        let kind = classifier.classify(&error, source);
        assert_eq!(kind, ParseErrorKind::MissingSemicolon);
    }

    // ── classify_recovery_salvage unit tests ─────────────────────────────────

    #[test]
    fn clean_parse_produces_zero_metrics() {
        // A clean AST with no Error nodes and no diagnostics is not dirty.
        let root = make_program_node(vec![]);
        let metrics = classify_recovery_salvage(&root, &[]);
        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert_eq!(metrics.error_node_count, 0);
        assert!(metrics.first_unrecovered_error_node.is_none());
        assert!(!metrics.is_dirty());
        assert!(!metrics.is_structured_recovery_only());
    }

    #[test]
    fn error_node_without_diagnostics_is_dirty_but_not_structured_recovery() {
        // Edge case: parser inserts an Error node but emits no diagnostic.
        // is_dirty() must return true; is_structured_recovery_only() must be false.
        let error = make_error_node("unexpected token", 5, 10);
        let root = make_program_node(vec![error]);
        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.error_node_count, 1);
        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert!(metrics.is_dirty(), "error node alone makes result dirty");
        assert!(
            !metrics.is_structured_recovery_only(),
            "no recovery diagnostics — not structured-recovery-only"
        );
        assert_eq!(metrics.first_unrecovered_error_node.as_deref(), Some("unexpected token"));
    }

    #[test]
    fn multiple_error_nodes_reports_earliest_by_start_offset() {
        // When multiple Error nodes are present, the first one by start offset
        // should be captured as first_unrecovered_error_node, regardless of
        // the order in which the nodes appear in the tree.
        let later = make_error_node("later error", 50, 60);
        let earlier = make_error_node("earlier error", 10, 20);
        let root = make_program_node(vec![later, earlier]);
        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.error_node_count, 2);
        assert_eq!(
            metrics.first_unrecovered_error_node.as_deref(),
            Some("earlier error"),
            "earliest by start offset must win"
        );
    }
}