perl-parser-core 0.13.3

Core parser engine for perl-parser
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
//! Error classification and diagnostic generation for Perl parsing workflows
//!
//! This module provides intelligent error classification for parsing failures in Perl scripts,
//! offering specific error types and recovery suggestions for LSP workflow operations.
//!
//! # LSP Workflow Integration
//!
//! Error classification supports robust Perl parsing across LSP workflow stages:
//! - **Parse**: Classify syntax errors during parser construction
//! - **Index**: Provide error context for symbol extraction and indexing
//! - **Navigate**: Surface recovery hints for definition and reference resolution
//! - **Complete**: Enable error-tolerant completion and quick fixes
//! - **Analyze**: Drive diagnostics and remediation guidance
//!
//! # Usage Examples
//!
//! ```ignore
//! use perl_parser::error_classifier::{ErrorClassifier, ParseErrorKind};
//! use perl_parser::{Parser, ast::Node};
//!
//! let classifier = ErrorClassifier::new();
//! let source = "my $value = \"unclosed string...";
//! let mut parser = Parser::new(source);
//! let _result = parser.parse(); // This will fail due to unclosed string
//!
//! // Classify parsing errors for better user feedback
//! // let error_kind = classifier.classify(&error_node, source);
//! // let message = classifier.get_diagnostic_message(&error_kind);
//! // let suggestion = classifier.get_suggestion(&error_kind);
//! ```

use super::ParseError;
use perl_ast::Node;

/// Specific types of parse errors found in Perl script content
///
/// Provides detailed categorization of parsing failures to enable targeted
/// error recovery strategies during LSP workflows.
///
/// Every variant has a fixed diagnostic text in
/// `ErrorClassifier::get_diagnostic_message`, and every variant except
/// [`ParseErrorKind::InvalidSyntax`] has a recovery hint in
/// `ErrorClassifier::get_suggestion`.
#[derive(Debug, Clone, PartialEq)]
pub enum ParseErrorKind {
    /// Parser encountered unexpected token during Perl script analysis
    ///
    /// Both fields are interpolated verbatim into the diagnostic message.
    UnexpectedToken {
        /// Token type that was expected during parsing
        expected: String,
        /// Actual token found in Perl script content
        found: String,
    },
    /// String literal not properly closed in Perl script
    UnclosedString,
    /// Regular expression pattern not properly closed
    UnclosedRegex,
    /// Code block (braces) not properly closed
    ///
    /// The line-based heuristic in `ErrorClassifier::classify` reports brace
    /// imbalance as [`ParseErrorKind::UnclosedBrace`], not as this variant.
    UnclosedBlock,
    /// Required semicolon missing in Perl script
    MissingSemicolon,
    /// General syntax error in Perl parsing code
    ///
    /// Fallback result of `ErrorClassifier::classify` when no more specific
    /// pattern matches.
    InvalidSyntax,
    /// Parenthesis not properly closed in expression
    UnclosedParenthesis,
    /// Array or hash bracket not properly closed
    UnclosedBracket,
    /// Hash or block brace not properly closed
    UnclosedBrace,
    /// Heredoc block not properly terminated
    UnterminatedHeredoc,
    /// Variable name does not follow Perl naming rules
    InvalidVariableName,
    /// Subroutine name does not follow Perl naming rules
    InvalidSubroutineName,
    /// Required operator missing in expression
    MissingOperator,
    /// Required operand missing in expression
    MissingOperand,
    /// Unexpected end of file during parsing
    UnexpectedEof,
}

/// Perl script error classification engine for LSP workflow operations
///
/// Analyzes parsing errors and provides specific error types with recovery suggestions
/// for robust Perl parsing workflows within enterprise LSP environments.
///
/// The classifier is a stateless unit struct, so it is trivially copyable and
/// can be embedded in types that derive `Debug` or `Clone`.
#[derive(Debug, Clone, Copy)]
pub struct ErrorClassifier;

impl Default for ErrorClassifier {
    fn default() -> Self {
        Self::new()
    }
}

impl ErrorClassifier {
    /// Create new error classifier for Perl script analysis
    ///
    /// # Returns
    ///
    /// Configured classifier ready for LSP workflow error analysis
    pub fn new() -> Self {
        ErrorClassifier
    }

    /// Classify parsing error based on AST node and source context
    ///
    /// Analyzes error patterns in Perl script content to provide specific
    /// error types for targeted recovery strategies during LSP workflow.
    ///
    /// # Arguments
    ///
    /// * `error_node` - AST node where error occurred
    /// * `source` - Complete Perl script source code for context analysis
    ///
    /// # Returns
    ///
    /// Specific error type for targeted recovery during Perl parsing
    pub fn classify(&self, error_node: &Node, source: &str) -> ParseErrorKind {
        // Get the error text if available based on location
        let error_text = {
            let start = error_node.location.start;
            let end = (start + 10).min(source.len()); // Look at next 10 chars
            if start < source.len() && end <= source.len() && start <= end {
                &source[start..end]
            } else {
                ""
            }
        };

        // Check for common patterns - check the entire source for unclosed quotes
        let quote_count = source.matches('"').count();
        let single_quote_count = source.matches('\'').count();

        // Check if we have unclosed quotes
        if !quote_count.is_multiple_of(2) {
            return ParseErrorKind::UnclosedString;
        }
        if !single_quote_count.is_multiple_of(2) {
            return ParseErrorKind::UnclosedString;
        }

        // Also check the error text itself
        if error_text.starts_with('"') && !error_text.ends_with('"') {
            return ParseErrorKind::UnclosedString;
        }

        if error_text.starts_with('\'') && !error_text.ends_with('\'') {
            return ParseErrorKind::UnclosedString;
        }

        if error_text.starts_with('/') && !error_text.contains("//") {
            // Could be unclosed regex
            if !error_text[1..].contains('/') {
                return ParseErrorKind::UnclosedRegex;
            }
        }

        // Check context around error
        {
            let pos = error_node.location.start;
            let line_start = source[..pos].rfind('\n').map(|i| i + 1).unwrap_or(0);
            let line_end = source[pos..].find('\n').map(|i| pos + i).unwrap_or(source.len());

            let line = &source[line_start..line_end];

            // Check for missing semicolon
            if !line.trim().is_empty()
                && !line.trim().ends_with(';')
                && !line.trim().ends_with('{')
                && !line.trim().ends_with('}')
            {
                // Look for common statement patterns
                if line.contains("my ")
                    || line.contains("our ")
                    || line.contains("local ")
                    || line.contains("print ")
                    || line.contains("say ")
                    || line.contains("return ")
                {
                    return ParseErrorKind::MissingSemicolon;
                }
            }

            // Check for unclosed delimiters
            let open_parens = line.matches('(').count();
            let close_parens = line.matches(')').count();
            if open_parens > close_parens {
                return ParseErrorKind::UnclosedParenthesis;
            }

            let open_brackets = line.matches('[').count();
            let close_brackets = line.matches(']').count();
            if open_brackets > close_brackets {
                return ParseErrorKind::UnclosedBracket;
            }

            let open_braces = line.matches('{').count();
            let close_braces = line.matches('}').count();
            if open_braces > close_braces {
                return ParseErrorKind::UnclosedBrace;
            }
        }

        // Check if we're at EOF
        if error_node.location.start >= source.len() - 1 {
            return ParseErrorKind::UnexpectedEof;
        }

        // Default to invalid syntax
        ParseErrorKind::InvalidSyntax
    }

    /// Generate user-friendly diagnostic message for classified error
    ///
    /// Converts error classification into readable message for Perl script developers
    /// during LSP workflow processing and debugging operations.
    ///
    /// # Arguments
    ///
    /// * `kind` - Classified error type from Perl script analysis
    ///
    /// # Returns
    ///
    /// Human-readable error message describing the parsing issue
    pub fn get_diagnostic_message(&self, kind: &ParseErrorKind) -> String {
        match kind {
            ParseErrorKind::UnexpectedToken { expected, found } => {
                format!("Expected {} but found {}", expected, found)
            }
            ParseErrorKind::UnclosedString => "Unclosed string literal".to_string(),
            ParseErrorKind::UnclosedRegex => "Unclosed regular expression".to_string(),
            ParseErrorKind::UnclosedBlock => "Unclosed code block - missing '}'".to_string(),
            ParseErrorKind::MissingSemicolon => "Missing semicolon at end of statement".to_string(),
            ParseErrorKind::InvalidSyntax => "Invalid syntax".to_string(),
            ParseErrorKind::UnclosedParenthesis => "Unclosed parenthesis - missing ')'".to_string(),
            ParseErrorKind::UnclosedBracket => "Unclosed bracket - missing ']'".to_string(),
            ParseErrorKind::UnclosedBrace => "Unclosed brace - missing '}'".to_string(),
            ParseErrorKind::UnterminatedHeredoc => "Unterminated heredoc".to_string(),
            ParseErrorKind::InvalidVariableName => "Invalid variable name".to_string(),
            ParseErrorKind::InvalidSubroutineName => "Invalid subroutine name".to_string(),
            ParseErrorKind::MissingOperator => "Missing operator".to_string(),
            ParseErrorKind::MissingOperand => "Missing operand".to_string(),
            ParseErrorKind::UnexpectedEof => "Unexpected end of file".to_string(),
        }
    }

    /// Generate recovery suggestion for classified parsing error
    ///
    /// Provides actionable recovery suggestions for Perl script developers
    /// to resolve parsing issues during LSP workflow development.
    ///
    /// # Arguments
    ///
    /// * `kind` - Classified error type requiring recovery suggestion
    ///
    /// # Returns
    ///
    /// Optional recovery suggestion or None if no specific suggestion available
    pub fn get_suggestion(&self, kind: &ParseErrorKind) -> Option<String> {
        match kind {
            ParseErrorKind::MissingSemicolon => {
                Some("Add a semicolon ';' at the end of the statement".to_string())
            }
            ParseErrorKind::UnclosedString => {
                Some("Add a closing quote to terminate the string".to_string())
            }
            ParseErrorKind::UnclosedParenthesis => {
                Some("Add a closing parenthesis ')' to match the opening '('".to_string())
            }
            ParseErrorKind::UnclosedBracket => {
                Some("Add a closing bracket ']' to match the opening '['".to_string())
            }
            ParseErrorKind::UnclosedBrace => {
                Some("Add a closing brace '}' to match the opening '{'".to_string())
            }
            ParseErrorKind::UnclosedBlock => {
                Some("Add a closing brace '}' to complete the code block".to_string())
            }
            ParseErrorKind::UnclosedRegex => {
                Some("Add a closing delimiter to terminate the regex pattern".to_string())
            }
            ParseErrorKind::UnterminatedHeredoc => {
                Some("Add the heredoc terminator marker on its own line".to_string())
            }
            ParseErrorKind::InvalidVariableName => {
                Some("Variable names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
            }
            ParseErrorKind::InvalidSubroutineName => {
                Some("Subroutine names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
            }
            ParseErrorKind::MissingOperator => {
                Some("Add an operator between operands (e.g., +, -, *, /, ., ==, !=)".to_string())
            }
            ParseErrorKind::MissingOperand => {
                Some("Add a value or expression after the operator".to_string())
            }
            ParseErrorKind::UnexpectedEof => {
                Some("The file ended unexpectedly - check for unclosed blocks, strings, or parentheses".to_string())
            }
            ParseErrorKind::UnexpectedToken { expected, found: _ } => {
                Some(format!("Expected {} at this location", expected))
            }
            ParseErrorKind::InvalidSyntax => None,
        }
    }

    /// Get a detailed explanation for the error kind
    ///
    /// Provides additional context and explanation beyond the basic diagnostic message
    /// to help developers understand the root cause of the error.
    ///
    /// # Arguments
    ///
    /// * `kind` - Classified error type
    ///
    /// # Returns
    ///
    /// Optional detailed explanation
    pub fn get_explanation(&self, kind: &ParseErrorKind) -> Option<String> {
        match kind {
            ParseErrorKind::MissingSemicolon => {
                Some("In Perl, most statements must end with a semicolon. The only exceptions are the last statement in a block and statements that end with a block (like if, while, sub, etc.).".to_string())
            }
            ParseErrorKind::UnclosedString => {
                Some("String literals must be properly terminated with a matching quote. Use double quotes (\") for interpolated strings or single quotes (') for literal strings.".to_string())
            }
            ParseErrorKind::UnclosedRegex => {
                Some("Regular expressions must be properly delimited. Common forms include /pattern/, m/pattern/, s/old/new/, and qr/pattern/.".to_string())
            }
            ParseErrorKind::UnterminatedHeredoc => {
                Some("Heredoc blocks must have their terminator marker appear on a line by itself with no leading or trailing whitespace (unless using <<~MARKER for indented heredocs).".to_string())
            }
            ParseErrorKind::InvalidVariableName => {
                Some("Perl variable names (after the sigil) must follow identifier rules: start with a letter (a-z, A-Z) or underscore (_), followed by any combination of letters, digits, or underscores.".to_string())
            }
            ParseErrorKind::UnclosedBlock => {
                Some("Code blocks must have matching braces. Each opening '{' needs a corresponding closing '}'.".to_string())
            }
            _ => None,
        }
    }
}

/// Recovery-salvage metrics computed for a single parsed file.
///
/// Used by accuracy closeout reporting to distinguish salvageable structured
/// recovery from unrecovered parser damage.
///
/// Distinct from [`crate::RecoverySalvageProfile`] in two ways: this type
/// carries `unrecovered_diagnostic_count` (non-recovery diagnostics) for finer
/// classification, and exposes `is_dirty()`/`is_structured_recovery_only()`
/// helpers used by the corpus closeout reports.
///
/// The `Default` value has every counter at zero and no error message, which
/// is what [`classify_recovery_salvage`] produces for a clean parse.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RecoverySalvageMetrics {
    /// Number of [`ParseError::Recovered`] diagnostics observed.
    ///
    /// Despite the name this counts diagnostics, not AST nodes.
    pub recovered_node_count: usize,
    /// Number of non-recovery diagnostics observed (`diagnostics.len() -
    /// recovered_node_count`).
    pub unrecovered_diagnostic_count: usize,
    /// Number of `NodeKind::Error` nodes observed in the AST.
    pub error_node_count: usize,
    /// Message from the earliest unrecovered `ERROR` node (by start offset),
    /// if any.
    ///
    /// When several error nodes share the smallest start offset, the one
    /// visited first by [`classify_recovery_salvage`] is kept.
    pub first_unrecovered_error_node: Option<String>,
}

impl RecoverySalvageMetrics {
    /// Reports whether anything at all went wrong: at least one of the three
    /// counters (error nodes, recovered diagnostics, unrecovered diagnostics)
    /// is non-zero.
    pub fn is_dirty(&self) -> bool {
        let counters = [
            self.error_node_count,
            self.recovered_node_count,
            self.unrecovered_diagnostic_count,
        ];
        counters.iter().any(|&count| count != 0)
    }

    /// Reports whether the only evidence of trouble is structured recovery:
    /// one or more recovered diagnostics, zero `ERROR` AST nodes and zero
    /// other diagnostics.
    pub fn is_structured_recovery_only(&self) -> bool {
        match (self.error_node_count, self.unrecovered_diagnostic_count) {
            (0, 0) => self.recovered_node_count != 0,
            _ => false,
        }
    }
}

/// Build the [`RecoverySalvageMetrics`] summary for one parse result.
///
/// The AST is traversed depth-first from `ast`: every `NodeKind::Error` node
/// is counted, and the message of the one with the smallest start offset is
/// remembered (on equal offsets the node visited first is kept). The
/// `diagnostics` slice is split into [`ParseError::Recovered`] entries and
/// everything else, giving the recovered and unrecovered counts.
pub fn classify_recovery_salvage(ast: &Node, diagnostics: &[ParseError]) -> RecoverySalvageMetrics {
    /// Running totals gathered while walking the tree.
    struct ErrorTally {
        /// Error nodes seen so far.
        seen: usize,
        /// Start offset and message of the earliest error node seen so far.
        earliest: Option<(usize, String)>,
    }

    fn visit(node: &Node, tally: &mut ErrorTally) {
        if let perl_ast::NodeKind::Error { message, .. } = &node.kind {
            tally.seen = tally.seen.saturating_add(1);
            // With nothing recorded yet the bar is `usize::MAX`, so a node that
            // starts exactly there is never recorded.
            let current_best = tally.earliest.as_ref().map_or(usize::MAX, |(offset, _)| *offset);
            let offset = node.location.start;
            if offset < current_best {
                tally.earliest = Some((offset, message.clone()));
            }
        }
        node.for_each_child(|child| visit(child, tally));
    }

    let mut tally = ErrorTally { seen: 0, earliest: None };
    visit(ast, &mut tally);

    // Single pass over the diagnostics, sorting each into one of two buckets.
    let (recovered_node_count, unrecovered_diagnostic_count) =
        diagnostics.iter().fold((0usize, 0usize), |(recovered, other), diagnostic| {
            if matches!(diagnostic, ParseError::Recovered { .. }) {
                (recovered + 1, other)
            } else {
                (recovered, other + 1)
            }
        });

    RecoverySalvageMetrics {
        recovered_node_count,
        unrecovered_diagnostic_count,
        error_node_count: tally.seen,
        first_unrecovered_error_node: tally.earliest.map(|(_, message)| message),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use perl_ast::{Node, NodeKind, SourceLocation};

    /// Build a bare `NodeKind::Error` node spanning `start..end` with the given message.
    fn make_error_node(message: &str, start: usize, end: usize) -> Node {
        Node::new(
            NodeKind::Error {
                message: message.to_string(),
                expected: vec![],
                found: None,
                partial: None,
            },
            SourceLocation { start, end },
        )
    }

    /// Wrap `children` in a `NodeKind::Program` root covering bytes 0..100.
    fn make_program_node(children: Vec<Node>) -> Node {
        Node::new(NodeKind::Program { statements: children }, SourceLocation { start: 0, end: 100 })
    }

    // ── ErrorClassifier::classify unit tests ─────────────────────────────────

    #[test]
    fn test_classify_unclosed_string() {
        let classifier = ErrorClassifier::new();
        let source = r#"my $x = "hello"#;

        // Byte layout of the source:
        //   m y _ $ x _ = _ " h e l l o
        //   0 1 2 3 4 5 6 7 8 9 ...   13
        // The opening quote sits at offset 8 and the 14-byte source ends right
        // after `hello`, so the error node spans 8..source.len().
        let error_node = make_error_node("Unclosed string", 8, source.len());

        let kind = classifier.classify(&error_node, source);
        assert_eq!(kind, ParseErrorKind::UnclosedString);
    }

    #[test]
    fn test_classify_missing_semicolon() {
        let classifier = ErrorClassifier::new();
        let source = "my $x = 42\nmy $y = 10";

        // Simulate an error node at the end of the first line: offset 10 is
        // the newline character that follows `42`.
        let error = make_error_node("Unexpected token", 10, 11);

        let kind = classifier.classify(&error, source);
        assert_eq!(kind, ParseErrorKind::MissingSemicolon);
    }

    // ── classify_recovery_salvage unit tests ─────────────────────────────────

    #[test]
    fn clean_parse_produces_zero_metrics() {
        // A clean AST with no Error nodes and no diagnostics is not dirty.
        let root = make_program_node(vec![]);
        let metrics = classify_recovery_salvage(&root, &[]);
        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert_eq!(metrics.error_node_count, 0);
        assert!(metrics.first_unrecovered_error_node.is_none());
        assert!(!metrics.is_dirty());
        assert!(!metrics.is_structured_recovery_only());
    }

    #[test]
    fn error_node_without_diagnostics_is_dirty_but_not_structured_recovery() {
        // Edge case: parser inserts an Error node but emits no diagnostic.
        // is_dirty() must return true; is_structured_recovery_only() must be false.
        let error = make_error_node("unexpected token", 5, 10);
        let root = make_program_node(vec![error]);
        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.error_node_count, 1);
        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert!(metrics.is_dirty(), "error node alone makes result dirty");
        assert!(
            !metrics.is_structured_recovery_only(),
            "no recovery diagnostics — not structured-recovery-only"
        );
        assert_eq!(metrics.first_unrecovered_error_node.as_deref(), Some("unexpected token"));
    }

    #[test]
    fn multiple_error_nodes_reports_earliest_by_start_offset() {
        // When multiple Error nodes are present, the first one by start offset
        // should be captured as first_unrecovered_error_node.
        let later = make_error_node("later error", 50, 60);
        let earlier = make_error_node("earlier error", 10, 20);
        let root = make_program_node(vec![later, earlier]);
        let metrics = classify_recovery_salvage(&root, &[]);

        assert_eq!(metrics.error_node_count, 2);
        assert_eq!(
            metrics.first_unrecovered_error_node.as_deref(),
            Some("earlier error"),
            "earliest by start offset must win"
        );
    }
}