perl_parser_core/syntax/error/
classifier.rs1use super::ParseError;
33use perl_ast::Node;
34
/// Category assigned to a parse error.
///
/// [`ErrorClassifier::classify`] produces a value of this type, and the
/// `get_*` helpers on [`ErrorClassifier`] turn it into user-facing text.
///
/// `Eq` is derived alongside `PartialEq` because every payload is a `String`,
/// so equality is total and the type can be used wherever `Eq` is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseErrorKind {
    /// A token other than the expected one was encountered.
    UnexpectedToken {
        /// Description of what was expected at this position.
        expected: String,
        /// Description of what was actually found.
        found: String,
    },
    /// A string literal is missing its closing quote.
    UnclosedString,
    /// A regular expression is missing its closing delimiter.
    UnclosedRegex,
    /// A code block is missing its closing `}`.
    UnclosedBlock,
    /// A statement is missing its terminating `;`.
    MissingSemicolon,
    /// Fallback for errors that match no more specific category.
    InvalidSyntax,
    /// A `(` has no matching `)`.
    UnclosedParenthesis,
    /// A `[` has no matching `]`.
    UnclosedBracket,
    /// A `{` has no matching `}`.
    UnclosedBrace,
    /// A heredoc is missing its terminator.
    UnterminatedHeredoc,
    /// A variable name is not a valid identifier.
    InvalidVariableName,
    /// A subroutine name is not a valid identifier.
    InvalidSubroutineName,
    /// Two operands appear without an operator between them.
    MissingOperator,
    /// An operator is missing one of its operands.
    MissingOperand,
    /// The input ended while more tokens were required.
    UnexpectedEof,
}
77
/// Stateless classifier that maps parser error nodes to a [`ParseErrorKind`]
/// and produces the matching user-facing messages.
///
/// The type has no fields, so every instance is interchangeable. The common
/// traits are derived (including `Default`, which previously was a hand-written
/// impl returning the same unit value) so the classifier can be copied, compared
/// and debug-printed freely.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ErrorClassifier;
89
90impl ErrorClassifier {
91 pub fn new() -> Self {
97 ErrorClassifier
98 }
99
100 pub fn classify(&self, error_node: &Node, source: &str) -> ParseErrorKind {
114 let error_text = {
121 let raw_start = error_node.location.start;
122 let start = {
125 let s = raw_start.min(source.len());
126 let mut s = s;
128 while s < source.len() && !source.is_char_boundary(s) {
129 s += 1;
130 }
131 s
132 };
133 let end = {
136 let mut e = (start + 10).min(source.len());
137 while e > start && !source.is_char_boundary(e) {
138 e -= 1;
139 }
140 e
141 };
142 if start < source.len() && start <= end {
143 source.get(start..end).unwrap_or("")
144 } else {
145 ""
146 }
147 };
148
149 let quote_count = source.matches('"').count();
151 let single_quote_count = source.matches('\'').count();
152
153 if !quote_count.is_multiple_of(2) {
155 return ParseErrorKind::UnclosedString;
156 }
157 if !single_quote_count.is_multiple_of(2) {
158 return ParseErrorKind::UnclosedString;
159 }
160
161 if error_text.starts_with('"') && !error_text.ends_with('"') {
163 return ParseErrorKind::UnclosedString;
164 }
165
166 if error_text.starts_with('\'') && !error_text.ends_with('\'') {
167 return ParseErrorKind::UnclosedString;
168 }
169
170 if error_text.starts_with('/') && !error_text.contains("//") {
171 if !error_text[1..].contains('/') {
173 return ParseErrorKind::UnclosedRegex;
174 }
175 }
176
177 {
185 let raw_pos = error_node.location.start;
186 let pos = {
188 let p = raw_pos.min(source.len());
189 let mut p = p;
190 while p < source.len() && !source.is_char_boundary(p) {
191 p += 1;
192 }
193 p
194 };
195 let line_start = source[..pos].rfind('\n').map(|i| i + 1).unwrap_or(0);
197 let line_end = source[pos..].find('\n').map(|i| pos + i).unwrap_or(source.len());
198
199 let line = &source[line_start..line_end];
200
201 if !line.trim().is_empty()
203 && !line.trim().ends_with(';')
204 && !line.trim().ends_with('{')
205 && !line.trim().ends_with('}')
206 {
207 if line.contains("my ")
209 || line.contains("our ")
210 || line.contains("local ")
211 || line.contains("print ")
212 || line.contains("say ")
213 || line.contains("return ")
214 {
215 return ParseErrorKind::MissingSemicolon;
216 }
217 }
218
219 let open_parens = line.matches('(').count();
221 let close_parens = line.matches(')').count();
222 if open_parens > close_parens {
223 return ParseErrorKind::UnclosedParenthesis;
224 }
225
226 let open_brackets = line.matches('[').count();
227 let close_brackets = line.matches(']').count();
228 if open_brackets > close_brackets {
229 return ParseErrorKind::UnclosedBracket;
230 }
231
232 let open_braces = line.matches('{').count();
233 let close_braces = line.matches('}').count();
234 if open_braces > close_braces {
235 return ParseErrorKind::UnclosedBrace;
236 }
237 }
238
239 if source.is_empty() || error_node.location.start >= source.len().saturating_sub(1) {
241 return ParseErrorKind::UnexpectedEof;
242 }
243
244 ParseErrorKind::InvalidSyntax
246 }
247
248 pub fn get_diagnostic_message(&self, kind: &ParseErrorKind) -> String {
261 match kind {
262 ParseErrorKind::UnexpectedToken { expected, found } => {
263 format!("Expected {} but found {}", expected, found)
264 }
265 ParseErrorKind::UnclosedString => "Unclosed string literal".to_string(),
266 ParseErrorKind::UnclosedRegex => "Unclosed regular expression".to_string(),
267 ParseErrorKind::UnclosedBlock => "Unclosed code block - missing '}'".to_string(),
268 ParseErrorKind::MissingSemicolon => "Missing semicolon at end of statement".to_string(),
269 ParseErrorKind::InvalidSyntax => "Invalid syntax".to_string(),
270 ParseErrorKind::UnclosedParenthesis => "Unclosed parenthesis - missing ')'".to_string(),
271 ParseErrorKind::UnclosedBracket => "Unclosed bracket - missing ']'".to_string(),
272 ParseErrorKind::UnclosedBrace => "Unclosed brace - missing '}'".to_string(),
273 ParseErrorKind::UnterminatedHeredoc => "Unterminated heredoc".to_string(),
274 ParseErrorKind::InvalidVariableName => "Invalid variable name".to_string(),
275 ParseErrorKind::InvalidSubroutineName => "Invalid subroutine name".to_string(),
276 ParseErrorKind::MissingOperator => "Missing operator".to_string(),
277 ParseErrorKind::MissingOperand => "Missing operand".to_string(),
278 ParseErrorKind::UnexpectedEof => "Unexpected end of file".to_string(),
279 }
280 }
281
282 pub fn get_suggestion(&self, kind: &ParseErrorKind) -> Option<String> {
295 match kind {
296 ParseErrorKind::MissingSemicolon => {
297 Some("Add a semicolon ';' at the end of the statement".to_string())
298 }
299 ParseErrorKind::UnclosedString => {
300 Some("Add a closing quote to terminate the string".to_string())
301 }
302 ParseErrorKind::UnclosedParenthesis => {
303 Some("Add a closing parenthesis ')' to match the opening '('".to_string())
304 }
305 ParseErrorKind::UnclosedBracket => {
306 Some("Add a closing bracket ']' to match the opening '['".to_string())
307 }
308 ParseErrorKind::UnclosedBrace => {
309 Some("Add a closing brace '}' to match the opening '{'".to_string())
310 }
311 ParseErrorKind::UnclosedBlock => {
312 Some("Add a closing brace '}' to complete the code block".to_string())
313 }
314 ParseErrorKind::UnclosedRegex => {
315 Some("Add a closing delimiter to terminate the regex pattern".to_string())
316 }
317 ParseErrorKind::UnterminatedHeredoc => {
318 Some("Add the heredoc terminator marker on its own line".to_string())
319 }
320 ParseErrorKind::InvalidVariableName => {
321 Some("Variable names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
322 }
323 ParseErrorKind::InvalidSubroutineName => {
324 Some("Subroutine names must start with a letter or underscore, followed by alphanumeric characters or underscores".to_string())
325 }
326 ParseErrorKind::MissingOperator => {
327 Some("Add an operator between operands (e.g., +, -, *, /, ., ==, !=)".to_string())
328 }
329 ParseErrorKind::MissingOperand => {
330 Some("Add a value or expression after the operator".to_string())
331 }
332 ParseErrorKind::UnexpectedEof => {
333 Some("The file ended unexpectedly - check for unclosed blocks, strings, or parentheses".to_string())
334 }
335 ParseErrorKind::UnexpectedToken { expected, found: _ } => {
336 Some(format!("Expected {} at this location", expected))
337 }
338 ParseErrorKind::InvalidSyntax => None,
339 }
340 }
341
342 pub fn get_explanation(&self, kind: &ParseErrorKind) -> Option<String> {
355 match kind {
356 ParseErrorKind::MissingSemicolon => {
357 Some("In Perl, most statements must end with a semicolon. The only exceptions are the last statement in a block and statements that end with a block (like if, while, sub, etc.).".to_string())
358 }
359 ParseErrorKind::UnclosedString => {
360 Some("String literals must be properly terminated with a matching quote. Use double quotes (\") for interpolated strings or single quotes (') for literal strings.".to_string())
361 }
362 ParseErrorKind::UnclosedRegex => {
363 Some("Regular expressions must be properly delimited. Common forms include /pattern/, m/pattern/, s/old/new/, and qr/pattern/.".to_string())
364 }
365 ParseErrorKind::UnterminatedHeredoc => {
366 Some("Heredoc blocks must have their terminator marker appear on a line by itself with no leading or trailing whitespace (unless using <<~MARKER for indented heredocs).".to_string())
367 }
368 ParseErrorKind::InvalidVariableName => {
369 Some("Perl variable names (after the sigil) must follow identifier rules: start with a letter (a-z, A-Z) or underscore (_), followed by any combination of letters, digits, or underscores.".to_string())
370 }
371 ParseErrorKind::UnclosedBlock => {
372 Some("Code blocks must have matching braces. Each opening '{' needs a corresponding closing '}'.".to_string())
373 }
374 _ => None,
375 }
376 }
377}
378
/// Summary of how much of a parse was salvaged by error recovery.
///
/// Built by [`classify_recovery_salvage`] from an AST and the diagnostics
/// emitted while parsing it.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct RecoverySalvageMetrics {
    /// Number of diagnostics that are `ParseError::Recovered`.
    pub recovered_node_count: usize,
    /// Number of diagnostics that are not `ParseError::Recovered`.
    pub unrecovered_diagnostic_count: usize,
    /// Number of `Error` nodes found in the AST.
    pub error_node_count: usize,
    /// Message of the `Error` node with the smallest start offset, if any.
    pub first_unrecovered_error_node: Option<String>,
}

impl RecoverySalvageMetrics {
    /// Returns `true` when any of the three counters is non-zero.
    pub fn is_dirty(&self) -> bool {
        let counters = [
            self.error_node_count,
            self.recovered_node_count,
            self.unrecovered_diagnostic_count,
        ];
        counters.iter().any(|&count| count > 0)
    }

    /// Returns `true` when there is at least one recovered diagnostic and
    /// neither `Error` nodes nor unrecovered diagnostics.
    pub fn is_structured_recovery_only(&self) -> bool {
        matches!(
            (
                self.recovered_node_count,
                self.error_node_count,
                self.unrecovered_diagnostic_count,
            ),
            (1.., 0, 0)
        )
    }
}
420
421pub fn classify_recovery_salvage(ast: &Node, diagnostics: &[ParseError]) -> RecoverySalvageMetrics {
427 let mut error_node_count = 0usize;
428 let mut first_start = usize::MAX;
429 let mut first_unrecovered_error_node: Option<String> = None;
430
431 fn walk(
432 node: &Node,
433 error_node_count: &mut usize,
434 first_start: &mut usize,
435 first_unrecovered_error_node: &mut Option<String>,
436 ) {
437 if let perl_ast::NodeKind::Error { message, .. } = &node.kind {
438 *error_node_count = error_node_count.saturating_add(1);
439 if node.location.start < *first_start {
440 *first_start = node.location.start;
441 *first_unrecovered_error_node = Some(message.clone());
442 }
443 }
444 node.for_each_child(|child| {
445 walk(child, error_node_count, first_start, first_unrecovered_error_node);
446 });
447 }
448 walk(ast, &mut error_node_count, &mut first_start, &mut first_unrecovered_error_node);
449
450 let recovered_node_count =
451 diagnostics.iter().filter(|e| matches!(e, ParseError::Recovered { .. })).count();
452 let unrecovered_diagnostic_count = diagnostics.len().saturating_sub(recovered_node_count);
453
454 RecoverySalvageMetrics {
455 recovered_node_count,
456 unrecovered_diagnostic_count,
457 error_node_count,
458 first_unrecovered_error_node,
459 }
460}
461
#[cfg(test)]
mod tests {
    use super::*;
    use perl_ast::{Node, NodeKind, SourceLocation};

    /// Builds an `Error` node with the given message and byte span; the
    /// remaining fields are left empty.
    fn make_error_node(message: &str, start: usize, end: usize) -> Node {
        let kind = NodeKind::Error {
            message: message.to_string(),
            expected: vec![],
            found: None,
            partial: None,
        };
        Node::new(kind, SourceLocation { start, end })
    }

    /// Wraps `children` in a `Program` node spanning offsets 0..100.
    fn make_program_node(children: Vec<Node>) -> Node {
        let span = SourceLocation { start: 0, end: 100 };
        Node::new(NodeKind::Program { statements: children }, span)
    }

    #[test]
    fn test_classify_unclosed_string() {
        // The source holds a single `"`, so the quote count is odd.
        let source = r#"my $x = "hello"#;
        let error_node = make_error_node("Unclosed string", 9, 15);

        let kind = ErrorClassifier::new().classify(&error_node, source);

        assert_eq!(kind, ParseErrorKind::UnclosedString);
    }

    #[test]
    fn test_classify_missing_semicolon() {
        // Offset 10 is the newline that ends the unterminated first statement.
        let source = "my $x = 42\nmy $y = 10";
        let error_node = make_error_node("Unexpected token", 10, 11);

        let kind = ErrorClassifier::new().classify(&error_node, source);

        assert_eq!(kind, ParseErrorKind::MissingSemicolon);
    }

    #[test]
    fn clean_parse_produces_zero_metrics() {
        let program = make_program_node(vec![]);

        let metrics = classify_recovery_salvage(&program, &[]);

        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert_eq!(metrics.error_node_count, 0);
        assert!(metrics.first_unrecovered_error_node.is_none());
        assert!(!metrics.is_dirty());
        assert!(!metrics.is_structured_recovery_only());
    }

    #[test]
    fn error_node_without_diagnostics_is_dirty_but_not_structured_recovery() {
        let program = make_program_node(vec![make_error_node("unexpected token", 5, 10)]);

        let metrics = classify_recovery_salvage(&program, &[]);

        assert_eq!(metrics.error_node_count, 1);
        assert_eq!(metrics.recovered_node_count, 0);
        assert_eq!(metrics.unrecovered_diagnostic_count, 0);
        assert!(metrics.is_dirty(), "error node alone makes result dirty");
        assert!(
            !metrics.is_structured_recovery_only(),
            "no recovery diagnostics — not structured-recovery-only"
        );
        assert_eq!(metrics.first_unrecovered_error_node.as_deref(), Some("unexpected token"));
    }

    #[test]
    fn multiple_error_nodes_reports_earliest_by_start_offset() {
        // The later node comes first so visit order alone cannot pass the test.
        let children =
            vec![make_error_node("later error", 50, 60), make_error_node("earlier error", 10, 20)];
        let program = make_program_node(children);

        let metrics = classify_recovery_salvage(&program, &[]);

        assert_eq!(metrics.error_node_count, 2);
        assert_eq!(
            metrics.first_unrecovered_error_node.as_deref(),
            Some("earlier error"),
            "earliest by start offset must win"
        );
    }
}