Skip to main content

perl_parser/heredoc_anti_patterns/
detectors.rs

1use regex::Regex;
2use std::collections::HashSet;
3use std::sync::LazyLock;
4
5use crate::heredoc_anti_patterns::model::{AntiPattern, Diagnostic, Location, Severity};
6use crate::heredoc_anti_patterns::utils::{
7    build_line_starts, location_from_start, mask_non_code_regions,
8};
9
10/// Scans Perl source for heredoc-related anti-patterns and produces [`Diagnostic`]s.
11///
12/// Construct with [`AntiPatternDetector::new`], then call [`detect_all`] with the
13/// source text. The detector runs all seven built-in pattern checkers and returns
14/// the results sorted by byte offset so callers receive problems in source order.
15///
16/// [`detect_all`]: AntiPatternDetector::detect_all
17pub struct AntiPatternDetector {
18    patterns: Vec<Box<dyn PatternDetector>>,
19}
20
21trait PatternDetector: Send + Sync {
22    fn detect(
23        &self,
24        code: &str,
25        offset: usize,
26        line_starts: &[usize],
27    ) -> Vec<(AntiPattern, Location)>;
28    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic>;
29}
30
31// Format heredoc detector
32struct FormatHeredocDetector;
33
34/// Pattern for identifying format declarations
35static FORMAT_PATTERN: LazyLock<Regex> =
36    LazyLock::new(|| match Regex::new(r"(?m)^\s*format\s+(\w+)\s*=\s*$") {
37        Ok(re) => re,
38        Err(_) => unreachable!("FORMAT_PATTERN regex failed to compile"),
39    });
40
41/// Pattern for extracting heredoc delimiter declarations.
42static HEREDOC_DELIMITER_PATTERN: LazyLock<Regex> =
43    LazyLock::new(|| match Regex::new(r#"<<\s*['"`]?([A-Za-z_][A-Za-z0-9_]*)['"`]?"#) {
44        Ok(re) => re,
45        Err(_) => unreachable!("HEREDOC_DELIMITER_PATTERN regex failed to compile"),
46    });
47
48impl PatternDetector for FormatHeredocDetector {
49    fn detect(
50        &self,
51        code: &str,
52        offset: usize,
53        line_starts: &[usize],
54    ) -> Vec<(AntiPattern, Location)> {
55        let mut results = Vec::new();
56        let scan_code = mask_non_code_regions(code);
57
58        for cap in FORMAT_PATTERN.captures_iter(&scan_code) {
59            if let (Some(match_pos), Some(name_match)) = (cap.get(0), cap.get(1)) {
60                let format_name = name_match.as_str().to_string();
61                let location = location_from_start(line_starts, offset, match_pos.start());
62
63                // Look for heredoc marker inside format body (simplified)
64                let body_start = match_pos.end();
65                let body_end = code[body_start..].find("\n.").unwrap_or(code.len() - body_start);
66                let body = &scan_code[body_start..body_start + body_end];
67                let source_body = &code[body_start..body_start + body_end];
68
69                if body.contains("<<") {
70                    results.push((
71                        AntiPattern::FormatHeredoc {
72                            location: location.clone(),
73                            format_name,
74                            heredoc_delimiter: extract_heredoc_delimiter(source_body),
75                        },
76                        location,
77                    ));
78                }
79            }
80        }
81
82        results
83    }
84
85    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
86        let AntiPattern::FormatHeredoc { format_name, .. } = pattern else {
87            return None;
88        };
89
90        Some(Diagnostic {
91            severity: Severity::Warning,
92            pattern: pattern.clone(),
93            message: format!("Heredoc declared inside format '{}'", format_name),
94            explanation: "Heredocs inside format declarations are often handled specially by the Perl interpreter and can be difficult to parse statically.".to_string(),
95            suggested_fix: Some("Consider moving the heredoc outside the format or using a simple string if possible.".to_string()),
96            references: vec!["perldoc perlform".to_string()],
97        })
98    }
99}
100
101// BEGIN-time heredoc detector
102struct BeginTimeHeredocDetector;
103
104/// Pattern for identifying BEGIN block openings
105static BEGIN_BLOCK_START_PATTERN: LazyLock<Regex> =
106    LazyLock::new(|| match Regex::new(r"\bBEGIN\s*\{") {
107        Ok(re) => re,
108        Err(_) => unreachable!("BEGIN_BLOCK_START_PATTERN regex failed to compile"),
109    });
110
111fn extract_heredoc_delimiter(body: &str) -> String {
112    HEREDOC_DELIMITER_PATTERN
113        .captures(body)
114        .and_then(|captures| captures.get(1).map(|delimiter| delimiter.as_str().to_string()))
115        .unwrap_or_else(|| "UNKNOWN".to_string())
116}
117
/// Scans `code` from byte index `opening_brace_idx` and returns the index of
/// the `}` that brings the brace nesting back to zero.
///
/// Braces inside single- or double-quoted literals are ignored, and a
/// backslash inside a literal skips the byte that follows it. Returns `None`
/// when the input ends first, when a `}` is met before any `{`, or when the
/// start index lies beyond the end of `code`.
fn find_matching_brace(code: &str, opening_brace_idx: usize) -> Option<usize> {
    let tail = code.as_bytes().get(opening_brace_idx..)?;

    // Closing quote byte of the literal currently being skipped, if any.
    let mut open_quote: Option<u8> = None;
    // Set after a backslash inside a literal: the next byte is taken verbatim.
    let mut skip_next = false;
    let mut nesting = 0usize;

    for (rel, &b) in tail.iter().enumerate() {
        if skip_next {
            skip_next = false;
            continue;
        }

        if let Some(quote) = open_quote {
            if b == b'\\' {
                skip_next = true;
            } else if b == quote {
                open_quote = None;
            }
            continue;
        }

        match b {
            b'\'' | b'"' => open_quote = Some(b),
            b'{' => nesting += 1,
            b'}' => {
                // A closer with nothing open means the input is unbalanced.
                nesting = nesting.checked_sub(1)?;
                if nesting == 0 {
                    return Some(opening_brace_idx + rel);
                }
            }
            _ => {}
        }
    }

    None
}
170
171impl PatternDetector for BeginTimeHeredocDetector {
172    fn detect(
173        &self,
174        code: &str,
175        offset: usize,
176        line_starts: &[usize],
177    ) -> Vec<(AntiPattern, Location)> {
178        let mut results = Vec::new();
179        let scan_code = mask_non_code_regions(code);
180
181        for begin_match in BEGIN_BLOCK_START_PATTERN.find_iter(&scan_code) {
182            let Some(opening_brace_rel) = begin_match.as_str().rfind('{') else {
183                continue;
184            };
185            let opening_brace_idx = begin_match.start() + opening_brace_rel;
186            let Some(closing_brace_idx) = find_matching_brace(&scan_code, opening_brace_idx) else {
187                continue;
188            };
189            let block_content = &scan_code[opening_brace_idx + 1..closing_brace_idx];
190
191            if !block_content.contains("<<") {
192                continue;
193            }
194
195            let location = location_from_start(line_starts, offset, begin_match.start());
196
197            results.push((
198                AntiPattern::BeginTimeHeredoc {
199                    location: location.clone(),
200                    heredoc_content: block_content.to_string(),
201                    side_effects: vec!["Phase-dependent parsing".to_string()],
202                },
203                location,
204            ));
205        }
206
207        results
208    }
209
210    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
211        if let AntiPattern::BeginTimeHeredoc { .. } = pattern {
212            Some(Diagnostic {
213                severity: Severity::Error,
214                pattern: pattern.clone(),
215                message: "Heredoc declared during BEGIN-time".to_string(),
216                explanation: "Heredocs declared inside BEGIN blocks are evaluated during the compilation phase. This can lead to complex side effects that are difficult to track statically.".to_string(),
217                suggested_fix: Some("Move the heredoc declaration out of the BEGIN block if it doesn't need to be evaluated during compilation.".to_string()),
218                references: vec!["perldoc perlmod".to_string()],
219            })
220        } else {
221            None
222        }
223    }
224}
225
226// Dynamic delimiter detector
227struct DynamicDelimiterDetector;
228
229/// Pattern for identifying dynamic heredoc delimiters
230static DYNAMIC_DELIMITER_PATTERN: LazyLock<Regex> =
231    LazyLock::new(|| match Regex::new(r"<<\s*\$\{[^}]+\}|<<\s*\$\w+|<<\s*`[^`]+`") {
232        Ok(re) => re,
233        Err(_) => unreachable!("DYNAMIC_DELIMITER_PATTERN regex failed to compile"),
234    });
235
236impl PatternDetector for DynamicDelimiterDetector {
237    fn detect(
238        &self,
239        code: &str,
240        offset: usize,
241        line_starts: &[usize],
242    ) -> Vec<(AntiPattern, Location)> {
243        let mut results = Vec::new();
244        let scan_code = mask_non_code_regions(code);
245
246        for cap in DYNAMIC_DELIMITER_PATTERN.captures_iter(&scan_code) {
247            if let Some(match_pos) = cap.get(0) {
248                let expression = match_pos.as_str().to_string();
249                let location = location_from_start(line_starts, offset, match_pos.start());
250
251                results.push((
252                    AntiPattern::DynamicHeredocDelimiter { location: location.clone(), expression },
253                    location,
254                ));
255            }
256        }
257
258        results
259    }
260
261    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
262        let AntiPattern::DynamicHeredocDelimiter { expression, .. } = pattern else {
263            return None;
264        };
265
266        Some(Diagnostic {
267            severity: Severity::Warning,
268            pattern: pattern.clone(),
269            message: format!("Dynamic heredoc delimiter: {}", expression),
270            explanation: "Using variables or expressions as heredoc delimiters makes it impossible to know the terminator without executing the code.".to_string(),
271            suggested_fix: Some("Use a literal string as the heredoc terminator.".to_string()),
272            references: vec!["perldoc perlop".to_string()],
273        })
274    }
275}
276
277// Source filter detector
278struct SourceFilterDetector;
279
280/// Pattern for identifying common source filter modules
281static SOURCE_FILTER_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
282    match Regex::new(r"use\s+Filter::(Simple|Util::Call|cpp|exec|sh|decrypt|tee)") {
283        Ok(re) => re,
284        Err(_) => unreachable!("SOURCE_FILTER_PATTERN regex failed to compile"),
285    }
286});
287
288impl PatternDetector for SourceFilterDetector {
289    fn detect(
290        &self,
291        code: &str,
292        offset: usize,
293        line_starts: &[usize],
294    ) -> Vec<(AntiPattern, Location)> {
295        let mut results = Vec::new();
296        let scan_code = mask_non_code_regions(code);
297
298        for cap in SOURCE_FILTER_PATTERN.captures_iter(&scan_code) {
299            if let (Some(match_pos), Some(module_match)) = (cap.get(0), cap.get(1)) {
300                let filter_module = module_match.as_str().to_string();
301                let location = location_from_start(line_starts, offset, match_pos.start());
302
303                results.push((
304                    AntiPattern::SourceFilterHeredoc {
305                        location: location.clone(),
306                        module: filter_module,
307                    },
308                    location,
309                ));
310            }
311        }
312
313        results
314    }
315
316    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
317        let AntiPattern::SourceFilterHeredoc { module, .. } = pattern else {
318            return None;
319        };
320
321        Some(Diagnostic {
322            severity: Severity::Error,
323            pattern: pattern.clone(),
324            message: format!("Source filter detected: Filter::{}", module),
325            explanation: "Source filters rewrite the source code before it's parsed. Static analysis cannot reliably predict the state of the code after filtering.".to_string(),
326            suggested_fix: Some("Avoid using source filters. They are considered problematic and often replaced by better alternatives like Devel::Declare or modern Perl features.".to_string()),
327            references: vec!["perldoc Filter::Simple".to_string()],
328        })
329    }
330}
331
332// Regex heredoc detector
333struct RegexHeredocDetector;
334
335/// Pattern for identifying heredocs inside regex code blocks
336static REGEX_HEREDOC_PATTERN: LazyLock<Regex> =
337    LazyLock::new(|| match Regex::new(r"\(\?\{[^}]*<<[^}]*\}") {
338        Ok(re) => re,
339        Err(_) => unreachable!("REGEX_HEREDOC_PATTERN regex failed to compile"),
340    });
341
342impl PatternDetector for RegexHeredocDetector {
343    fn detect(
344        &self,
345        code: &str,
346        offset: usize,
347        line_starts: &[usize],
348    ) -> Vec<(AntiPattern, Location)> {
349        let mut results = Vec::new();
350        let scan_code = mask_non_code_regions(code);
351
352        for cap in REGEX_HEREDOC_PATTERN.captures_iter(&scan_code) {
353            if let Some(match_pos) = cap.get(0) {
354                let location = location_from_start(line_starts, offset, match_pos.start());
355
356                results.push((
357                    AntiPattern::RegexCodeBlockHeredoc { location: location.clone() },
358                    location,
359                ));
360            }
361        }
362
363        results
364    }
365
366    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
367        if let AntiPattern::RegexCodeBlockHeredoc { .. } = pattern {
368            Some(Diagnostic {
369                severity: Severity::Warning,
370                pattern: pattern.clone(),
371                message: "Heredoc inside regex code block".to_string(),
372                explanation: "Declaring heredocs inside (?{ ... }) or (??{ ... }) blocks is extremely rare and difficult to parse correctly.".to_string(),
373                suggested_fix: None,
374                references: vec!["perldoc perlre".to_string()],
375            })
376        } else {
377            None
378        }
379    }
380}
381
382// Eval heredoc detector
383struct EvalHeredocDetector;
384
385/// Pattern for identifying heredocs inside eval strings
386static EVAL_HEREDOC_PATTERN: LazyLock<Regex> =
387    LazyLock::new(|| match Regex::new(r#"eval\s+(?:'[^']*<<[^']*'|"[^"]*<<[^"]*")"#) {
388        Ok(re) => re,
389        Err(_) => unreachable!("EVAL_HEREDOC_PATTERN regex failed to compile"),
390    });
391
392impl PatternDetector for EvalHeredocDetector {
393    fn detect(
394        &self,
395        code: &str,
396        offset: usize,
397        line_starts: &[usize],
398    ) -> Vec<(AntiPattern, Location)> {
399        let mut results = Vec::new();
400
401        for cap in EVAL_HEREDOC_PATTERN.captures_iter(code) {
402            if let Some(match_pos) = cap.get(0) {
403                let location = location_from_start(line_starts, offset, match_pos.start());
404
405                results.push((
406                    AntiPattern::EvalStringHeredoc { location: location.clone() },
407                    location,
408                ));
409            }
410        }
411
412        results
413    }
414
415    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
416        if let AntiPattern::EvalStringHeredoc { .. } = pattern {
417            Some(Diagnostic {
418                severity: Severity::Warning,
419                pattern: pattern.clone(),
420                message: "Heredoc inside eval string".to_string(),
421                explanation: "Heredocs declared inside strings passed to eval require double parsing and can hide malicious or complex code.".to_string(),
422                suggested_fix: Some("Consider using a block eval or moving the heredoc outside the eval string.".to_string()),
423                references: vec!["perldoc -f eval".to_string()],
424            })
425        } else {
426            None
427        }
428    }
429}
430
431// Tied handle detector
432struct TiedHandleDetector;
433
434/// Pattern for identifying tie statements
435static TIE_PATTERN: LazyLock<Regex> = LazyLock::new(|| match Regex::new(r"tie\s+([*$]\w+)") {
436    Ok(re) => re,
437    Err(_) => unreachable!("TIE_PATTERN regex failed to compile"),
438});
439
440/// Pattern for identifying print statements that write heredocs to a handle.
441static PRINT_HEREDOC_PATTERN: LazyLock<Regex> =
442    LazyLock::new(|| match Regex::new(r"print\s+([*$]?\w+)\s+<<") {
443        Ok(re) => re,
444        Err(_) => unreachable!("PRINT_HEREDOC_PATTERN regex failed to compile"),
445    });
446
447impl PatternDetector for TiedHandleDetector {
448    fn detect(
449        &self,
450        code: &str,
451        offset: usize,
452        line_starts: &[usize],
453    ) -> Vec<(AntiPattern, Location)> {
454        let mut results = Vec::new();
455        let scan_code = mask_non_code_regions(code);
456
457        // First collect tied handles in normalized form:
458        // *FH -> FH, $fh -> $fh.
459        let mut tied_handles = HashSet::new();
460        for cap in TIE_PATTERN.captures_iter(&scan_code) {
461            if let Some(handle_match) = cap.get(1) {
462                let raw_handle = handle_match.as_str();
463                let normalized = raw_handle.strip_prefix('*').unwrap_or(raw_handle);
464                tied_handles.insert(normalized.to_string());
465            }
466        }
467
468        // Use a single static regex for all print-heredoc matches, then filter
469        // by whether the handle is in the tied set. This avoids O(n) Regex
470        // compilations (one per tied handle) and is faster for large files.
471        for cap in PRINT_HEREDOC_PATTERN.captures_iter(&scan_code) {
472            let (Some(match_pos), Some(handle_match)) = (cap.get(0), cap.get(1)) else {
473                continue;
474            };
475
476            let raw_print_handle = handle_match.as_str();
477            let normalized_print_handle =
478                raw_print_handle.strip_prefix('*').unwrap_or(raw_print_handle);
479
480            if tied_handles.contains(normalized_print_handle) {
481                let location = location_from_start(line_starts, offset, match_pos.start());
482                results.push((
483                    AntiPattern::TiedHandleHeredoc {
484                        location: location.clone(),
485                        handle_name: normalized_print_handle.to_string(),
486                    },
487                    location,
488                ));
489            }
490        }
491
492        results
493    }
494
495    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
496        let AntiPattern::TiedHandleHeredoc { handle_name, .. } = pattern else {
497            return None;
498        };
499
500        Some(Diagnostic {
501            severity: Severity::Info,
502            pattern: pattern.clone(),
503            message: format!("Heredoc written to tied handle '{}'", handle_name),
504            explanation: "Writing to a tied handle invokes custom code. The behavior of heredoc output depends on the tied class implementation.".to_string(),
505            suggested_fix: None,
506            references: vec!["perldoc -f tie".to_string()],
507        })
508    }
509}
510
511impl Default for AntiPatternDetector {
512    fn default() -> Self {
513        Self::new()
514    }
515}
516
517impl AntiPatternDetector {
518    /// Create a detector pre-loaded with all seven built-in pattern checkers.
519    pub fn new() -> Self {
520        Self {
521            patterns: vec![
522                Box::new(FormatHeredocDetector),
523                Box::new(BeginTimeHeredocDetector),
524                Box::new(DynamicDelimiterDetector),
525                Box::new(SourceFilterDetector),
526                Box::new(RegexHeredocDetector),
527                Box::new(EvalHeredocDetector),
528                Box::new(TiedHandleDetector),
529            ],
530        }
531    }
532
533    /// Run all pattern checkers against `code` and return diagnostics sorted by offset.
534    pub fn detect_all(&self, code: &str) -> Vec<Diagnostic> {
535        let mut diagnostics = Vec::new();
536        let line_starts = build_line_starts(code);
537
538        for detector in &self.patterns {
539            let patterns = detector.detect(code, 0, &line_starts);
540            for (pattern, _) in patterns {
541                if let Some(diagnostic) = detector.diagnose(&pattern) {
542                    diagnostics.push(diagnostic);
543                }
544            }
545        }
546
547        diagnostics.sort_by_key(|d| match &d.pattern {
548            AntiPattern::FormatHeredoc { location, .. }
549            | AntiPattern::BeginTimeHeredoc { location, .. }
550            | AntiPattern::DynamicHeredocDelimiter { location, .. }
551            | AntiPattern::SourceFilterHeredoc { location, .. }
552            | AntiPattern::RegexCodeBlockHeredoc { location, .. }
553            | AntiPattern::EvalStringHeredoc { location, .. }
554            | AntiPattern::TiedHandleHeredoc { location, .. } => location.offset,
555        });
556
557        diagnostics
558    }
559
560    /// Format a list of diagnostics as a human-readable plain-text report.
561    ///
562    /// Prints a header, a count, and one entry per diagnostic including its
563    /// severity, location, explanation, optional suggested fix, and references.
564    pub fn format_report(&self, diagnostics: &[Diagnostic]) -> String {
565        let mut report = String::from("Anti-Pattern Analysis Report\n");
566        report.push_str("============================\n\n");
567
568        if diagnostics.is_empty() {
569            report.push_str("No problematic patterns detected.\n");
570            return report;
571        }
572
573        report.push_str(&format!("Found {} problematic patterns:\n\n", diagnostics.len()));
574
575        for (i, diag) in diagnostics.iter().enumerate() {
576            report.push_str(&format!(
577                "{}. {} ({})\n",
578                i + 1,
579                diag.message,
580                match diag.severity {
581                    Severity::Error => "ERROR",
582                    Severity::Warning => "WARNING",
583                    Severity::Info => "INFO",
584                }
585            ));
586
587            report.push_str(&format!(
588                "   Location: {}\n",
589                match &diag.pattern {
590                    AntiPattern::FormatHeredoc { location, .. }
591                    | AntiPattern::BeginTimeHeredoc { location, .. }
592                    | AntiPattern::DynamicHeredocDelimiter { location, .. }
593                    | AntiPattern::SourceFilterHeredoc { location, .. }
594                    | AntiPattern::RegexCodeBlockHeredoc { location, .. }
595                    | AntiPattern::EvalStringHeredoc { location, .. }
596                    | AntiPattern::TiedHandleHeredoc { location, .. } =>
597                        format!("line {}, column {}", location.line, location.column),
598                }
599            ));
600
601            report.push_str(&format!("   Explanation: {}\n", diag.explanation));
602
603            if let Some(fix) = &diag.suggested_fix {
604                report.push_str(&format!(
605                    "   Suggested fix:\n     {}\n",
606                    fix.lines().collect::<Vec<_>>().join("\n     ")
607                ));
608            }
609
610            if !diag.references.is_empty() {
611                report.push_str(&format!("   References: {}\n", diag.references.join(", ")));
612            }
613
614            report.push('\n');
615        }
616
617        report
618    }
619}
620
621#[cfg(test)]
622mod tests;