Skip to main content

perl_parser/heredoc_anti_patterns/
detectors.rs

1use regex::Regex;
2use std::collections::HashSet;
3use std::sync::LazyLock;
4
5use crate::heredoc_anti_patterns::model::{AntiPattern, Diagnostic, Location, Severity};
6use crate::heredoc_anti_patterns::utils::{
7    build_line_starts, location_from_start, mask_non_code_regions,
8};
9
10/// Scans Perl source for heredoc-related anti-patterns and produces [`Diagnostic`]s.
11///
12/// Construct with [`AntiPatternDetector::new`], then call [`detect_all`] with the
13/// source text. The detector runs all seven built-in pattern checkers and returns
14/// the results sorted by byte offset so callers receive problems in source order.
15///
16/// [`detect_all`]: AntiPatternDetector::detect_all
17pub struct AntiPatternDetector {
18    patterns: Vec<Box<dyn PatternDetector>>,
19}
20
21trait PatternDetector: Send + Sync {
22    fn detect(
23        &self,
24        code: &str,
25        offset: usize,
26        line_starts: &[usize],
27    ) -> Vec<(AntiPattern, Location)>;
28    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic>;
29}
30
31// Format heredoc detector
32struct FormatHeredocDetector;
33
34/// Pattern for identifying format declarations
35static FORMAT_PATTERN: LazyLock<Regex> =
36    LazyLock::new(|| match Regex::new(r"(?m)^\s*format\s+(\w+)\s*=\s*$") {
37        Ok(re) => re,
38        Err(_) => unreachable!("FORMAT_PATTERN regex failed to compile"),
39    });
40
41impl PatternDetector for FormatHeredocDetector {
42    fn detect(
43        &self,
44        code: &str,
45        offset: usize,
46        line_starts: &[usize],
47    ) -> Vec<(AntiPattern, Location)> {
48        let mut results = Vec::new();
49        let scan_code = mask_non_code_regions(code);
50
51        for cap in FORMAT_PATTERN.captures_iter(&scan_code) {
52            if let (Some(match_pos), Some(name_match)) = (cap.get(0), cap.get(1)) {
53                let format_name = name_match.as_str().to_string();
54                let location = location_from_start(line_starts, offset, match_pos.start());
55
56                // Look for heredoc marker inside format body (simplified)
57                let body_start = match_pos.end();
58                let body_end = code[body_start..].find("\n.").unwrap_or(code.len() - body_start);
59                let body = &scan_code[body_start..body_start + body_end];
60
61                if body.contains("<<") {
62                    results.push((
63                        AntiPattern::FormatHeredoc {
64                            location: location.clone(),
65                            format_name,
66                            heredoc_delimiter: "UNKNOWN".to_string(), // Would need better extraction
67                        },
68                        location,
69                    ));
70                }
71            }
72        }
73
74        results
75    }
76
77    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
78        let AntiPattern::FormatHeredoc { format_name, .. } = pattern else {
79            return None;
80        };
81
82        Some(Diagnostic {
83            severity: Severity::Warning,
84            pattern: pattern.clone(),
85            message: format!("Heredoc declared inside format '{}'", format_name),
86            explanation: "Heredocs inside format declarations are often handled specially by the Perl interpreter and can be difficult to parse statically.".to_string(),
87            suggested_fix: Some("Consider moving the heredoc outside the format or using a simple string if possible.".to_string()),
88            references: vec!["perldoc perlform".to_string()],
89        })
90    }
91}
92
93// BEGIN-time heredoc detector
94struct BeginTimeHeredocDetector;
95
96/// Pattern for identifying BEGIN block openings
97static BEGIN_BLOCK_START_PATTERN: LazyLock<Regex> =
98    LazyLock::new(|| match Regex::new(r"\bBEGIN\s*\{") {
99        Ok(re) => re,
100        Err(_) => unreachable!("BEGIN_BLOCK_START_PATTERN regex failed to compile"),
101    });
102
/// Finds the `}` that balances the first `{` seen at or after `opening_brace_idx`.
///
/// The scan walks bytes starting at `opening_brace_idx`. Braces inside single- or
/// double-quoted runs are ignored, and inside a quoted run a backslash hides the
/// byte that follows it. Returns the byte index of the balancing `}`, or `None`
/// when the text ends first, when a `}` appears before any `{`, or when the start
/// index lies past the end of `code`.
fn find_matching_brace(code: &str, opening_brace_idx: usize) -> Option<usize> {
    /// Lexical context of the scanner.
    #[derive(Clone, Copy)]
    enum Scan {
        /// Outside any quoted run.
        Code,
        /// Inside a quoted run closed by the stored quote byte.
        Quoted(u8),
    }

    let tail = code.as_bytes().get(opening_brace_idx..)?;

    let mut state = Scan::Code;
    let mut skip_next = false;
    let mut open_braces = 0usize;

    for (rel, &byte) in tail.iter().enumerate() {
        if skip_next {
            skip_next = false;
            continue;
        }

        match (state, byte) {
            (Scan::Quoted(_), b'\\') => skip_next = true,
            (Scan::Quoted(quote), _) if byte == quote => state = Scan::Code,
            (Scan::Quoted(_), _) => {}
            (Scan::Code, b'\'' | b'"') => state = Scan::Quoted(byte),
            (Scan::Code, b'{') => open_braces += 1,
            (Scan::Code, b'}') => {
                // A closer with nothing open means the input is unbalanced.
                open_braces = open_braces.checked_sub(1)?;
                if open_braces == 0 {
                    return Some(opening_brace_idx + rel);
                }
            }
            (Scan::Code, _) => {}
        }
    }

    None
}
155
156impl PatternDetector for BeginTimeHeredocDetector {
157    fn detect(
158        &self,
159        code: &str,
160        offset: usize,
161        line_starts: &[usize],
162    ) -> Vec<(AntiPattern, Location)> {
163        let mut results = Vec::new();
164        let scan_code = mask_non_code_regions(code);
165
166        for begin_match in BEGIN_BLOCK_START_PATTERN.find_iter(&scan_code) {
167            let Some(opening_brace_rel) = begin_match.as_str().rfind('{') else {
168                continue;
169            };
170            let opening_brace_idx = begin_match.start() + opening_brace_rel;
171            let Some(closing_brace_idx) = find_matching_brace(&scan_code, opening_brace_idx) else {
172                continue;
173            };
174            let block_content = &scan_code[opening_brace_idx + 1..closing_brace_idx];
175
176            if !block_content.contains("<<") {
177                continue;
178            }
179
180            let location = location_from_start(line_starts, offset, begin_match.start());
181
182            results.push((
183                AntiPattern::BeginTimeHeredoc {
184                    location: location.clone(),
185                    heredoc_content: block_content.to_string(),
186                    side_effects: vec!["Phase-dependent parsing".to_string()],
187                },
188                location,
189            ));
190        }
191
192        results
193    }
194
195    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
196        if let AntiPattern::BeginTimeHeredoc { .. } = pattern {
197            Some(Diagnostic {
198                severity: Severity::Error,
199                pattern: pattern.clone(),
200                message: "Heredoc declared during BEGIN-time".to_string(),
201                explanation: "Heredocs declared inside BEGIN blocks are evaluated during the compilation phase. This can lead to complex side effects that are difficult to track statically.".to_string(),
202                suggested_fix: Some("Move the heredoc declaration out of the BEGIN block if it doesn't need to be evaluated during compilation.".to_string()),
203                references: vec!["perldoc perlmod".to_string()],
204            })
205        } else {
206            None
207        }
208    }
209}
210
211// Dynamic delimiter detector
212struct DynamicDelimiterDetector;
213
214/// Pattern for identifying dynamic heredoc delimiters
215static DYNAMIC_DELIMITER_PATTERN: LazyLock<Regex> =
216    LazyLock::new(|| match Regex::new(r"<<\s*\$\{[^}]+\}|<<\s*\$\w+|<<\s*`[^`]+`") {
217        Ok(re) => re,
218        Err(_) => unreachable!("DYNAMIC_DELIMITER_PATTERN regex failed to compile"),
219    });
220
221impl PatternDetector for DynamicDelimiterDetector {
222    fn detect(
223        &self,
224        code: &str,
225        offset: usize,
226        line_starts: &[usize],
227    ) -> Vec<(AntiPattern, Location)> {
228        let mut results = Vec::new();
229        let scan_code = mask_non_code_regions(code);
230
231        for cap in DYNAMIC_DELIMITER_PATTERN.captures_iter(&scan_code) {
232            if let Some(match_pos) = cap.get(0) {
233                let expression = match_pos.as_str().to_string();
234                let location = location_from_start(line_starts, offset, match_pos.start());
235
236                results.push((
237                    AntiPattern::DynamicHeredocDelimiter { location: location.clone(), expression },
238                    location,
239                ));
240            }
241        }
242
243        results
244    }
245
246    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
247        let AntiPattern::DynamicHeredocDelimiter { expression, .. } = pattern else {
248            return None;
249        };
250
251        Some(Diagnostic {
252            severity: Severity::Warning,
253            pattern: pattern.clone(),
254            message: format!("Dynamic heredoc delimiter: {}", expression),
255            explanation: "Using variables or expressions as heredoc delimiters makes it impossible to know the terminator without executing the code.".to_string(),
256            suggested_fix: Some("Use a literal string as the heredoc terminator.".to_string()),
257            references: vec!["perldoc perlop".to_string()],
258        })
259    }
260}
261
262// Source filter detector
263struct SourceFilterDetector;
264
265/// Pattern for identifying common source filter modules
266static SOURCE_FILTER_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
267    match Regex::new(r"use\s+Filter::(Simple|Util::Call|cpp|exec|sh|decrypt|tee)") {
268        Ok(re) => re,
269        Err(_) => unreachable!("SOURCE_FILTER_PATTERN regex failed to compile"),
270    }
271});
272
273impl PatternDetector for SourceFilterDetector {
274    fn detect(
275        &self,
276        code: &str,
277        offset: usize,
278        line_starts: &[usize],
279    ) -> Vec<(AntiPattern, Location)> {
280        let mut results = Vec::new();
281        let scan_code = mask_non_code_regions(code);
282
283        for cap in SOURCE_FILTER_PATTERN.captures_iter(&scan_code) {
284            if let (Some(match_pos), Some(module_match)) = (cap.get(0), cap.get(1)) {
285                let filter_module = module_match.as_str().to_string();
286                let location = location_from_start(line_starts, offset, match_pos.start());
287
288                results.push((
289                    AntiPattern::SourceFilterHeredoc {
290                        location: location.clone(),
291                        module: filter_module,
292                    },
293                    location,
294                ));
295            }
296        }
297
298        results
299    }
300
301    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
302        let AntiPattern::SourceFilterHeredoc { module, .. } = pattern else {
303            return None;
304        };
305
306        Some(Diagnostic {
307            severity: Severity::Error,
308            pattern: pattern.clone(),
309            message: format!("Source filter detected: Filter::{}", module),
310            explanation: "Source filters rewrite the source code before it's parsed. Static analysis cannot reliably predict the state of the code after filtering.".to_string(),
311            suggested_fix: Some("Avoid using source filters. They are considered problematic and often replaced by better alternatives like Devel::Declare or modern Perl features.".to_string()),
312            references: vec!["perldoc Filter::Simple".to_string()],
313        })
314    }
315}
316
317// Regex heredoc detector
318struct RegexHeredocDetector;
319
320/// Pattern for identifying heredocs inside regex code blocks
321static REGEX_HEREDOC_PATTERN: LazyLock<Regex> =
322    LazyLock::new(|| match Regex::new(r"\(\?\{[^}]*<<[^}]*\}") {
323        Ok(re) => re,
324        Err(_) => unreachable!("REGEX_HEREDOC_PATTERN regex failed to compile"),
325    });
326
327impl PatternDetector for RegexHeredocDetector {
328    fn detect(
329        &self,
330        code: &str,
331        offset: usize,
332        line_starts: &[usize],
333    ) -> Vec<(AntiPattern, Location)> {
334        let mut results = Vec::new();
335        let scan_code = mask_non_code_regions(code);
336
337        for cap in REGEX_HEREDOC_PATTERN.captures_iter(&scan_code) {
338            if let Some(match_pos) = cap.get(0) {
339                let location = location_from_start(line_starts, offset, match_pos.start());
340
341                results.push((
342                    AntiPattern::RegexCodeBlockHeredoc { location: location.clone() },
343                    location,
344                ));
345            }
346        }
347
348        results
349    }
350
351    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
352        if let AntiPattern::RegexCodeBlockHeredoc { .. } = pattern {
353            Some(Diagnostic {
354                severity: Severity::Warning,
355                pattern: pattern.clone(),
356                message: "Heredoc inside regex code block".to_string(),
357                explanation: "Declaring heredocs inside (?{ ... }) or (??{ ... }) blocks is extremely rare and difficult to parse correctly.".to_string(),
358                suggested_fix: None,
359                references: vec!["perldoc perlre".to_string()],
360            })
361        } else {
362            None
363        }
364    }
365}
366
367// Eval heredoc detector
368struct EvalHeredocDetector;
369
370/// Pattern for identifying heredocs inside eval strings
371static EVAL_HEREDOC_PATTERN: LazyLock<Regex> =
372    LazyLock::new(|| match Regex::new(r#"eval\s+(?:'[^']*<<[^']*'|"[^"]*<<[^"]*")"#) {
373        Ok(re) => re,
374        Err(_) => unreachable!("EVAL_HEREDOC_PATTERN regex failed to compile"),
375    });
376
377impl PatternDetector for EvalHeredocDetector {
378    fn detect(
379        &self,
380        code: &str,
381        offset: usize,
382        line_starts: &[usize],
383    ) -> Vec<(AntiPattern, Location)> {
384        let mut results = Vec::new();
385
386        for cap in EVAL_HEREDOC_PATTERN.captures_iter(code) {
387            if let Some(match_pos) = cap.get(0) {
388                let location = location_from_start(line_starts, offset, match_pos.start());
389
390                results.push((
391                    AntiPattern::EvalStringHeredoc { location: location.clone() },
392                    location,
393                ));
394            }
395        }
396
397        results
398    }
399
400    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
401        if let AntiPattern::EvalStringHeredoc { .. } = pattern {
402            Some(Diagnostic {
403                severity: Severity::Warning,
404                pattern: pattern.clone(),
405                message: "Heredoc inside eval string".to_string(),
406                explanation: "Heredocs declared inside strings passed to eval require double parsing and can hide malicious or complex code.".to_string(),
407                suggested_fix: Some("Consider using a block eval or moving the heredoc outside the eval string.".to_string()),
408                references: vec!["perldoc -f eval".to_string()],
409            })
410        } else {
411            None
412        }
413    }
414}
415
416// Tied handle detector
417struct TiedHandleDetector;
418
419/// Pattern for identifying tie statements
420static TIE_PATTERN: LazyLock<Regex> = LazyLock::new(|| match Regex::new(r"tie\s+([*$]\w+)") {
421    Ok(re) => re,
422    Err(_) => unreachable!("TIE_PATTERN regex failed to compile"),
423});
424
425/// Pattern for identifying print statements that write heredocs to a handle.
426static PRINT_HEREDOC_PATTERN: LazyLock<Regex> =
427    LazyLock::new(|| match Regex::new(r"print\s+([*$]?\w+)\s+<<") {
428        Ok(re) => re,
429        Err(_) => unreachable!("PRINT_HEREDOC_PATTERN regex failed to compile"),
430    });
431
432impl PatternDetector for TiedHandleDetector {
433    fn detect(
434        &self,
435        code: &str,
436        offset: usize,
437        line_starts: &[usize],
438    ) -> Vec<(AntiPattern, Location)> {
439        let mut results = Vec::new();
440        let scan_code = mask_non_code_regions(code);
441
442        // First collect tied handles in normalized form:
443        // *FH -> FH, $fh -> $fh.
444        let mut tied_handles = HashSet::new();
445        for cap in TIE_PATTERN.captures_iter(&scan_code) {
446            if let Some(handle_match) = cap.get(1) {
447                let raw_handle = handle_match.as_str();
448                let normalized = raw_handle.strip_prefix('*').unwrap_or(raw_handle);
449                tied_handles.insert(normalized.to_string());
450            }
451        }
452
453        // Use a single static regex for all print-heredoc matches, then filter
454        // by whether the handle is in the tied set. This avoids O(n) Regex
455        // compilations (one per tied handle) and is faster for large files.
456        for cap in PRINT_HEREDOC_PATTERN.captures_iter(&scan_code) {
457            let (Some(match_pos), Some(handle_match)) = (cap.get(0), cap.get(1)) else {
458                continue;
459            };
460
461            let raw_print_handle = handle_match.as_str();
462            let normalized_print_handle =
463                raw_print_handle.strip_prefix('*').unwrap_or(raw_print_handle);
464
465            if tied_handles.contains(normalized_print_handle) {
466                let location = location_from_start(line_starts, offset, match_pos.start());
467                results.push((
468                    AntiPattern::TiedHandleHeredoc {
469                        location: location.clone(),
470                        handle_name: normalized_print_handle.to_string(),
471                    },
472                    location,
473                ));
474            }
475        }
476
477        results
478    }
479
480    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
481        let AntiPattern::TiedHandleHeredoc { handle_name, .. } = pattern else {
482            return None;
483        };
484
485        Some(Diagnostic {
486            severity: Severity::Info,
487            pattern: pattern.clone(),
488            message: format!("Heredoc written to tied handle '{}'", handle_name),
489            explanation: "Writing to a tied handle invokes custom code. The behavior of heredoc output depends on the tied class implementation.".to_string(),
490            suggested_fix: None,
491            references: vec!["perldoc -f tie".to_string()],
492        })
493    }
494}
495
496impl Default for AntiPatternDetector {
497    fn default() -> Self {
498        Self::new()
499    }
500}
501
502impl AntiPatternDetector {
503    /// Create a detector pre-loaded with all seven built-in pattern checkers.
504    pub fn new() -> Self {
505        Self {
506            patterns: vec![
507                Box::new(FormatHeredocDetector),
508                Box::new(BeginTimeHeredocDetector),
509                Box::new(DynamicDelimiterDetector),
510                Box::new(SourceFilterDetector),
511                Box::new(RegexHeredocDetector),
512                Box::new(EvalHeredocDetector),
513                Box::new(TiedHandleDetector),
514            ],
515        }
516    }
517
518    /// Run all pattern checkers against `code` and return diagnostics sorted by offset.
519    pub fn detect_all(&self, code: &str) -> Vec<Diagnostic> {
520        let mut diagnostics = Vec::new();
521        let line_starts = build_line_starts(code);
522
523        for detector in &self.patterns {
524            let patterns = detector.detect(code, 0, &line_starts);
525            for (pattern, _) in patterns {
526                if let Some(diagnostic) = detector.diagnose(&pattern) {
527                    diagnostics.push(diagnostic);
528                }
529            }
530        }
531
532        diagnostics.sort_by_key(|d| match &d.pattern {
533            AntiPattern::FormatHeredoc { location, .. }
534            | AntiPattern::BeginTimeHeredoc { location, .. }
535            | AntiPattern::DynamicHeredocDelimiter { location, .. }
536            | AntiPattern::SourceFilterHeredoc { location, .. }
537            | AntiPattern::RegexCodeBlockHeredoc { location, .. }
538            | AntiPattern::EvalStringHeredoc { location, .. }
539            | AntiPattern::TiedHandleHeredoc { location, .. } => location.offset,
540        });
541
542        diagnostics
543    }
544
545    /// Format a list of diagnostics as a human-readable plain-text report.
546    ///
547    /// Prints a header, a count, and one entry per diagnostic including its
548    /// severity, location, explanation, optional suggested fix, and references.
549    pub fn format_report(&self, diagnostics: &[Diagnostic]) -> String {
550        let mut report = String::from("Anti-Pattern Analysis Report\n");
551        report.push_str("============================\n\n");
552
553        if diagnostics.is_empty() {
554            report.push_str("No problematic patterns detected.\n");
555            return report;
556        }
557
558        report.push_str(&format!("Found {} problematic patterns:\n\n", diagnostics.len()));
559
560        for (i, diag) in diagnostics.iter().enumerate() {
561            report.push_str(&format!(
562                "{}. {} ({})\n",
563                i + 1,
564                diag.message,
565                match diag.severity {
566                    Severity::Error => "ERROR",
567                    Severity::Warning => "WARNING",
568                    Severity::Info => "INFO",
569                }
570            ));
571
572            report.push_str(&format!(
573                "   Location: {}\n",
574                match &diag.pattern {
575                    AntiPattern::FormatHeredoc { location, .. }
576                    | AntiPattern::BeginTimeHeredoc { location, .. }
577                    | AntiPattern::DynamicHeredocDelimiter { location, .. }
578                    | AntiPattern::SourceFilterHeredoc { location, .. }
579                    | AntiPattern::RegexCodeBlockHeredoc { location, .. }
580                    | AntiPattern::EvalStringHeredoc { location, .. }
581                    | AntiPattern::TiedHandleHeredoc { location, .. } =>
582                        format!("line {}, column {}", location.line, location.column),
583                }
584            ));
585
586            report.push_str(&format!("   Explanation: {}\n", diag.explanation));
587
588            if let Some(fix) = &diag.suggested_fix {
589                report.push_str(&format!(
590                    "   Suggested fix:\n     {}\n",
591                    fix.lines().collect::<Vec<_>>().join("\n     ")
592                ));
593            }
594
595            if !diag.references.is_empty() {
596                report.push_str(&format!("   References: {}\n", diag.references.join(", ")));
597            }
598
599            report.push('\n');
600        }
601
602        report
603    }
604}
605
606#[cfg(test)]
607mod tests;