Skip to main content

perl_heredoc_anti_patterns/
lib.rs

1//! Anti-pattern detection for heredoc edge cases
2//!
3//! This crate provides detection and analysis of problematic Perl patterns
4//! that make static parsing difficult or impossible, particularly around heredocs.
5//!
6//! The [`AntiPatternDetector`] scans Perl source for seven categories of
7//! heredoc-related anti-patterns and produces [`Diagnostic`]s describing each
8//! finding, with severity, explanation, suggested fix, and documentation
9//! references.
10
11use regex::Regex;
12use std::sync::LazyLock;
13
/// Position of a finding inside the scanned source text.
#[derive(Debug, Clone, PartialEq)]
pub struct Location {
    /// Line of the match start, derived by the detector from the text that
    /// precedes the match.
    pub line: usize,
    /// Column of the match start, measured in bytes relative to the most
    /// recent newline before the match.
    pub column: usize,
    /// Byte index of the match start within the scanned text, plus the base
    /// offset that was handed to the detector.
    pub offset: usize,
}
20
/// How serious a reported finding is.
#[derive(Debug, Clone, PartialEq)]
pub enum Severity {
    /// Code will likely fail.
    Error,
    /// Code works but is problematic.
    Warning,
    /// Code could be improved.
    Info,
}
27
/// One detected heredoc-related anti-pattern together with the data captured
/// for it. Every variant carries the [`Location`] of the match.
#[derive(Debug, Clone, PartialEq)]
pub enum AntiPattern {
    /// A `format NAME =` declaration whose body contains `<<`.
    /// `format_name` is the captured format name; `heredoc_delimiter` is
    /// filled in by the detector.
    FormatHeredoc { location: Location, format_name: String, heredoc_delimiter: String },
    /// A `BEGIN { ... }` block containing `<<`. `heredoc_content` holds the
    /// text captured between the braces; `side_effects` lists notes attached
    /// by the detector.
    BeginTimeHeredoc { location: Location, heredoc_content: String, side_effects: Vec<String> },
    /// A `<<` followed by a variable, `${...}` expression, or backtick command.
    /// `expression` is the matched text, including the leading `<<`.
    DynamicHeredocDelimiter { location: Location, expression: String },
    /// A `use Filter::...` statement. `module` is the part of the module name
    /// after `Filter::`.
    SourceFilterHeredoc { location: Location, module: String },
    /// A regex code block that contains `<<`.
    RegexCodeBlockHeredoc { location: Location },
    /// An `eval` of a quoted string that contains `<<`.
    EvalStringHeredoc { location: Location },
    /// A `print HANDLE <<` where `HANDLE` also appears in a `tie` statement.
    /// `handle_name` is the handle as it is written in the `print`.
    TiedHandleHeredoc { location: Location, handle_name: String },
}
38
/// A human-oriented description of a single [`AntiPattern`] finding.
#[derive(Debug, Clone, PartialEq)]
pub struct Diagnostic {
    /// Seriousness assigned by the detector that produced the finding.
    pub severity: Severity,
    /// The finding this diagnostic describes (a clone of the detected value).
    pub pattern: AntiPattern,
    /// Short headline for the finding.
    pub message: String,
    /// Longer text explaining why the pattern is a problem.
    pub explanation: String,
    /// Advice on how to rewrite the code, when the detector has any.
    pub suggested_fix: Option<String>,
    /// Documentation pointers (for example `perldoc` pages).
    pub references: Vec<String>,
}
48
/// Runs a collection of pattern detectors over Perl source text and turns
/// their findings into [`Diagnostic`]s.
pub struct AntiPatternDetector {
    /// The registered detectors, consulted in order.
    patterns: Vec<Box<dyn PatternDetector>>,
}
52
/// Internal interface implemented by each individual anti-pattern detector.
trait PatternDetector: Send + Sync {
    /// Scans `code` and returns every finding paired with its location.
    ///
    /// `offset` is a base value that implementations add to the match position
    /// when filling in `Location::offset`.
    fn detect(&self, code: &str, offset: usize) -> Vec<(AntiPattern, Location)>;

    /// Builds the diagnostic for `pattern`, or returns `None` when `pattern`
    /// is not the variant this detector is responsible for.
    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic>;
}
57
58// Format heredoc detector
59struct FormatHeredocDetector;
60
61/// Pattern for identifying format declarations
62static FORMAT_PATTERN: LazyLock<Regex> =
63    LazyLock::new(|| match Regex::new(r"(?m)^\s*format\s+(\w+)\s*=\s*$") {
64        Ok(re) => re,
65        Err(_) => unreachable!("FORMAT_PATTERN regex failed to compile"),
66    });
67
68impl PatternDetector for FormatHeredocDetector {
69    fn detect(&self, code: &str, offset: usize) -> Vec<(AntiPattern, Location)> {
70        let mut results = Vec::new();
71
72        for cap in FORMAT_PATTERN.captures_iter(code) {
73            if let (Some(match_pos), Some(name_match)) = (cap.get(0), cap.get(1)) {
74                let format_name = name_match.as_str().to_string();
75                let location = Location {
76                    line: code[..match_pos.start()].lines().count(),
77                    column: match_pos.start() - code[..match_pos.start()].rfind('\n').unwrap_or(0),
78                    offset: offset + match_pos.start(),
79                };
80
81                // Look for heredoc marker inside format body (simplified)
82                let body_start = match_pos.end();
83                let body_end = code[body_start..].find("\n.").unwrap_or(code.len() - body_start);
84                let body = &code[body_start..body_start + body_end];
85
86                if body.contains("<<") {
87                    results.push((
88                        AntiPattern::FormatHeredoc {
89                            location: location.clone(),
90                            format_name,
91                            heredoc_delimiter: "UNKNOWN".to_string(), // Would need better extraction
92                        },
93                        location,
94                    ));
95                }
96            }
97        }
98
99        results
100    }
101
102    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
103        let AntiPattern::FormatHeredoc { format_name, .. } = pattern else {
104            return None;
105        };
106
107        Some(Diagnostic {
108            severity: Severity::Warning,
109            pattern: pattern.clone(),
110            message: format!("Heredoc declared inside format '{}'", format_name),
111            explanation: "Heredocs inside format declarations are often handled specially by the Perl interpreter and can be difficult to parse statically.".to_string(),
112            suggested_fix: Some("Consider moving the heredoc outside the format or using a simple string if possible.".to_string()),
113            references: vec!["perldoc perlform".to_string()],
114        })
115    }
116}
117
118// BEGIN-time heredoc detector
119struct BeginTimeHeredocDetector;
120
121/// Pattern for identifying BEGIN blocks with heredocs
122static BEGIN_BLOCK_PATTERN: LazyLock<Regex> =
123    LazyLock::new(|| match Regex::new(r"(?s)\bBEGIN\s*\{([^}]*<<[^}]*)\}") {
124        Ok(re) => re,
125        Err(_) => unreachable!("BEGIN_BLOCK_PATTERN regex failed to compile"),
126    });
127
128impl PatternDetector for BeginTimeHeredocDetector {
129    fn detect(&self, code: &str, offset: usize) -> Vec<(AntiPattern, Location)> {
130        let mut results = Vec::new();
131
132        for cap in BEGIN_BLOCK_PATTERN.captures_iter(code) {
133            if let (Some(match_pos), Some(content_match)) = (cap.get(0), cap.get(1)) {
134                let block_content = content_match.as_str();
135                let location = Location {
136                    line: code[..match_pos.start()].lines().count(),
137                    column: match_pos.start() - code[..match_pos.start()].rfind('\n').unwrap_or(0),
138                    offset: offset + match_pos.start(),
139                };
140
141                results.push((
142                    AntiPattern::BeginTimeHeredoc {
143                        location: location.clone(),
144                        heredoc_content: block_content.to_string(),
145                        side_effects: vec!["Phase-dependent parsing".to_string()],
146                    },
147                    location,
148                ));
149            }
150        }
151
152        results
153    }
154
155    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
156        if let AntiPattern::BeginTimeHeredoc { .. } = pattern {
157            Some(Diagnostic {
158                severity: Severity::Error,
159                pattern: pattern.clone(),
160                message: "Heredoc declared during BEGIN-time".to_string(),
161                explanation: "Heredocs declared inside BEGIN blocks are evaluated during the compilation phase. This can lead to complex side effects that are difficult to track statically.".to_string(),
162                suggested_fix: Some("Move the heredoc declaration out of the BEGIN block if it doesn't need to be evaluated during compilation.".to_string()),
163                references: vec!["perldoc perlmod".to_string()],
164            })
165        } else {
166            None
167        }
168    }
169}
170
171// Dynamic delimiter detector
172struct DynamicDelimiterDetector;
173
174/// Pattern for identifying dynamic heredoc delimiters
175static DYNAMIC_DELIMITER_PATTERN: LazyLock<Regex> =
176    LazyLock::new(|| match Regex::new(r"<<\s*\$\{[^}]+\}|<<\s*\$\w+|<<\s*`[^`]+`") {
177        Ok(re) => re,
178        Err(_) => unreachable!("DYNAMIC_DELIMITER_PATTERN regex failed to compile"),
179    });
180
181impl PatternDetector for DynamicDelimiterDetector {
182    fn detect(&self, code: &str, offset: usize) -> Vec<(AntiPattern, Location)> {
183        let mut results = Vec::new();
184
185        for cap in DYNAMIC_DELIMITER_PATTERN.captures_iter(code) {
186            if let Some(match_pos) = cap.get(0) {
187                let expression = match_pos.as_str().to_string();
188                let location = Location {
189                    line: code[..match_pos.start()].lines().count(),
190                    column: match_pos.start() - code[..match_pos.start()].rfind('\n').unwrap_or(0),
191                    offset: offset + match_pos.start(),
192                };
193
194                results.push((
195                    AntiPattern::DynamicHeredocDelimiter { location: location.clone(), expression },
196                    location,
197                ));
198            }
199        }
200
201        results
202    }
203
204    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
205        let AntiPattern::DynamicHeredocDelimiter { expression, .. } = pattern else {
206            return None;
207        };
208
209        Some(Diagnostic {
210            severity: Severity::Warning,
211            pattern: pattern.clone(),
212            message: format!("Dynamic heredoc delimiter: {}", expression),
213            explanation: "Using variables or expressions as heredoc delimiters makes it impossible to know the terminator without executing the code.".to_string(),
214            suggested_fix: Some("Use a literal string as the heredoc terminator.".to_string()),
215            references: vec!["perldoc perlop".to_string()],
216        })
217    }
218}
219
220// Source filter detector
221struct SourceFilterDetector;
222
223/// Pattern for identifying common source filter modules
224static SOURCE_FILTER_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
225    match Regex::new(r"use\s+Filter::(Simple|Util::Call|cpp|exec|sh|decrypt|tee)") {
226        Ok(re) => re,
227        Err(_) => unreachable!("SOURCE_FILTER_PATTERN regex failed to compile"),
228    }
229});
230
231impl PatternDetector for SourceFilterDetector {
232    fn detect(&self, code: &str, offset: usize) -> Vec<(AntiPattern, Location)> {
233        let mut results = Vec::new();
234
235        for cap in SOURCE_FILTER_PATTERN.captures_iter(code) {
236            if let (Some(match_pos), Some(module_match)) = (cap.get(0), cap.get(1)) {
237                let filter_module = module_match.as_str().to_string();
238                let location = Location {
239                    line: code[..match_pos.start()].lines().count(),
240                    column: match_pos.start() - code[..match_pos.start()].rfind('\n').unwrap_or(0),
241                    offset: offset + match_pos.start(),
242                };
243
244                results.push((
245                    AntiPattern::SourceFilterHeredoc {
246                        location: location.clone(),
247                        module: filter_module,
248                    },
249                    location,
250                ));
251            }
252        }
253
254        results
255    }
256
257    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
258        let AntiPattern::SourceFilterHeredoc { module, .. } = pattern else {
259            return None;
260        };
261
262        Some(Diagnostic {
263            severity: Severity::Error,
264            pattern: pattern.clone(),
265            message: format!("Source filter detected: Filter::{}", module),
266            explanation: "Source filters rewrite the source code before it's parsed. Static analysis cannot reliably predict the state of the code after filtering.".to_string(),
267            suggested_fix: Some("Avoid using source filters. They are considered problematic and often replaced by better alternatives like Devel::Declare or modern Perl features.".to_string()),
268            references: vec!["perldoc Filter::Simple".to_string()],
269        })
270    }
271}
272
273// Regex heredoc detector
274struct RegexHeredocDetector;
275
276/// Pattern for identifying heredocs inside regex code blocks
277static REGEX_HEREDOC_PATTERN: LazyLock<Regex> =
278    LazyLock::new(|| match Regex::new(r"\(\?\{[^}]*<<[^}]*\}") {
279        Ok(re) => re,
280        Err(_) => unreachable!("REGEX_HEREDOC_PATTERN regex failed to compile"),
281    });
282
283impl PatternDetector for RegexHeredocDetector {
284    fn detect(&self, code: &str, offset: usize) -> Vec<(AntiPattern, Location)> {
285        let mut results = Vec::new();
286
287        for cap in REGEX_HEREDOC_PATTERN.captures_iter(code) {
288            if let Some(match_pos) = cap.get(0) {
289                let location = Location {
290                    line: code[..match_pos.start()].lines().count(),
291                    column: match_pos.start() - code[..match_pos.start()].rfind('\n').unwrap_or(0),
292                    offset: offset + match_pos.start(),
293                };
294
295                results.push((
296                    AntiPattern::RegexCodeBlockHeredoc { location: location.clone() },
297                    location,
298                ));
299            }
300        }
301
302        results
303    }
304
305    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
306        if let AntiPattern::RegexCodeBlockHeredoc { .. } = pattern {
307            Some(Diagnostic {
308                severity: Severity::Warning,
309                pattern: pattern.clone(),
310                message: "Heredoc inside regex code block".to_string(),
311                explanation: "Declaring heredocs inside (?{ ... }) or (??{ ... }) blocks is extremely rare and difficult to parse correctly.".to_string(),
312                suggested_fix: None,
313                references: vec!["perldoc perlre".to_string()],
314            })
315        } else {
316            None
317        }
318    }
319}
320
321// Eval heredoc detector
322struct EvalHeredocDetector;
323
324/// Pattern for identifying heredocs inside eval strings
325static EVAL_HEREDOC_PATTERN: LazyLock<Regex> =
326    LazyLock::new(|| match Regex::new(r#"eval\s+(?:'[^']*<<[^']*'|"[^"]*<<[^"]*")"#) {
327        Ok(re) => re,
328        Err(_) => unreachable!("EVAL_HEREDOC_PATTERN regex failed to compile"),
329    });
330
331impl PatternDetector for EvalHeredocDetector {
332    fn detect(&self, code: &str, offset: usize) -> Vec<(AntiPattern, Location)> {
333        let mut results = Vec::new();
334
335        for cap in EVAL_HEREDOC_PATTERN.captures_iter(code) {
336            if let Some(match_pos) = cap.get(0) {
337                let location = Location {
338                    line: code[..match_pos.start()].lines().count(),
339                    column: match_pos.start() - code[..match_pos.start()].rfind('\n').unwrap_or(0),
340                    offset: offset + match_pos.start(),
341                };
342
343                results.push((
344                    AntiPattern::EvalStringHeredoc { location: location.clone() },
345                    location,
346                ));
347            }
348        }
349
350        results
351    }
352
353    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
354        if let AntiPattern::EvalStringHeredoc { .. } = pattern {
355            Some(Diagnostic {
356                severity: Severity::Warning,
357                pattern: pattern.clone(),
358                message: "Heredoc inside eval string".to_string(),
359                explanation: "Heredocs declared inside strings passed to eval require double parsing and can hide malicious or complex code.".to_string(),
360                suggested_fix: Some("Consider using a block eval or moving the heredoc outside the eval string.".to_string()),
361                references: vec!["perldoc -f eval".to_string()],
362            })
363        } else {
364            None
365        }
366    }
367}
368
369// Tied handle detector
370struct TiedHandleDetector;
371
372/// Pattern for identifying tie statements
373static TIE_PATTERN: LazyLock<Regex> = LazyLock::new(|| match Regex::new(r"tie\s+([*$]\w+)") {
374    Ok(re) => re,
375    Err(_) => unreachable!("TIE_PATTERN regex failed to compile"),
376});
377
378impl PatternDetector for TiedHandleDetector {
379    fn detect(&self, code: &str, offset: usize) -> Vec<(AntiPattern, Location)> {
380        let mut results = Vec::new();
381
382        // First find tied handles
383        let mut tied_handles = Vec::new();
384        for cap in TIE_PATTERN.captures_iter(code) {
385            if let Some(handle_match) = cap.get(1) {
386                tied_handles.push(handle_match.as_str());
387            }
388        }
389
390        for raw_handle in tied_handles {
391            // If it's a glob (*FH), we typically print to the bare handle (FH).
392            // If it's a scalar ($fh), we print to the scalar ($fh).
393            let handle_to_search = raw_handle.strip_prefix('*').unwrap_or(raw_handle);
394
395            // Look for usage of this handle with heredoc
396            let usage_pattern = format!(r"print\s+{}\s+<<", regex::escape(handle_to_search));
397            if let Ok(re) = Regex::new(&usage_pattern)
398                && let Some(usage_match) = re.find(code)
399            {
400                let location = Location {
401                    line: code[..usage_match.start()].lines().count(),
402                    column: usage_match.start()
403                        - code[..usage_match.start()].rfind('\n').unwrap_or(0),
404                    offset: offset + usage_match.start(),
405                };
406
407                results.push((
408                    AntiPattern::TiedHandleHeredoc {
409                        location: location.clone(),
410                        handle_name: handle_to_search.to_string(),
411                    },
412                    location,
413                ));
414            }
415        }
416
417        results
418    }
419
420    fn diagnose(&self, pattern: &AntiPattern) -> Option<Diagnostic> {
421        let AntiPattern::TiedHandleHeredoc { handle_name, .. } = pattern else {
422            return None;
423        };
424
425        Some(Diagnostic {
426            severity: Severity::Info,
427            pattern: pattern.clone(),
428            message: format!("Heredoc written to tied handle '{}'", handle_name),
429            explanation: "Writing to a tied handle invokes custom code. The behavior of heredoc output depends on the tied class implementation.".to_string(),
430            suggested_fix: None,
431            references: vec!["perldoc -f tie".to_string()],
432        })
433    }
434}
435
436impl Default for AntiPatternDetector {
437    fn default() -> Self {
438        Self::new()
439    }
440}
441
442impl AntiPatternDetector {
443    pub fn new() -> Self {
444        Self {
445            patterns: vec![
446                Box::new(FormatHeredocDetector),
447                Box::new(BeginTimeHeredocDetector),
448                Box::new(DynamicDelimiterDetector),
449                Box::new(SourceFilterDetector),
450                Box::new(RegexHeredocDetector),
451                Box::new(EvalHeredocDetector),
452                Box::new(TiedHandleDetector),
453            ],
454        }
455    }
456
457    pub fn detect_all(&self, code: &str) -> Vec<Diagnostic> {
458        let mut diagnostics = Vec::new();
459
460        for detector in &self.patterns {
461            let patterns = detector.detect(code, 0);
462            for (pattern, _) in patterns {
463                if let Some(diagnostic) = detector.diagnose(&pattern) {
464                    diagnostics.push(diagnostic);
465                }
466            }
467        }
468
469        diagnostics.sort_by_key(|d| match &d.pattern {
470            AntiPattern::FormatHeredoc { location, .. }
471            | AntiPattern::BeginTimeHeredoc { location, .. }
472            | AntiPattern::DynamicHeredocDelimiter { location, .. }
473            | AntiPattern::SourceFilterHeredoc { location, .. }
474            | AntiPattern::RegexCodeBlockHeredoc { location, .. }
475            | AntiPattern::EvalStringHeredoc { location, .. }
476            | AntiPattern::TiedHandleHeredoc { location, .. } => location.offset,
477        });
478
479        diagnostics
480    }
481
482    pub fn format_report(&self, diagnostics: &[Diagnostic]) -> String {
483        let mut report = String::from("Anti-Pattern Analysis Report\n");
484        report.push_str("============================\n\n");
485
486        if diagnostics.is_empty() {
487            report.push_str("No problematic patterns detected.\n");
488            return report;
489        }
490
491        report.push_str(&format!("Found {} problematic patterns:\n\n", diagnostics.len()));
492
493        for (i, diag) in diagnostics.iter().enumerate() {
494            report.push_str(&format!(
495                "{}. {} ({})\n",
496                i + 1,
497                diag.message,
498                match diag.severity {
499                    Severity::Error => "ERROR",
500                    Severity::Warning => "WARNING",
501                    Severity::Info => "INFO",
502                }
503            ));
504
505            report.push_str(&format!(
506                "   Location: {}\n",
507                match &diag.pattern {
508                    AntiPattern::FormatHeredoc { location, .. }
509                    | AntiPattern::BeginTimeHeredoc { location, .. }
510                    | AntiPattern::DynamicHeredocDelimiter { location, .. }
511                    | AntiPattern::SourceFilterHeredoc { location, .. }
512                    | AntiPattern::RegexCodeBlockHeredoc { location, .. }
513                    | AntiPattern::EvalStringHeredoc { location, .. }
514                    | AntiPattern::TiedHandleHeredoc { location, .. } =>
515                        format!("line {}, column {}", location.line, location.column),
516                }
517            ));
518
519            report.push_str(&format!("   Explanation: {}\n", diag.explanation));
520
521            if let Some(fix) = &diag.suggested_fix {
522                report.push_str(&format!(
523                    "   Suggested fix:\n     {}\n",
524                    fix.lines().collect::<Vec<_>>().join("\n     ")
525                ));
526            }
527
528            if !diag.references.is_empty() {
529                report.push_str(&format!("   References: {}\n", diag.references.join(", ")));
530            }
531
532            report.push('\n');
533        }
534
535        report
536    }
537}
538
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs a freshly built detector over `code` and returns its diagnostics.
    fn scan(code: &str) -> Vec<Diagnostic> {
        AntiPatternDetector::new().detect_all(code)
    }

    #[test]
    fn test_format_heredoc_detection() {
        let source = r#"
format REPORT =
<<'END'
Name: @<<<<<<<<<<<<
$name
END
.
"#;

        let found = scan(source);
        // The dynamic-delimiter detector may also flag a `<<` inside the format
        // body (a false positive), so the count is not pinned here. The format
        // finding starts at the declaration and therefore sorts first.
        assert!(matches!(
            found.first(),
            Some(Diagnostic { pattern: AntiPattern::FormatHeredoc { .. }, .. })
        ));
    }

    #[test]
    fn test_begin_heredoc_detection() {
        let source = r###"
BEGIN {
    $config = <<'END';
    server = localhost
END
}
"###;

        let found = scan(source);
        assert!(matches!(
            found.as_slice(),
            [Diagnostic { pattern: AntiPattern::BeginTimeHeredoc { .. }, .. }]
        ));
    }

    #[test]
    fn test_dynamic_delimiter_detection() {
        let source = r###"
my $delimiter = "EOF";
my $content = <<$delimiter;
This is dynamic
EOF
"###;

        let found = scan(source);
        assert!(matches!(
            found.as_slice(),
            [Diagnostic { pattern: AntiPattern::DynamicHeredocDelimiter { .. }, .. }]
        ));
    }

    #[test]
    fn test_source_filter_detection() {
        let source = r###"
use Filter::Simple;
print <<EOF;
Filtered content
EOF
"###;

        let found = scan(source);
        assert!(matches!(
            found.as_slice(),
            [Diagnostic { pattern: AntiPattern::SourceFilterHeredoc { .. }, .. }]
        ));
    }

    #[test]
    fn test_regex_heredoc_detection() {
        let source = r###"
m/pattern(?{
    print <<'MATCH';
    Match text
MATCH
})/
"###;

        let found = scan(source);
        assert!(matches!(
            found.as_slice(),
            [Diagnostic { pattern: AntiPattern::RegexCodeBlockHeredoc { .. }, .. }]
        ));
    }

    #[test]
    fn test_eval_heredoc_detection() {
        let source = r###"
eval 'print <<"EVAL";
Eval content
EVAL';
"###;

        let found = scan(source);
        assert!(matches!(
            found.as_slice(),
            [Diagnostic { pattern: AntiPattern::EvalStringHeredoc { .. }, .. }]
        ));
    }

    #[test]
    fn test_tied_handle_detection() {
        let source = r###"
tie *FH, 'Tie::Handle';
print FH <<'DATA';
Tied output
DATA
"###;

        let found = scan(source);
        assert!(matches!(
            found.as_slice(),
            [Diagnostic { pattern: AntiPattern::TiedHandleHeredoc { .. }, .. }]
        ));
    }

    #[test]
    fn test_tied_scalar_handle_detection() {
        let source = r###"
tie $fh, 'Tie::Handle';
print $fh <<'DATA';
Tied output
DATA
"###;

        let found = scan(source);
        assert!(matches!(
            found.as_slice(),
            [Diagnostic { pattern: AntiPattern::TiedHandleHeredoc { .. }, .. }]
        ));
    }
}