Skip to main content

perl_parser_core/syntax/
source_file.rs

//! Shared Perl source-file classification helpers.
//!
//! These helpers provide one canonical definition for what constitutes a Perl
//! source file across workspace discovery and runtime file operations.
5
6use std::path::Path;
7
/// Size, in bytes, of the leading window examined when sniffing for binary
/// content.
///
/// The window is 4 KB: intended to be large enough to cover the headers of
/// common binary formats (ELF, PE, ZIP, PNG, …) while staying cheap to scan.
const BINARY_PROBE_BYTES: usize = 4096;

/// Reports whether `text` looks like binary (non-text) data.
///
/// Only the first [`BINARY_PROBE_BYTES`] bytes of `text` are examined, and the
/// sole signal is the presence of a null byte (`\0`) inside that window. One
/// null byte is enough to return `true`; a null byte located after the window
/// is never seen and therefore does not affect the result. An empty string is
/// reported as text.
///
/// # Why null bytes?
///
/// - Cheap: a single bounded scan of at most 4 KB.
/// - Few false positives: Perl source virtually never contains `\0`.
/// - Few false negatives: headers of common binary formats (ELF, PE/COFF,
///   ZIP, PNG, …) contain `\0`.
#[must_use]
pub fn is_binary_content(text: &str) -> bool {
    let raw = text.as_bytes();
    // Clamp the window so inputs shorter than the probe size are scanned whole.
    let probe_len = raw.len().min(BINARY_PROBE_BYTES);
    raw[..probe_len].contains(&0)
}
31
/// The canonical set of file extensions treated as Perl source.
///
/// Entries are stored lowercase and without a leading dot. Besides the core
/// script/module extensions, the set covers common embedded-Perl template
/// formats: `.ep` (Mojolicious), `.tt`/`.tt2` (Template Toolkit), and `.mason`
/// (Mason/HTML::Mason).
pub const PERL_SOURCE_EXTENSIONS: [&str; 9] = [
    "pl", "pm", "t", "psgi", "cgi", "ep", "tt", "tt2", "mason",
];

/// Reports whether `extension` names a recognized Perl source extension.
///
/// A single leading dot, if present, is ignored, and the comparison against
/// [`PERL_SOURCE_EXTENSIONS`] is ASCII case-insensitive. Anything else —
/// including the empty string — yields `false`.
#[must_use]
pub fn is_perl_source_extension(extension: &str) -> bool {
    // Accept both "pl" and ".pl"; only one leading dot is removed.
    let bare = match extension.strip_prefix('.') {
        Some(rest) => rest,
        None => extension,
    };

    for &known in &PERL_SOURCE_EXTENSIONS {
        if known.eq_ignore_ascii_case(bare) {
            return true;
        }
    }
    false
}
49
50/// Returns `true` if `path` points to a recognized Perl source file.
51#[must_use]
52pub fn is_perl_source_path(path: &Path) -> bool {
53    path.extension().and_then(|ext| ext.to_str()).is_some_and(is_perl_source_extension)
54}
55
56/// Returns `true` if `uri` or path-like string points to a Perl source file.
57///
58/// Supports:
59/// - Plain filesystem paths
60/// - `file://` URIs
61/// - Optional query/fragment suffixes
62#[must_use]
63pub fn is_perl_source_uri(uri: &str) -> bool {
64    let without_fragment = uri.split('#').next().unwrap_or(uri);
65    let without_query = without_fragment.split('?').next().unwrap_or(without_fragment);
66    is_perl_source_path(Path::new(without_query))
67}
68
#[cfg(test)]
mod tests {
    use super::{
        BINARY_PROBE_BYTES, PERL_SOURCE_EXTENSIONS, is_binary_content, is_perl_source_extension,
        is_perl_source_path, is_perl_source_uri,
    };
    use std::path::Path;

    /// Asserts that `is_perl_source_extension` returns `expected` for every input.
    fn expect_extensions(expected: bool, inputs: &[&str]) {
        for &input in inputs {
            assert_eq!(is_perl_source_extension(input), expected, "extension {:?}", input);
        }
    }

    /// Asserts that `is_perl_source_path` returns `expected` for every input.
    fn expect_paths(expected: bool, inputs: &[&str]) {
        for &input in inputs {
            assert_eq!(is_perl_source_path(Path::new(input)), expected, "path {:?}", input);
        }
    }

    /// Asserts that `is_perl_source_uri` returns `expected` for every input.
    fn expect_uris(expected: bool, inputs: &[&str]) {
        for &input in inputs {
            assert_eq!(is_perl_source_uri(input), expected, "uri {:?}", input);
        }
    }

    #[test]
    fn exposes_expected_extension_set() {
        let expected = ["pl", "pm", "t", "psgi", "cgi", "ep", "tt", "tt2", "mason"];
        assert_eq!(PERL_SOURCE_EXTENSIONS, expected);
    }

    #[test]
    fn classifies_extensions_case_insensitively() {
        expect_extensions(true, &["pl", ".pm", "T", "PsGi", "cgi", ".CGI"]);
        expect_extensions(false, &["txt"]);
    }

    #[test]
    fn classifies_filesystem_paths() {
        expect_paths(
            true,
            &[
                "/workspace/script.pl",
                "/workspace/lib/Foo/Bar.PM",
                "/workspace/app.psgi",
                "/var/www/cgi-bin/form.cgi",
                "/var/www/cgi-bin/upload.CGI",
            ],
        );
        expect_paths(false, &["/workspace/README.md", "/workspace/no_extension"]);
    }

    #[test]
    fn classifies_uri_like_inputs() {
        expect_uris(
            true,
            &[
                "file:///workspace/script.pl",
                "file:///workspace/lib/Foo/Bar.pm",
                "file:///workspace/app.psgi",
                "file:///workspace/app.psgi?version=1#section",
                "file:///var/www/cgi-bin/form.cgi",
                "file:///var/www/cgi-bin/search.cgi?q=perl#results",
            ],
        );
        expect_uris(false, &["file:///workspace/README.md"]);
    }

    #[test]
    fn cgi_and_psgi_are_recognized() {
        // CGI scripts (.cgi) — web projects, Apache/Nginx CGI handlers
        expect_extensions(true, &["cgi", "CGI"]);
        expect_paths(true, &["/var/www/cgi-bin/form.cgi"]);
        expect_uris(true, &["file:///var/www/cgi-bin/form.cgi"]);

        // PSGI apps (.psgi) — Plack/PSGI applications
        expect_extensions(true, &["psgi", "PSGI"]);
        expect_paths(true, &["/workspace/app.psgi"]);
        expect_uris(true, &["file:///workspace/app.psgi"]);

        // Non-Perl extensions remain unrecognized
        expect_extensions(false, &["sh", "py"]);
    }

    #[test]
    fn template_extensions_are_recognized() {
        // .ep — Mojolicious embedded Perl templates
        expect_extensions(true, &["ep", "EP"]);
        expect_paths(true, &["/app/templates/index.html.ep"]);
        expect_uris(true, &["file:///app/templates/index.html.ep"]);

        // .tt — Template Toolkit templates (version 2 default)
        expect_extensions(true, &["tt", "TT"]);
        expect_paths(true, &["/app/templates/page.tt"]);
        expect_uris(true, &["file:///app/templates/page.tt"]);

        // .tt2 — Template Toolkit 2 explicit extension
        expect_extensions(true, &["tt2", "TT2"]);
        expect_paths(true, &["/app/templates/layout.tt2"]);
        expect_uris(true, &["file:///app/templates/layout.tt2"]);

        // .mason — HTML::Mason / Mason2 templates
        expect_extensions(true, &["mason", "MASON"]);
        expect_paths(true, &["/app/comp/header.mason"]);
        expect_uris(true, &["file:///app/comp/header.mason"]);

        // Non-template extensions remain unrecognized
        expect_extensions(false, &["html", "tmpl"]);
    }

    #[test]
    fn supports_windows_style_paths() {
        expect_uris(true, &[r"C:\workspace\script.pl", r"file:///C:/workspace/lib/Foo.pm"]);
        expect_uris(false, &[r"C:\workspace\README.txt"]);
    }

    // ── is_binary_content ─────────────────────────────────────────────────

    #[test]
    fn binary_content_null_byte_is_detected() {
        // A binary payload that reached us as a string with embedded null bytes.
        let payload = "PK\x00\x03some binary content\x00\x00\x00";
        assert!(is_binary_content(payload), "null bytes must trigger binary guard");
    }

    #[test]
    fn binary_content_single_null_byte_triggers_guard() {
        let source = "use strict;\x00\nuse warnings;\n";
        assert!(is_binary_content(source), "single null byte must trigger binary guard");
    }

    #[test]
    fn binary_content_clean_perl_is_not_binary() {
        let source = "#!/usr/bin/perl\nuse strict;\nuse warnings;\n\nprint \"Hello, World!\\n\";\n";
        assert!(!is_binary_content(source), "clean Perl source must not be classified as binary");
    }

    #[test]
    fn binary_content_empty_string_is_not_binary() {
        let nothing = "";
        assert!(!is_binary_content(nothing), "empty string must not be classified as binary");
    }

    #[test]
    fn binary_content_unicode_text_is_not_binary() {
        // Multi-byte UTF-8 sequences contain high bytes but no null bytes.
        let source = "# Perl with Unicode: \u{00e9}t\u{00e9}\nprint \"caf\u{00e9}\\n\";\n";
        assert!(!is_binary_content(source), "UTF-8 text without null bytes must not be binary");
    }

    #[test]
    fn binary_content_only_scans_first_probe_window() {
        // The null byte sits immediately after the probe window, so the scan
        // (limited to the first BINARY_PROBE_BYTES bytes) must not see it.
        let mut text = "a".repeat(BINARY_PROBE_BYTES);
        text.push_str("\x00trailing");
        assert!(
            !is_binary_content(&text),
            "null byte beyond probe window must not trigger the guard"
        );
    }

    #[test]
    fn binary_content_null_byte_at_probe_boundary() {
        // The null byte occupies the final byte of the probe window.
        let mut text = "a".repeat(BINARY_PROBE_BYTES - 1);
        text.push_str("\x00rest");
        assert!(is_binary_content(&text), "null byte at probe boundary must trigger binary guard");
    }

    #[test]
    fn binary_content_elf_header_is_detected() {
        // ELF magic (\x7fELF) followed by header bytes that include nulls.
        let header = "\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00";
        assert!(is_binary_content(header), "ELF-like header with null bytes must be binary");
    }

    #[test]
    fn binary_content_zip_pk_header_is_detected() {
        // ZIP local-file header: PK\x03\x04 followed by fields containing nulls.
        let header = "PK\x03\x04\x14\x00\x00\x00\x08\x00";
        assert!(is_binary_content(header), "ZIP-like header with null bytes must be binary");
    }
}
237        // ZIP files start with PK\x03\x04
238        let zip_like = "PK\x03\x04\x14\x00\x00\x00\x08\x00";
239        assert!(is_binary_content(zip_like), "ZIP-like header with null bytes must be binary");
240    }
241}