Skip to main content

perl_parser_core/syntax/
source_file.rs

1//! Shared Perl source-file classification helpers.
2//!
3//! These helpers provide one canonical definition for what constitutes a Perl
4//! source file across workspace discovery and runtime file operations.
5
6use std::borrow::Cow;
7use std::path::Path;
8
9/// Number of bytes to inspect for binary content detection.
10///
11/// 4 KB is enough to catch all common binary formats (ELF, PE, ZIP, PNG, …)
12/// while being cheap to scan.
13const BINARY_PROBE_BYTES: usize = 4096;
14
15/// Returns `true` if `text` appears to contain binary (non-text) content.
16///
17/// The heuristic checks the first [`BINARY_PROBE_BYTES`] bytes for null bytes
18/// (`\0`).  A single null byte is sufficient to classify the content as
19/// binary: valid Perl (or any UTF-8 text) never contains null bytes outside of
20/// raw string literals, and real-world binary formats (ELF, PE/COFF, ZIP,
21/// PNG, …) all begin with or contain null bytes in their headers.
22///
23/// # Why null bytes?
24///
25/// - Fast: a single `memchr`-style scan of at most 4 KB.
26/// - Low false-positive rate: Perl source virtually never contains `\0`.
27/// - High true-positive rate: every common compiled binary contains `\0`.
28#[must_use]
29pub fn is_binary_content(text: &str) -> bool {
30    text.bytes().take(BINARY_PROBE_BYTES).any(|b| b == 0)
31}
32
33/// Canonical Perl source file extensions.
34///
35/// Includes core Perl script and module extensions as well as common embedded
36/// Perl template formats: `.ep` (Mojolicious), `.tt`/`.tt2` (Template Toolkit),
37/// and `.mason` (Mason/HTML::Mason).
38pub const PERL_SOURCE_EXTENSIONS: [&str; 9] =
39    ["pl", "pm", "t", "psgi", "cgi", "ep", "tt", "tt2", "mason"];
40
41/// Returns `true` if `extension` is a recognized Perl source extension.
42///
43/// Accepts values with or without a leading dot and matches
44/// case-insensitively.
45#[must_use]
46pub fn is_perl_source_extension(extension: &str) -> bool {
47    let ext = extension.strip_prefix('.').unwrap_or(extension);
48    PERL_SOURCE_EXTENSIONS.iter().any(|candidate| candidate.eq_ignore_ascii_case(ext))
49}
50
51/// Returns `true` if `path` points to a recognized Perl source file.
52#[must_use]
53pub fn is_perl_source_path(path: &Path) -> bool {
54    path.extension().and_then(|ext| ext.to_str()).is_some_and(is_perl_source_extension)
55}
56
57/// Returns `true` if `uri` or path-like string points to a Perl source file.
58///
59/// Supports:
60/// - Plain filesystem paths
61/// - `file://` URIs
62/// - Percent-encoded URI path segments
63/// - Optional query/fragment suffixes
64#[must_use]
65pub fn is_perl_source_uri(uri: &str) -> bool {
66    let path_part = uri.split_once(['?', '#']).map_or(uri, |(path_prefix, _)| path_prefix);
67    let decoded_path = percent_decode_uri_path(path_part);
68    is_perl_source_path(Path::new(decoded_path.as_ref()))
69}
70
71fn percent_decode_uri_path(path: &str) -> Cow<'_, str> {
72    if !path.as_bytes().contains(&b'%') {
73        return Cow::Borrowed(path);
74    }
75
76    let bytes = path.as_bytes();
77    let mut decoded = Vec::with_capacity(bytes.len());
78    let mut index = 0;
79    let mut changed = false;
80
81    while index < bytes.len() {
82        if bytes[index] == b'%'
83            && let (Some(high), Some(low)) = (bytes.get(index + 1), bytes.get(index + 2))
84            && let (Some(high), Some(low)) = (hex_value(*high), hex_value(*low))
85        {
86            decoded.push((high << 4) | low);
87            index += 3;
88            changed = true;
89        } else {
90            decoded.push(bytes[index]);
91            index += 1;
92        }
93    }
94
95    if !changed {
96        return Cow::Borrowed(path);
97    }
98
99    String::from_utf8(decoded).map_or(Cow::Borrowed(path), Cow::Owned)
100}
101
102fn hex_value(byte: u8) -> Option<u8> {
103    match byte {
104        b'0'..=b'9' => Some(byte - b'0'),
105        b'a'..=b'f' => Some(byte - b'a' + 10),
106        b'A'..=b'F' => Some(byte - b'A' + 10),
107        _ => None,
108    }
109}
110
111#[cfg(test)]
112mod tests {
113    use super::{
114        BINARY_PROBE_BYTES, PERL_SOURCE_EXTENSIONS, is_binary_content, is_perl_source_extension,
115        is_perl_source_path, is_perl_source_uri,
116    };
117    use std::path::Path;
118
119    #[test]
120    fn exposes_expected_extension_set() {
121        assert_eq!(
122            PERL_SOURCE_EXTENSIONS,
123            ["pl", "pm", "t", "psgi", "cgi", "ep", "tt", "tt2", "mason"]
124        );
125    }
126
127    #[test]
128    fn classifies_extensions_case_insensitively() {
129        assert!(is_perl_source_extension("pl"));
130        assert!(is_perl_source_extension(".pm"));
131        assert!(is_perl_source_extension("T"));
132        assert!(is_perl_source_extension("PsGi"));
133        assert!(is_perl_source_extension("cgi"));
134        assert!(is_perl_source_extension(".CGI"));
135        assert!(!is_perl_source_extension("txt"));
136    }
137
138    #[test]
139    fn classifies_filesystem_paths() {
140        assert!(is_perl_source_path(Path::new("/workspace/script.pl")));
141        assert!(is_perl_source_path(Path::new("/workspace/lib/Foo/Bar.PM")));
142        assert!(is_perl_source_path(Path::new("/workspace/app.psgi")));
143        assert!(is_perl_source_path(Path::new("/var/www/cgi-bin/form.cgi")));
144        assert!(is_perl_source_path(Path::new("/var/www/cgi-bin/upload.CGI")));
145        assert!(!is_perl_source_path(Path::new("/workspace/README.md")));
146        assert!(!is_perl_source_path(Path::new("/workspace/no_extension")));
147    }
148
149    #[test]
150    fn classifies_uri_like_inputs() {
151        assert!(is_perl_source_uri("file:///workspace/script.pl"));
152        assert!(is_perl_source_uri("file:///workspace/lib/Foo/Bar.pm"));
153        assert!(is_perl_source_uri("file:///workspace/app.psgi"));
154        assert!(is_perl_source_uri("file:///workspace/app.psgi?version=1#section"));
155        assert!(is_perl_source_uri("file:///var/www/cgi-bin/form.cgi"));
156        assert!(is_perl_source_uri("file:///var/www/cgi-bin/search.cgi?q=perl#results"));
157        assert!(!is_perl_source_uri("file:///workspace/README.md"));
158    }
159
160    #[test]
161    fn classifies_percent_encoded_uri_path_extensions() {
162        assert!(is_perl_source_uri("file:///workspace/script%2Epl"));
163        assert!(is_perl_source_uri("file:///workspace/lib/Foo%2FBar.%70%6D"));
164        assert!(is_perl_source_uri("file:///workspace/templates/index%2Ehtml%2Eep?rev=1#L4"));
165        assert!(!is_perl_source_uri("file:///workspace/README%2Emd"));
166    }
167
168    #[test]
169    fn invalid_percent_escapes_remain_literal() {
170        assert!(is_perl_source_uri("file:///workspace/script%ZZ.pl"));
171        assert!(!is_perl_source_uri("file:///workspace/script.%ZZ"));
172    }
173
174    #[test]
175    fn cgi_and_psgi_are_recognized() {
176        // CGI scripts (.cgi) — web projects, Apache/Nginx CGI handlers
177        assert!(is_perl_source_extension("cgi"));
178        assert!(is_perl_source_extension("CGI"));
179        assert!(is_perl_source_path(Path::new("/var/www/cgi-bin/form.cgi")));
180        assert!(is_perl_source_uri("file:///var/www/cgi-bin/form.cgi"));
181
182        // PSGI apps (.psgi) — Plack/PSGI applications
183        assert!(is_perl_source_extension("psgi"));
184        assert!(is_perl_source_extension("PSGI"));
185        assert!(is_perl_source_path(Path::new("/workspace/app.psgi")));
186        assert!(is_perl_source_uri("file:///workspace/app.psgi"));
187
188        // Non-Perl extensions remain unrecognized
189        assert!(!is_perl_source_extension("sh"));
190        assert!(!is_perl_source_extension("py"));
191    }
192
193    #[test]
194    fn template_extensions_are_recognized() {
195        // .ep — Mojolicious embedded Perl templates
196        assert!(is_perl_source_extension("ep"));
197        assert!(is_perl_source_extension("EP"));
198        assert!(is_perl_source_path(Path::new("/app/templates/index.html.ep")));
199        assert!(is_perl_source_uri("file:///app/templates/index.html.ep"));
200
201        // .tt — Template Toolkit templates (version 2 default)
202        assert!(is_perl_source_extension("tt"));
203        assert!(is_perl_source_extension("TT"));
204        assert!(is_perl_source_path(Path::new("/app/templates/page.tt")));
205        assert!(is_perl_source_uri("file:///app/templates/page.tt"));
206
207        // .tt2 — Template Toolkit 2 explicit extension
208        assert!(is_perl_source_extension("tt2"));
209        assert!(is_perl_source_extension("TT2"));
210        assert!(is_perl_source_path(Path::new("/app/templates/layout.tt2")));
211        assert!(is_perl_source_uri("file:///app/templates/layout.tt2"));
212
213        // .mason — HTML::Mason / Mason2 templates
214        assert!(is_perl_source_extension("mason"));
215        assert!(is_perl_source_extension("MASON"));
216        assert!(is_perl_source_path(Path::new("/app/comp/header.mason")));
217        assert!(is_perl_source_uri("file:///app/comp/header.mason"));
218
219        // Non-template extensions remain unrecognized
220        assert!(!is_perl_source_extension("html"));
221        assert!(!is_perl_source_extension("tmpl"));
222    }
223
224    #[test]
225    fn supports_windows_style_paths() {
226        assert!(is_perl_source_uri(r"C:\workspace\script.pl"));
227        assert!(is_perl_source_uri(r"file:///C:/workspace/lib/Foo.pm"));
228        assert!(!is_perl_source_uri(r"C:\workspace\README.txt"));
229    }
230
231    // ── is_binary_content ─────────────────────────────────────────────────
232
233    #[test]
234    fn binary_content_null_byte_is_detected() {
235        // Simulate a binary file arriving as a string with embedded null bytes
236        let binary = "PK\x00\x03some binary content\x00\x00\x00";
237        assert!(is_binary_content(binary), "null bytes must trigger binary guard");
238    }
239
240    #[test]
241    fn binary_content_single_null_byte_triggers_guard() {
242        let text = "use strict;\x00\nuse warnings;\n";
243        assert!(is_binary_content(text), "single null byte must trigger binary guard");
244    }
245
246    #[test]
247    fn binary_content_clean_perl_is_not_binary() {
248        let perl = "#!/usr/bin/perl\nuse strict;\nuse warnings;\n\nprint \"Hello, World!\\n\";\n";
249        assert!(!is_binary_content(perl), "clean Perl source must not be classified as binary");
250    }
251
252    #[test]
253    fn binary_content_empty_string_is_not_binary() {
254        assert!(!is_binary_content(""), "empty string must not be classified as binary");
255    }
256
257    #[test]
258    fn binary_content_unicode_text_is_not_binary() {
259        // High-byte UTF-8 sequences must not trigger the guard
260        let utf8 = "# Perl with Unicode: \u{00e9}t\u{00e9}\nprint \"caf\u{00e9}\\n\";\n";
261        assert!(!is_binary_content(utf8), "UTF-8 text without null bytes must not be binary");
262    }
263
264    #[test]
265    fn binary_content_only_scans_first_probe_window() {
266        // A null byte beyond the probe window must NOT trigger the guard —
267        // we only scan the first BINARY_PROBE_BYTES bytes.
268        let safe_prefix = "a".repeat(BINARY_PROBE_BYTES);
269        let text_with_late_null = format!("{safe_prefix}\x00trailing");
270        assert!(
271            !is_binary_content(&text_with_late_null),
272            "null byte beyond probe window must not trigger the guard"
273        );
274    }
275
276    #[test]
277    fn binary_content_null_byte_at_probe_boundary() {
278        // A null byte exactly at the last probe byte must still be detected
279        let prefix = "a".repeat(BINARY_PROBE_BYTES - 1);
280        let text = format!("{prefix}\x00rest");
281        assert!(is_binary_content(&text), "null byte at probe boundary must trigger binary guard");
282    }
283
284    #[test]
285    fn binary_content_elf_header_is_detected() {
286        // ELF magic: \x7fELF followed by binary data
287        let elf_like = "\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00";
288        assert!(is_binary_content(elf_like), "ELF-like header with null bytes must be binary");
289    }
290
291    #[test]
292    fn binary_content_zip_pk_header_is_detected() {
293        // ZIP files start with PK\x03\x04
294        let zip_like = "PK\x03\x04\x14\x00\x00\x00\x08\x00";
295        assert!(is_binary_content(zip_like), "ZIP-like header with null bytes must be binary");
296    }
297}