perl_parser_core/syntax/
source_file.rs1use std::borrow::Cow;
7use std::path::Path;
8
9const BINARY_PROBE_BYTES: usize = 4096;
14
15#[must_use]
29pub fn is_binary_content(text: &str) -> bool {
30 text.bytes().take(BINARY_PROBE_BYTES).any(|b| b == 0)
31}
32
33pub const PERL_SOURCE_EXTENSIONS: [&str; 9] =
39 ["pl", "pm", "t", "psgi", "cgi", "ep", "tt", "tt2", "mason"];
40
41#[must_use]
46pub fn is_perl_source_extension(extension: &str) -> bool {
47 let ext = extension.strip_prefix('.').unwrap_or(extension);
48 PERL_SOURCE_EXTENSIONS.iter().any(|candidate| candidate.eq_ignore_ascii_case(ext))
49}
50
51#[must_use]
53pub fn is_perl_source_path(path: &Path) -> bool {
54 path.extension().and_then(|ext| ext.to_str()).is_some_and(is_perl_source_extension)
55}
56
57#[must_use]
65pub fn is_perl_source_uri(uri: &str) -> bool {
66 let path_part = uri.split_once(['?', '#']).map_or(uri, |(path_prefix, _)| path_prefix);
67 let decoded_path = percent_decode_uri_path(path_part);
68 is_perl_source_path(Path::new(decoded_path.as_ref()))
69}
70
71fn percent_decode_uri_path(path: &str) -> Cow<'_, str> {
72 if !path.as_bytes().contains(&b'%') {
73 return Cow::Borrowed(path);
74 }
75
76 let bytes = path.as_bytes();
77 let mut decoded = Vec::with_capacity(bytes.len());
78 let mut index = 0;
79 let mut changed = false;
80
81 while index < bytes.len() {
82 if bytes[index] == b'%'
83 && let (Some(high), Some(low)) = (bytes.get(index + 1), bytes.get(index + 2))
84 && let (Some(high), Some(low)) = (hex_value(*high), hex_value(*low))
85 {
86 decoded.push((high << 4) | low);
87 index += 3;
88 changed = true;
89 } else {
90 decoded.push(bytes[index]);
91 index += 1;
92 }
93 }
94
95 if !changed {
96 return Cow::Borrowed(path);
97 }
98
99 String::from_utf8(decoded).map_or(Cow::Borrowed(path), Cow::Owned)
100}
101
102fn hex_value(byte: u8) -> Option<u8> {
103 match byte {
104 b'0'..=b'9' => Some(byte - b'0'),
105 b'a'..=b'f' => Some(byte - b'a' + 10),
106 b'A'..=b'F' => Some(byte - b'A' + 10),
107 _ => None,
108 }
109}
110
111#[cfg(test)]
112mod tests {
113 use super::{
114 BINARY_PROBE_BYTES, PERL_SOURCE_EXTENSIONS, is_binary_content, is_perl_source_extension,
115 is_perl_source_path, is_perl_source_uri,
116 };
117 use std::path::Path;
118
119 #[test]
120 fn exposes_expected_extension_set() {
121 assert_eq!(
122 PERL_SOURCE_EXTENSIONS,
123 ["pl", "pm", "t", "psgi", "cgi", "ep", "tt", "tt2", "mason"]
124 );
125 }
126
127 #[test]
128 fn classifies_extensions_case_insensitively() {
129 assert!(is_perl_source_extension("pl"));
130 assert!(is_perl_source_extension(".pm"));
131 assert!(is_perl_source_extension("T"));
132 assert!(is_perl_source_extension("PsGi"));
133 assert!(is_perl_source_extension("cgi"));
134 assert!(is_perl_source_extension(".CGI"));
135 assert!(!is_perl_source_extension("txt"));
136 }
137
138 #[test]
139 fn classifies_filesystem_paths() {
140 assert!(is_perl_source_path(Path::new("/workspace/script.pl")));
141 assert!(is_perl_source_path(Path::new("/workspace/lib/Foo/Bar.PM")));
142 assert!(is_perl_source_path(Path::new("/workspace/app.psgi")));
143 assert!(is_perl_source_path(Path::new("/var/www/cgi-bin/form.cgi")));
144 assert!(is_perl_source_path(Path::new("/var/www/cgi-bin/upload.CGI")));
145 assert!(!is_perl_source_path(Path::new("/workspace/README.md")));
146 assert!(!is_perl_source_path(Path::new("/workspace/no_extension")));
147 }
148
149 #[test]
150 fn classifies_uri_like_inputs() {
151 assert!(is_perl_source_uri("file:///workspace/script.pl"));
152 assert!(is_perl_source_uri("file:///workspace/lib/Foo/Bar.pm"));
153 assert!(is_perl_source_uri("file:///workspace/app.psgi"));
154 assert!(is_perl_source_uri("file:///workspace/app.psgi?version=1#section"));
155 assert!(is_perl_source_uri("file:///var/www/cgi-bin/form.cgi"));
156 assert!(is_perl_source_uri("file:///var/www/cgi-bin/search.cgi?q=perl#results"));
157 assert!(!is_perl_source_uri("file:///workspace/README.md"));
158 }
159
160 #[test]
161 fn classifies_percent_encoded_uri_path_extensions() {
162 assert!(is_perl_source_uri("file:///workspace/script%2Epl"));
163 assert!(is_perl_source_uri("file:///workspace/lib/Foo%2FBar.%70%6D"));
164 assert!(is_perl_source_uri("file:///workspace/templates/index%2Ehtml%2Eep?rev=1#L4"));
165 assert!(!is_perl_source_uri("file:///workspace/README%2Emd"));
166 }
167
168 #[test]
169 fn invalid_percent_escapes_remain_literal() {
170 assert!(is_perl_source_uri("file:///workspace/script%ZZ.pl"));
171 assert!(!is_perl_source_uri("file:///workspace/script.%ZZ"));
172 }
173
174 #[test]
175 fn cgi_and_psgi_are_recognized() {
176 assert!(is_perl_source_extension("cgi"));
178 assert!(is_perl_source_extension("CGI"));
179 assert!(is_perl_source_path(Path::new("/var/www/cgi-bin/form.cgi")));
180 assert!(is_perl_source_uri("file:///var/www/cgi-bin/form.cgi"));
181
182 assert!(is_perl_source_extension("psgi"));
184 assert!(is_perl_source_extension("PSGI"));
185 assert!(is_perl_source_path(Path::new("/workspace/app.psgi")));
186 assert!(is_perl_source_uri("file:///workspace/app.psgi"));
187
188 assert!(!is_perl_source_extension("sh"));
190 assert!(!is_perl_source_extension("py"));
191 }
192
193 #[test]
194 fn template_extensions_are_recognized() {
195 assert!(is_perl_source_extension("ep"));
197 assert!(is_perl_source_extension("EP"));
198 assert!(is_perl_source_path(Path::new("/app/templates/index.html.ep")));
199 assert!(is_perl_source_uri("file:///app/templates/index.html.ep"));
200
201 assert!(is_perl_source_extension("tt"));
203 assert!(is_perl_source_extension("TT"));
204 assert!(is_perl_source_path(Path::new("/app/templates/page.tt")));
205 assert!(is_perl_source_uri("file:///app/templates/page.tt"));
206
207 assert!(is_perl_source_extension("tt2"));
209 assert!(is_perl_source_extension("TT2"));
210 assert!(is_perl_source_path(Path::new("/app/templates/layout.tt2")));
211 assert!(is_perl_source_uri("file:///app/templates/layout.tt2"));
212
213 assert!(is_perl_source_extension("mason"));
215 assert!(is_perl_source_extension("MASON"));
216 assert!(is_perl_source_path(Path::new("/app/comp/header.mason")));
217 assert!(is_perl_source_uri("file:///app/comp/header.mason"));
218
219 assert!(!is_perl_source_extension("html"));
221 assert!(!is_perl_source_extension("tmpl"));
222 }
223
224 #[test]
225 fn supports_windows_style_paths() {
226 assert!(is_perl_source_uri(r"C:\workspace\script.pl"));
227 assert!(is_perl_source_uri(r"file:///C:/workspace/lib/Foo.pm"));
228 assert!(!is_perl_source_uri(r"C:\workspace\README.txt"));
229 }
230
231 #[test]
234 fn binary_content_null_byte_is_detected() {
235 let binary = "PK\x00\x03some binary content\x00\x00\x00";
237 assert!(is_binary_content(binary), "null bytes must trigger binary guard");
238 }
239
240 #[test]
241 fn binary_content_single_null_byte_triggers_guard() {
242 let text = "use strict;\x00\nuse warnings;\n";
243 assert!(is_binary_content(text), "single null byte must trigger binary guard");
244 }
245
246 #[test]
247 fn binary_content_clean_perl_is_not_binary() {
248 let perl = "#!/usr/bin/perl\nuse strict;\nuse warnings;\n\nprint \"Hello, World!\\n\";\n";
249 assert!(!is_binary_content(perl), "clean Perl source must not be classified as binary");
250 }
251
252 #[test]
253 fn binary_content_empty_string_is_not_binary() {
254 assert!(!is_binary_content(""), "empty string must not be classified as binary");
255 }
256
257 #[test]
258 fn binary_content_unicode_text_is_not_binary() {
259 let utf8 = "# Perl with Unicode: \u{00e9}t\u{00e9}\nprint \"caf\u{00e9}\\n\";\n";
261 assert!(!is_binary_content(utf8), "UTF-8 text without null bytes must not be binary");
262 }
263
264 #[test]
265 fn binary_content_only_scans_first_probe_window() {
266 let safe_prefix = "a".repeat(BINARY_PROBE_BYTES);
269 let text_with_late_null = format!("{safe_prefix}\x00trailing");
270 assert!(
271 !is_binary_content(&text_with_late_null),
272 "null byte beyond probe window must not trigger the guard"
273 );
274 }
275
276 #[test]
277 fn binary_content_null_byte_at_probe_boundary() {
278 let prefix = "a".repeat(BINARY_PROBE_BYTES - 1);
280 let text = format!("{prefix}\x00rest");
281 assert!(is_binary_content(&text), "null byte at probe boundary must trigger binary guard");
282 }
283
284 #[test]
285 fn binary_content_elf_header_is_detected() {
286 let elf_like = "\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00";
288 assert!(is_binary_content(elf_like), "ELF-like header with null bytes must be binary");
289 }
290
291 #[test]
292 fn binary_content_zip_pk_header_is_detected() {
293 let zip_like = "PK\x03\x04\x14\x00\x00\x00\x08\x00";
295 assert!(is_binary_content(zip_like), "ZIP-like header with null bytes must be binary");
296 }
297}