use std::borrow::Cow;
use std::path::Path;
/// Number of leading bytes inspected by [`is_binary_content`].
const BINARY_PROBE_BYTES: usize = 4096;

/// Reports whether `text` looks like binary data.
///
/// Only the first [`BINARY_PROBE_BYTES`] bytes are examined; the content is
/// treated as binary when that leading window contains a NUL (`0`) byte.
/// NUL bytes located after the window are ignored, and an empty string is
/// never binary.
#[must_use]
pub fn is_binary_content(text: &str) -> bool {
    let raw = text.as_bytes();
    // Inputs shorter than the probe window are scanned in full.
    let window = raw.get(..BINARY_PROBE_BYTES).unwrap_or(raw);
    window.contains(&0)
}
/// File extensions (written without a leading dot, in lowercase) that
/// [`is_perl_source_extension`] accepts.
pub const PERL_SOURCE_EXTENSIONS: [&str; 9] =
    ["pl", "pm", "t", "psgi", "cgi", "ep", "tt", "tt2", "mason"];

/// Reports whether `extension` is one of [`PERL_SOURCE_EXTENSIONS`].
///
/// A single leading `.` is ignored, so both `"pm"` and `".pm"` are accepted.
/// The comparison is ASCII case-insensitive.
#[must_use]
pub fn is_perl_source_extension(extension: &str) -> bool {
    // Drop at most one leading dot; anything after it is compared verbatim.
    let bare = extension.strip_prefix('.').unwrap_or(extension);
    PERL_SOURCE_EXTENSIONS.iter().copied().any(|known| bare.eq_ignore_ascii_case(known))
}
#[must_use]
pub fn is_perl_source_path(path: &Path) -> bool {
path.extension().and_then(|ext| ext.to_str()).is_some_and(is_perl_source_extension)
}
/// Reports whether a URI-like string refers to a Perl source file.
///
/// Everything from the first `?` or `#` onwards (query or fragment) is
/// discarded. The remaining text is percent-decoded and then classified by
/// its file extension via `is_perl_source_path`.
#[must_use]
pub fn is_perl_source_uri(uri: &str) -> bool {
    // `split` always yields at least one piece, so the fallback is never used;
    // the first piece is the text preceding any query/fragment delimiter.
    let path_only = uri.split(['?', '#']).next().unwrap_or(uri);
    let decoded = percent_decode_uri_path(path_only);
    is_perl_source_path(Path::new(&*decoded))
}
fn percent_decode_uri_path(path: &str) -> Cow<'_, str> {
if !path.as_bytes().contains(&b'%') {
return Cow::Borrowed(path);
}
let bytes = path.as_bytes();
let mut decoded = Vec::with_capacity(bytes.len());
let mut index = 0;
let mut changed = false;
while index < bytes.len() {
if bytes[index] == b'%'
&& let (Some(high), Some(low)) = (bytes.get(index + 1), bytes.get(index + 2))
&& let (Some(high), Some(low)) = (hex_value(*high), hex_value(*low))
{
decoded.push((high << 4) | low);
index += 3;
changed = true;
} else {
decoded.push(bytes[index]);
index += 1;
}
}
if !changed {
return Cow::Borrowed(path);
}
String::from_utf8(decoded).map_or(Cow::Borrowed(path), Cow::Owned)
}
/// Converts one ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its numeric
/// value in `0..=15`; any other byte yields `None`.
fn hex_value(byte: u8) -> Option<u8> {
    if byte.is_ascii_digit() {
        Some(byte - b'0')
    } else if byte.is_ascii_hexdigit() {
        // Decimal digits were handled above, so only `a-f` / `A-F` reach here.
        Some(byte.to_ascii_lowercase() - b'a' + 10)
    } else {
        None
    }
}
// Unit tests for the Perl-source classifiers (extension, path, URI) and the
// NUL-byte binary-content probe defined in the parent module.
#[cfg(test)]
mod tests {
use super::{
BINARY_PROBE_BYTES, PERL_SOURCE_EXTENSIONS, is_binary_content, is_perl_source_extension,
is_perl_source_path, is_perl_source_uri,
};
use std::path::Path;
// Pins the exact contents and order of the public extension list.
#[test]
fn exposes_expected_extension_set() {
assert_eq!(
PERL_SOURCE_EXTENSIONS,
["pl", "pm", "t", "psgi", "cgi", "ep", "tt", "tt2", "mason"]
);
}
// Extensions match regardless of ASCII case and with or without a leading dot.
#[test]
fn classifies_extensions_case_insensitively() {
assert!(is_perl_source_extension("pl"));
assert!(is_perl_source_extension(".pm"));
assert!(is_perl_source_extension("T"));
assert!(is_perl_source_extension("PsGi"));
assert!(is_perl_source_extension("cgi"));
assert!(is_perl_source_extension(".CGI"));
assert!(!is_perl_source_extension("txt"));
}
// Paths are classified by their final extension; a path without one is rejected.
#[test]
fn classifies_filesystem_paths() {
assert!(is_perl_source_path(Path::new("/workspace/script.pl")));
assert!(is_perl_source_path(Path::new("/workspace/lib/Foo/Bar.PM")));
assert!(is_perl_source_path(Path::new("/workspace/app.psgi")));
assert!(is_perl_source_path(Path::new("/var/www/cgi-bin/form.cgi")));
assert!(is_perl_source_path(Path::new("/var/www/cgi-bin/upload.CGI")));
assert!(!is_perl_source_path(Path::new("/workspace/README.md")));
assert!(!is_perl_source_path(Path::new("/workspace/no_extension")));
}
// `file://` URIs are classified by extension; a trailing `?query` and/or
// `#fragment` must not affect the result.
#[test]
fn classifies_uri_like_inputs() {
assert!(is_perl_source_uri("file:///workspace/script.pl"));
assert!(is_perl_source_uri("file:///workspace/lib/Foo/Bar.pm"));
assert!(is_perl_source_uri("file:///workspace/app.psgi"));
assert!(is_perl_source_uri("file:///workspace/app.psgi?version=1#section"));
assert!(is_perl_source_uri("file:///var/www/cgi-bin/form.cgi"));
assert!(is_perl_source_uri("file:///var/www/cgi-bin/search.cgi?q=perl#results"));
assert!(!is_perl_source_uri("file:///workspace/README.md"));
}
// Percent-escapes are decoded before the extension is inspected:
// `%2E` is `.`, `%2F` is `/`, and `%70%6D` spells `pm`.
#[test]
fn classifies_percent_encoded_uri_path_extensions() {
assert!(is_perl_source_uri("file:///workspace/script%2Epl"));
assert!(is_perl_source_uri("file:///workspace/lib/Foo%2FBar.%70%6D"));
assert!(is_perl_source_uri("file:///workspace/templates/index%2Ehtml%2Eep?rev=1#L4"));
assert!(!is_perl_source_uri("file:///workspace/README%2Emd"));
}
// `%ZZ` is not a valid escape and is kept as literal text: the first URI still
// ends in `.pl`, while the second one's extension is the literal `%ZZ`.
#[test]
fn invalid_percent_escapes_remain_literal() {
assert!(is_perl_source_uri("file:///workspace/script%ZZ.pl"));
assert!(!is_perl_source_uri("file:///workspace/script.%ZZ"));
}
// `cgi` and `psgi` are accepted by all three classifiers; unrelated script
// extensions (`sh`, `py`) are rejected.
#[test]
fn cgi_and_psgi_are_recognized() {
assert!(is_perl_source_extension("cgi"));
assert!(is_perl_source_extension("CGI"));
assert!(is_perl_source_path(Path::new("/var/www/cgi-bin/form.cgi")));
assert!(is_perl_source_uri("file:///var/www/cgi-bin/form.cgi"));
assert!(is_perl_source_extension("psgi"));
assert!(is_perl_source_extension("PSGI"));
assert!(is_perl_source_path(Path::new("/workspace/app.psgi")));
assert!(is_perl_source_uri("file:///workspace/app.psgi"));
assert!(!is_perl_source_extension("sh"));
assert!(!is_perl_source_extension("py"));
}
// The template extensions `ep`, `tt`, `tt2` and `mason` are accepted by all
// three classifiers; for `index.html.ep` only the final `.ep` is considered.
// Generic template extensions (`html`, `tmpl`) are rejected.
#[test]
fn template_extensions_are_recognized() {
assert!(is_perl_source_extension("ep"));
assert!(is_perl_source_extension("EP"));
assert!(is_perl_source_path(Path::new("/app/templates/index.html.ep")));
assert!(is_perl_source_uri("file:///app/templates/index.html.ep"));
assert!(is_perl_source_extension("tt"));
assert!(is_perl_source_extension("TT"));
assert!(is_perl_source_path(Path::new("/app/templates/page.tt")));
assert!(is_perl_source_uri("file:///app/templates/page.tt"));
assert!(is_perl_source_extension("tt2"));
assert!(is_perl_source_extension("TT2"));
assert!(is_perl_source_path(Path::new("/app/templates/layout.tt2")));
assert!(is_perl_source_uri("file:///app/templates/layout.tt2"));
assert!(is_perl_source_extension("mason"));
assert!(is_perl_source_extension("MASON"));
assert!(is_perl_source_path(Path::new("/app/comp/header.mason")));
assert!(is_perl_source_uri("file:///app/comp/header.mason"));
assert!(!is_perl_source_extension("html"));
assert!(!is_perl_source_extension("tmpl"));
}
// Backslash-separated and drive-letter inputs passed to the URI classifier
// are still classified by their trailing extension.
#[test]
fn supports_windows_style_paths() {
assert!(is_perl_source_uri(r"C:\workspace\script.pl"));
assert!(is_perl_source_uri(r"file:///C:/workspace/lib/Foo.pm"));
assert!(!is_perl_source_uri(r"C:\workspace\README.txt"));
}
// Content containing several NUL bytes is flagged as binary.
#[test]
fn binary_content_null_byte_is_detected() {
let binary = "PK\x00\x03some binary content\x00\x00\x00";
assert!(is_binary_content(binary), "null bytes must trigger binary guard");
}
// One NUL byte inside otherwise ordinary Perl text is enough to flag it.
#[test]
fn binary_content_single_null_byte_triggers_guard() {
let text = "use strict;\x00\nuse warnings;\n";
assert!(is_binary_content(text), "single null byte must trigger binary guard");
}
// Plain Perl source without NUL bytes is not flagged.
#[test]
fn binary_content_clean_perl_is_not_binary() {
let perl = "#!/usr/bin/perl\nuse strict;\nuse warnings;\n\nprint \"Hello, World!\\n\";\n";
assert!(!is_binary_content(perl), "clean Perl source must not be classified as binary");
}
// Empty input has nothing to scan and is not flagged.
#[test]
fn binary_content_empty_string_is_not_binary() {
assert!(!is_binary_content(""), "empty string must not be classified as binary");
}
// Multi-byte UTF-8 characters do not contain NUL bytes and are not flagged.
#[test]
fn binary_content_unicode_text_is_not_binary() {
let utf8 = "# Perl with Unicode: \u{00e9}t\u{00e9}\nprint \"caf\u{00e9}\\n\";\n";
assert!(!is_binary_content(utf8), "UTF-8 text without null bytes must not be binary");
}
// A NUL at byte offset BINARY_PROBE_BYTES (the first byte past the probe
// window) must be ignored.
#[test]
fn binary_content_only_scans_first_probe_window() {
let safe_prefix = "a".repeat(BINARY_PROBE_BYTES);
let text_with_late_null = format!("{safe_prefix}\x00trailing");
assert!(
!is_binary_content(&text_with_late_null),
"null byte beyond probe window must not trigger the guard"
);
}
// A NUL at byte offset BINARY_PROBE_BYTES - 1 (the last byte inside the probe
// window) must still be detected.
#[test]
fn binary_content_null_byte_at_probe_boundary() {
let prefix = "a".repeat(BINARY_PROBE_BYTES - 1);
let text = format!("{prefix}\x00rest");
assert!(is_binary_content(&text), "null byte at probe boundary must trigger binary guard");
}
// An ELF-style header (magic bytes followed by NUL padding) is flagged.
#[test]
fn binary_content_elf_header_is_detected() {
let elf_like = "\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00";
assert!(is_binary_content(elf_like), "ELF-like header with null bytes must be binary");
}
// A ZIP-style `PK` header containing NUL bytes is flagged.
#[test]
fn binary_content_zip_pk_header_is_detected() {
let zip_like = "PK\x03\x04\x14\x00\x00\x00\x08\x00";
assert!(is_binary_content(zip_like), "ZIP-like header with null bytes must be binary");
}
}