use std::path::Path;
/// Leading bytes `PK\x03\x04` (the ZIP local-file-header signature).
/// `detect_format` reports `"zip"` for any header starting with these bytes.
const ZIP_MAGIC: &[u8] = &[0x50, 0x4B, 0x03, 0x04];
/// Leading bytes `%PDF`.
/// `detect_format` reports `"pdf"` for any header starting with these bytes.
const PDF_MAGIC: &[u8] = b"%PDF";
pub fn detect_format(path: &Path, header_bytes: &[u8]) -> Option<&'static str> {
if header_bytes.len() >= 4 {
if header_bytes.starts_with(ZIP_MAGIC) {
return Some("zip");
}
if header_bytes.starts_with(PDF_MAGIC) {
return Some("pdf");
}
}
if let Some(fmt) = detect_by_extension(path) {
return Some(fmt);
}
let bytes = header_bytes
.strip_prefix(&[0xEF, 0xBB, 0xBF])
.unwrap_or(header_bytes);
if let Some(&first) = bytes.iter().find(|b| !b.is_ascii_whitespace())
&& (first == b'{' || first == b'[')
{
return Some("json");
}
None
}
/// Inspects the entry names of an in-memory ZIP archive to tell Office
/// document types apart.
///
/// Entries are visited in index order. The first entry whose name starts with
/// `word/`, `ppt/` or `xl/` decides the result: `"docx"`, `"pptx"` or
/// `"xlsx"` respectively. Entries that cannot be opened are skipped.
///
/// # Returns
///
/// `None` when `data` cannot be opened as a ZIP archive or when no entry
/// carries one of the recognised prefixes.
pub fn detect_zip_format(data: &[u8]) -> Option<&'static str> {
    // (entry-name prefix, reported format)
    const OFFICE_DIRS: [(&str, &str); 3] =
        [("word/", "docx"), ("ppt/", "pptx"), ("xl/", "xlsx")];

    let mut archive = zip::ZipArchive::new(std::io::Cursor::new(data)).ok()?;
    let entry_count = archive.len();

    for index in 0..entry_count {
        // Raw access: only the entry name is needed here.
        let entry = match archive.by_index_raw(index) {
            Ok(entry) => entry,
            Err(_) => continue,
        };
        let entry_name = entry.name();
        for &(dir, format) in OFFICE_DIRS.iter() {
            if entry_name.starts_with(dir) {
                return Some(format);
            }
        }
    }

    None
}
/// Maps a path's extension (compared ASCII-case-insensitively) to a format
/// label.
///
/// Returns `None` when the path has no extension, when the extension is not
/// valid UTF-8, or when the extension is not in any of the tables below.
fn detect_by_extension(path: &Path) -> Option<&'static str> {
    // Extensions whose label is the extension itself.
    const SAME_AS_EXTENSION: &[&str] = &[
        "docx", "pptx", "xlsx", "xls", "csv", "ipynb", "json", "pdf", "xml",
    ];
    const HTML: &[&str] = &["html", "htm"];
    const TEXT: &[&str] = &[
        "txt", "text", "log", "md", "markdown", "rst", "ini", "cfg", "conf", "toml", "yaml",
        "yml",
    ];
    const IMAGE: &[&str] = &[
        "png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "tif", "svg", "heic", "heif", "avif",
    ];
    const CODE: &[&str] = &[
        "c", "h", "cpp", "cc", "cxx", "hpp", "hxx", "hh", "py", "pyw", "js", "mjs", "cjs", "jsx",
        "ts", "mts", "cts", "tsx", "rs", "go", "java", "kt", "kts", "rb", "swift", "cs", "php",
        "sh", "bash", "zsh", "fish", "pl", "pm", "lua", "r", "scala", "dart", "ex", "exs", "erl",
        "hs", "ml", "mli", "sql", "m", "mm", "zig", "nim", "v", "groovy", "ps1", "bat", "cmd",
    ];
    // Families of extensions that share one label.
    const GROUPS: &[(&[&str], &str)] = &[
        (HTML, "html"),
        (TEXT, "txt"),
        (IMAGE, "image"),
        (CODE, "code"),
    ];

    let lowered = path.extension()?.to_str()?.to_ascii_lowercase();
    let ext = lowered.as_str();

    if let Some(&label) = SAME_AS_EXTENSION.iter().find(|&&known| known == ext) {
        return Some(label);
    }

    for &(members, label) in GROUPS {
        if members.iter().any(|&member| member == ext) {
            return Some(label);
        }
    }

    None
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Runs `detect_format` on a bare file name with the given header bytes.
    fn detect(file_name: &str, header: &[u8]) -> Option<&'static str> {
        detect_format(&PathBuf::from(file_name), header)
    }

    /// Runs `detect_format` with an empty header, so only the name can decide.
    fn detect_by_name(file_name: &str) -> Option<&'static str> {
        detect(file_name, &[])
    }

    // --- extension-only detection -------------------------------------------

    #[test]
    fn test_detect_format_docx_by_extension() {
        assert_eq!(detect_by_name("document.docx"), Some("docx"));
    }

    #[test]
    fn test_detect_format_pptx_by_extension() {
        assert_eq!(detect_by_name("slides.pptx"), Some("pptx"));
    }

    #[test]
    fn test_detect_format_xlsx_by_extension() {
        assert_eq!(detect_by_name("data.xlsx"), Some("xlsx"));
    }

    #[test]
    fn test_detect_format_csv_by_extension() {
        assert_eq!(detect_by_name("data.csv"), Some("csv"));
    }

    #[test]
    fn test_detect_format_json_by_extension() {
        assert_eq!(detect_by_name("config.json"), Some("json"));
    }

    #[test]
    fn test_detect_format_txt_by_extension() {
        assert_eq!(detect_by_name("readme.txt"), Some("txt"));
    }

    #[test]
    fn test_detect_format_text_variants() {
        let text_extensions = [
            "log", "md", "markdown", "rst", "ini", "cfg", "conf", "toml", "yaml", "yml",
        ];
        for ext in text_extensions.iter() {
            let file_name = format!("file.{}", ext);
            assert_eq!(
                detect_by_name(&file_name),
                Some("txt"),
                "expected 'txt' for .{}",
                ext
            );
        }
    }

    #[test]
    fn test_detect_format_pdf_by_extension() {
        assert_eq!(detect_by_name("paper.pdf"), Some("pdf"));
    }

    #[test]
    fn test_detect_format_html_by_extension() {
        assert_eq!(detect_by_name("page.html"), Some("html"));
        assert_eq!(detect_by_name("page.htm"), Some("html"));
    }

    #[test]
    fn test_detect_format_unknown_returns_none() {
        assert_eq!(detect_by_name("file.xyz"), None);
    }

    #[test]
    fn test_detect_format_no_extension_returns_none() {
        assert_eq!(detect_by_name("Makefile"), None);
    }

    // --- magic bytes beat the extension -------------------------------------

    #[test]
    fn test_detect_format_zip_magic_bytes_override_extension() {
        let zip_header = [0x50, 0x4B, 0x03, 0x04];
        assert_eq!(detect("data.csv", &zip_header), Some("zip"));
    }

    #[test]
    fn test_detect_format_pdf_magic_bytes_override_extension() {
        let pdf_header = b"%PDF-1.7";
        assert_eq!(detect("file.txt", pdf_header), Some("pdf"));
    }

    // --- JSON content heuristic ---------------------------------------------

    #[test]
    fn test_detect_format_json_heuristic_object() {
        let json_bytes = b" { \"key\": \"value\" }";
        assert_eq!(detect("data.bin", json_bytes), Some("json"));
    }

    #[test]
    fn test_detect_format_json_heuristic_array() {
        let json_bytes = b"[1, 2, 3]";
        assert_eq!(detect("data.bin", json_bytes), Some("json"));
    }

    #[test]
    fn test_detect_format_txt_starting_with_brace_returns_txt() {
        let content = b"{ this is just a text file }";
        assert_eq!(detect("notes.txt", content), Some("txt"));
    }

    #[test]
    fn test_detect_format_csv_starting_with_bracket_returns_csv() {
        let content = b"[header1],header2\nval1,val2";
        assert_eq!(detect("data.csv", content), Some("csv"));
    }

    #[test]
    fn test_detect_format_unknown_ext_with_json_content_returns_json() {
        let content = b"{ \"key\": \"value\" }";
        assert_eq!(detect("data.dat", content), Some("json"));
    }

    #[test]
    fn test_detect_format_unknown_ext_with_json_utf8_bom_returns_json() {
        // UTF-8 byte-order mark followed directly by a JSON object.
        let content: Vec<u8> = [&[0xEF, 0xBB, 0xBF][..], &b"{\"key\":\"value\"}"[..]].concat();
        assert_eq!(detect("data.dat", &content), Some("json"));
    }

    // --- images -------------------------------------------------------------

    #[test]
    fn test_detect_format_png_by_extension() {
        assert_eq!(detect_by_name("photo.png"), Some("image"));
    }

    #[test]
    fn test_detect_format_jpg_by_extension() {
        assert_eq!(detect_by_name("photo.jpg"), Some("image"));
    }

    #[test]
    fn test_detect_format_jpeg_by_extension() {
        assert_eq!(detect_by_name("photo.jpeg"), Some("image"));
    }

    #[test]
    fn test_detect_format_svg_by_extension() {
        assert_eq!(detect_by_name("icon.svg"), Some("image"));
    }

    #[test]
    fn test_detect_format_image_variants() {
        let image_extensions = [
            "png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "tif", "svg", "heic", "heif",
            "avif",
        ];
        for ext in image_extensions.iter() {
            let file_name = format!("file.{}", ext);
            assert_eq!(
                detect_by_name(&file_name),
                Some("image"),
                "expected 'image' for .{}",
                ext
            );
        }
    }

    // --- notebooks ----------------------------------------------------------

    #[test]
    fn test_detect_format_ipynb_by_extension() {
        assert_eq!(detect_by_name("notebook.ipynb"), Some("ipynb"));
    }

    #[test]
    fn test_detect_format_ipynb_not_caught_by_json_heuristic() {
        let content = b"{ \"cells\": [] }";
        assert_eq!(detect("notebook.ipynb", content), Some("ipynb"));
    }

    // --- source code --------------------------------------------------------

    #[test]
    fn test_detect_format_code_variants() {
        let code_extensions = [
            "c", "h", "cpp", "cc", "cxx", "hpp", "hxx", "hh", "py", "pyw", "js", "mjs", "cjs",
            "jsx", "ts", "mts", "cts", "tsx", "rs", "go", "java", "kt", "kts", "rb", "swift", "cs",
            "php", "sh", "bash", "zsh", "fish", "pl", "pm", "lua", "r", "scala", "dart", "ex",
            "exs", "erl", "hs", "ml", "mli", "sql", "m", "mm", "zig", "nim", "v", "groovy", "ps1",
            "bat", "cmd",
        ];
        for ext in code_extensions.iter() {
            let file_name = format!("file.{}", ext);
            assert_eq!(
                detect_by_name(&file_name),
                Some("code"),
                "expected 'code' for .{}",
                ext
            );
        }
    }
}