anytomd 1.2.2

Pure Rust library that converts various document formats into Markdown
Documentation
//! File format detection by magic bytes, file extension, and ZIP introspection.
//!
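//! Detection follows a strict priority order: magic bytes first, then container
//! introspection (for ZIP-based formats, performed separately via
//! `detect_zip_format`), then file extension, and finally a JSON content
//! heuristic for unrecognized extensions. This ensures that misnamed files are
//! handled correctly.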

use std::path::Path;

/// Signature at the start of a ZIP local file header: `PK\x03\x04`.
const ZIP_MAGIC: &[u8] = b"PK\x03\x04";
/// Signature at the start of a PDF document: `%PDF`.
const PDF_MAGIC: &[u8] = b"%PDF";

/// Detect the document format from a file path and optional header bytes.
///
/// Priority: magic bytes → file extension → JSON heuristic (fallback).
/// For ZIP-based formats (DOCX, PPTX, XLSX), the caller should use
/// `detect_zip_format` on the full file data for accurate detection.
pub fn detect_format(path: &Path, header_bytes: &[u8]) -> Option<&'static str> {
    // 1. Magic bytes / file signature
    if header_bytes.len() >= 4 {
        if header_bytes.starts_with(ZIP_MAGIC) {
            // Cannot distinguish DOCX/PPTX/XLSX from magic bytes alone;
            // return "zip" — caller should use detect_zip_format for specifics.
            return Some("zip");
        }
        if header_bytes.starts_with(PDF_MAGIC) {
            return Some("pdf");
        }
    }

    // 2. File extension
    if let Some(fmt) = detect_by_extension(path) {
        return Some(fmt);
    }

    // 3. JSON heuristic (fallback for unknown extensions): starts with { or [
    // Skip optional UTF-8 BOM before scanning.
    let bytes = header_bytes
        .strip_prefix(&[0xEF, 0xBB, 0xBF])
        .unwrap_or(header_bytes);
    if let Some(&first) = bytes.iter().find(|b| !b.is_ascii_whitespace())
        && (first == b'{' || first == b'[')
    {
        return Some("json");
    }

    None
}

/// Work out which Office Open XML format a ZIP archive holds from its entry names.
///
/// Entries are visited in index order and the first one whose name begins with
/// `word/`, `ppt/` or `xl/` decides the result (`"docx"`, `"pptx"` or `"xlsx"`
/// respectively). Entries that cannot be opened are skipped.
///
/// Returns `None` when `data` cannot be opened as a ZIP archive or when no
/// entry carries one of those prefixes.
pub fn detect_zip_format(data: &[u8]) -> Option<&'static str> {
    /// Map a single entry name to a format label based on its top-level directory.
    fn classify_entry(entry_name: &str) -> Option<&'static str> {
        const OOXML_ROOTS: [(&str, &str); 3] =
            [("word/", "docx"), ("ppt/", "pptx"), ("xl/", "xlsx")];
        OOXML_ROOTS
            .iter()
            .find(|(prefix, _)| entry_name.starts_with(*prefix))
            .map(|&(_, label)| label)
    }

    let mut archive = zip::ZipArchive::new(std::io::Cursor::new(data)).ok()?;
    let entry_count = archive.len();

    // Raw access is enough here: only the entry name is needed, not its contents.
    (0..entry_count).find_map(|index| {
        archive
            .by_index_raw(index)
            .ok()
            .and_then(|entry| classify_entry(entry.name()))
    })
}

/// Map a path's extension to a coarse format label, ignoring ASCII case.
///
/// Returns `None` when the path has no extension, when the extension is not
/// valid UTF-8, or when the extension is not listed in the table below.
fn detect_by_extension(path: &Path) -> Option<&'static str> {
    // Each row pairs a format label with the lowercase extensions that map to it.
    // The extension sets are disjoint, so row order does not affect the result.
    const EXTENSION_TABLE: &[(&str, &[&str])] = &[
        ("docx", &["docx"]),
        ("pptx", &["pptx"]),
        ("xlsx", &["xlsx"]),
        ("xls", &["xls"]),
        ("csv", &["csv"]),
        ("ipynb", &["ipynb"]),
        ("json", &["json"]),
        ("pdf", &["pdf"]),
        ("html", &["html", "htm"]),
        ("xml", &["xml"]),
        (
            "txt",
            &[
                "txt", "text", "log", "md", "markdown", "rst", "ini", "cfg", "conf", "toml",
                "yaml", "yml",
            ],
        ),
        (
            "image",
            &[
                "png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "tif", "svg", "heic", "heif",
                "avif",
            ],
        ),
        (
            "code",
            &[
                "c", "h", "cpp", "cc", "cxx", "hpp", "hxx", "hh", "py", "pyw", "js", "mjs", "cjs",
                "jsx", "ts", "mts", "cts", "tsx", "rs", "go", "java", "kt", "kts", "rb", "swift",
                "cs", "php", "sh", "bash", "zsh", "fish", "pl", "pm", "lua", "r", "scala", "dart",
                "ex", "exs", "erl", "hs", "ml", "mli", "sql", "m", "mm", "zig", "nim", "v",
                "groovy", "ps1", "bat", "cmd",
            ],
        ),
    ];

    let lowered = path
        .extension()
        .and_then(|raw| raw.to_str())?
        .to_ascii_lowercase();

    EXTENSION_TABLE
        .iter()
        .find(|(_, extensions)| extensions.contains(&lowered.as_str()))
        .map(|&(label, _)| label)
}

/// Unit tests for format detection.
///
/// Covers the three stages of `detect_format` (file signature, extension,
/// JSON heuristic) and their relative priority, plus the failure path of
/// `detect_zip_format` for data that is not a ZIP archive.
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    #[test]
    fn test_detect_format_docx_by_extension() {
        let path = PathBuf::from("document.docx");
        assert_eq!(detect_format(&path, &[]), Some("docx"));
    }

    #[test]
    fn test_detect_format_pptx_by_extension() {
        let path = PathBuf::from("slides.pptx");
        assert_eq!(detect_format(&path, &[]), Some("pptx"));
    }

    #[test]
    fn test_detect_format_xlsx_by_extension() {
        let path = PathBuf::from("data.xlsx");
        assert_eq!(detect_format(&path, &[]), Some("xlsx"));
    }

    #[test]
    fn test_detect_format_xls_by_extension() {
        let path = PathBuf::from("legacy.xls");
        assert_eq!(detect_format(&path, &[]), Some("xls"));
    }

    #[test]
    fn test_detect_format_csv_by_extension() {
        let path = PathBuf::from("data.csv");
        assert_eq!(detect_format(&path, &[]), Some("csv"));
    }

    #[test]
    fn test_detect_format_json_by_extension() {
        let path = PathBuf::from("config.json");
        assert_eq!(detect_format(&path, &[]), Some("json"));
    }

    #[test]
    fn test_detect_format_xml_by_extension() {
        let path = PathBuf::from("feed.xml");
        assert_eq!(detect_format(&path, &[]), Some("xml"));
    }

    #[test]
    fn test_detect_format_txt_by_extension() {
        let path = PathBuf::from("readme.txt");
        assert_eq!(detect_format(&path, &[]), Some("txt"));
    }

    #[test]
    fn test_detect_format_text_by_extension() {
        let path = PathBuf::from("notes.text");
        assert_eq!(detect_format(&path, &[]), Some("txt"));
    }

    #[test]
    fn test_detect_format_text_variants() {
        for ext in &[
            "log", "md", "markdown", "rst", "ini", "cfg", "conf", "toml", "yaml", "yml",
        ] {
            let path = PathBuf::from(format!("file.{}", ext));
            assert_eq!(
                detect_format(&path, &[]),
                Some("txt"),
                "expected 'txt' for .{}",
                ext
            );
        }
    }

    #[test]
    fn test_detect_format_pdf_by_extension() {
        let path = PathBuf::from("paper.pdf");
        assert_eq!(detect_format(&path, &[]), Some("pdf"));
    }

    #[test]
    fn test_detect_format_html_by_extension() {
        let path = PathBuf::from("page.html");
        assert_eq!(detect_format(&path, &[]), Some("html"));
        let path2 = PathBuf::from("page.htm");
        assert_eq!(detect_format(&path2, &[]), Some("html"));
    }

    #[test]
    fn test_detect_format_extension_is_case_insensitive() {
        // Extensions are lowercased before lookup, so mixed case must still match.
        let upper = PathBuf::from("REPORT.PDF");
        assert_eq!(detect_format(&upper, &[]), Some("pdf"));
        let mixed = PathBuf::from("Photo.JpG");
        assert_eq!(detect_format(&mixed, &[]), Some("image"));
    }

    #[test]
    fn test_detect_format_unknown_returns_none() {
        let path = PathBuf::from("file.xyz");
        assert_eq!(detect_format(&path, &[]), None);
    }

    #[test]
    fn test_detect_format_no_extension_returns_none() {
        let path = PathBuf::from("Makefile");
        assert_eq!(detect_format(&path, &[]), None);
    }

    #[test]
    fn test_detect_format_zip_magic_bytes_override_extension() {
        let path = PathBuf::from("data.csv");
        let zip_header = [0x50, 0x4B, 0x03, 0x04];
        // ZIP magic bytes should win over .csv extension
        assert_eq!(detect_format(&path, &zip_header), Some("zip"));
    }

    #[test]
    fn test_detect_format_pdf_magic_bytes_override_extension() {
        let path = PathBuf::from("file.txt");
        let pdf_header = b"%PDF-1.7";
        assert_eq!(detect_format(&path, pdf_header), Some("pdf"));
    }

    #[test]
    fn test_detect_format_truncated_magic_falls_back_to_extension() {
        // Only the first two bytes of the ZIP signature are present, so the
        // signature check must not fire and the extension decides.
        let path = PathBuf::from("data.csv");
        let partial_zip = [0x50, 0x4B];
        assert_eq!(detect_format(&path, &partial_zip), Some("csv"));
    }

    #[test]
    fn test_detect_format_json_heuristic_object() {
        let path = PathBuf::from("data.bin");
        let json_bytes = b"  { \"key\": \"value\" }";
        assert_eq!(detect_format(&path, json_bytes), Some("json"));
    }

    #[test]
    fn test_detect_format_json_heuristic_array() {
        let path = PathBuf::from("data.bin");
        let json_bytes = b"[1, 2, 3]";
        assert_eq!(detect_format(&path, json_bytes), Some("json"));
    }

    #[test]
    fn test_detect_format_txt_starting_with_brace_returns_txt() {
        let path = PathBuf::from("notes.txt");
        let content = b"{ this is just a text file }";
        assert_eq!(detect_format(&path, content), Some("txt"));
    }

    #[test]
    fn test_detect_format_csv_starting_with_bracket_returns_csv() {
        let path = PathBuf::from("data.csv");
        let content = b"[header1],header2\nval1,val2";
        assert_eq!(detect_format(&path, content), Some("csv"));
    }

    #[test]
    fn test_detect_format_unknown_ext_with_json_content_returns_json() {
        let path = PathBuf::from("data.dat");
        let content = b"{ \"key\": \"value\" }";
        assert_eq!(detect_format(&path, content), Some("json"));
    }

    #[test]
    fn test_detect_format_unknown_ext_with_json_utf8_bom_returns_json() {
        let path = PathBuf::from("data.dat");
        let mut content = vec![0xEF, 0xBB, 0xBF];
        content.extend_from_slice(b"{\"key\":\"value\"}");
        assert_eq!(detect_format(&path, &content), Some("json"));
    }

    #[test]
    fn test_detect_format_unknown_ext_with_bom_and_whitespace_returns_json() {
        // Whitespace after the BOM is skipped as well.
        let path = PathBuf::from("data.dat");
        let mut content = vec![0xEF, 0xBB, 0xBF];
        content.extend_from_slice(b" \n\t[1, 2]");
        assert_eq!(detect_format(&path, &content), Some("json"));
    }

    #[test]
    fn test_detect_format_unknown_ext_with_non_json_content_returns_none() {
        let path = PathBuf::from("data.dat");
        assert_eq!(detect_format(&path, b"hello world"), None);
        // Whitespace only: there is no first visible byte to inspect.
        assert_eq!(detect_format(&path, b"  \n\t "), None);
    }

    #[test]
    fn test_detect_format_png_by_extension() {
        let path = PathBuf::from("photo.png");
        assert_eq!(detect_format(&path, &[]), Some("image"));
    }

    #[test]
    fn test_detect_format_jpg_by_extension() {
        let path = PathBuf::from("photo.jpg");
        assert_eq!(detect_format(&path, &[]), Some("image"));
    }

    #[test]
    fn test_detect_format_jpeg_by_extension() {
        let path = PathBuf::from("photo.jpeg");
        assert_eq!(detect_format(&path, &[]), Some("image"));
    }

    #[test]
    fn test_detect_format_svg_by_extension() {
        let path = PathBuf::from("icon.svg");
        assert_eq!(detect_format(&path, &[]), Some("image"));
    }

    #[test]
    fn test_detect_format_image_variants() {
        for ext in &[
            "png", "jpg", "jpeg", "gif", "webp", "bmp", "tiff", "tif", "svg", "heic", "heif",
            "avif",
        ] {
            let path = PathBuf::from(format!("file.{}", ext));
            assert_eq!(
                detect_format(&path, &[]),
                Some("image"),
                "expected 'image' for .{}",
                ext
            );
        }
    }

    #[test]
    fn test_detect_format_ipynb_by_extension() {
        let path = PathBuf::from("notebook.ipynb");
        assert_eq!(detect_format(&path, &[]), Some("ipynb"));
    }

    #[test]
    fn test_detect_format_ipynb_not_caught_by_json_heuristic() {
        // .ipynb extension should map to "ipynb", not fall through to JSON heuristic
        let path = PathBuf::from("notebook.ipynb");
        let content = b"{ \"cells\": [] }";
        assert_eq!(detect_format(&path, content), Some("ipynb"));
    }

    #[test]
    fn test_detect_format_code_variants() {
        let code_extensions = [
            "c", "h", "cpp", "cc", "cxx", "hpp", "hxx", "hh", "py", "pyw", "js", "mjs", "cjs",
            "jsx", "ts", "mts", "cts", "tsx", "rs", "go", "java", "kt", "kts", "rb", "swift", "cs",
            "php", "sh", "bash", "zsh", "fish", "pl", "pm", "lua", "r", "scala", "dart", "ex",
            "exs", "erl", "hs", "ml", "mli", "sql", "m", "mm", "zig", "nim", "v", "groovy", "ps1",
            "bat", "cmd",
        ];
        for ext in &code_extensions {
            let path = PathBuf::from(format!("file.{}", ext));
            assert_eq!(
                detect_format(&path, &[]),
                Some("code"),
                "expected 'code' for .{}",
                ext
            );
        }
    }

    #[test]
    fn test_detect_zip_format_non_zip_data_returns_none() {
        // Data that cannot be opened as an archive must yield None, not panic.
        assert_eq!(detect_zip_format(&[]), None);
        assert_eq!(detect_zip_format(b"this is definitely not a zip archive"), None);
    }

    #[test]
    fn test_detect_zip_format_bare_signature_returns_none() {
        // A ZIP signature without any archive structure behind it is not a valid archive.
        assert_eq!(detect_zip_format(&[0x50, 0x4B, 0x03, 0x04]), None);
    }
}