// anytomd 1.2.2 - Docs.rs

#![warn(missing_docs)]
//! # anytomd
//!
//! A pure Rust library that converts various document formats into Markdown,
//! designed for LLM consumption.
//!
//! # Supported Formats
//!
//! | Format | Extensions |
//! |--------|-----------|
//! | DOCX | `.docx` |
//! | PPTX | `.pptx` |
//! | XLSX | `.xlsx` |
//! | XLS | `.xls` |
//! | HTML | `.html`, `.htm` |
//! | CSV | `.csv` |
//! | Jupyter Notebook | `.ipynb` |
//! | JSON | `.json` |
//! | XML | `.xml` |
//! | Images | `.png`, `.jpg`, `.gif`, `.webp`, `.bmp`, `.tiff`, `.svg`, `.heic`, `.avif` |
//! | Code | `.py`, `.rs`, `.js`, `.ts`, `.c`, `.cpp`, `.go`, `.java`, and more |
//! | Plain Text | `.txt`, `.md`, `.rst`, `.log`, `.toml`, `.yaml`, `.ini`, etc. |
//!
//! # Quick Start
//!
//! ```no_run
//! use anytomd::{convert_bytes, ConversionOptions};
//!
//! // Convert raw bytes with an explicit format
//! let options = ConversionOptions::default();
//! let csv_data = b"Name,Age\nAlice,30\nBob,25";
//! let result = convert_bytes(csv_data, "csv", &options).unwrap();
//! println!("{}", result.markdown);
//! ```
//!
//! On native targets, file-based conversion is also available:
//!
//! ```no_run
//! # #[cfg(not(target_arch = "wasm32"))]
//! # {
//! use anytomd::{convert_file, ConversionOptions};
//!
//! let options = ConversionOptions::default();
//! let result = convert_file("document.docx", &options).unwrap();
//! println!("{}", result.markdown);
//! # }
//! ```
//!
//! # Feature Flags
//!
//! | Feature | Description |
//! |---------|-------------|
//! | `async` | Async API: `convert_file_async`, `convert_bytes_async`, `AsyncImageDescriber` |
//! | `async-gemini` | `AsyncGeminiDescriber` for concurrent Gemini API calls |
//! | `wasm` | WebAssembly bindings via `wasm-bindgen` (`convertBytes`, `convertBytesWithOptions`) |
//! | `wasm` + `async-gemini` | Adds `convertBytesWithGemini` for async Gemini-powered conversion in WASM |

pub mod converter;
pub mod detection;
pub mod error;
pub mod markdown;
pub(crate) mod zip_utils;

#[cfg(feature = "async")]
pub use converter::{AsyncConversionOptions, AsyncImageDescriber};
pub use converter::{
    ConversionOptions, ConversionResult, ConversionWarning, Converter, ImageDescriber, WarningCode,
};
pub use error::ConvertError;

/// Image description providers backed by Google Gemini.
///
/// Re-exports `AsyncGeminiDescriber` when the `async-gemini` feature is
/// enabled, and the synchronous `GeminiDescriber` on every target except
/// `wasm32` (the sync describer is native-only).
pub mod gemini {
    // Async provider: gated on the `async-gemini` feature only.
    #[cfg(feature = "async-gemini")]
    pub use crate::converter::gemini::AsyncGeminiDescriber;

    // Sync provider: compiled out on `wasm32`.
    #[cfg(not(target_arch = "wasm32"))]
    pub use crate::converter::gemini::GeminiDescriber;
}

#[cfg(feature = "wasm")]
mod wasm;

#[cfg(not(target_arch = "wasm32"))]
use std::path::Path;

/// Convert a file at the given path to Markdown.
///
/// The format is auto-detected from magic bytes and file extension.
///
/// Not available on WASM targets — use [`convert_bytes`] instead.
///
/// # Errors
///
/// - [`ConvertError::InputTooLarge`] if the file is larger than
///   `options.max_input_bytes` (checked both before and after reading).
/// - [`ConvertError::UnsupportedFormat`] if the file has ZIP magic bytes but
///   is not recognised as one of the supported ZIP-based formats.
/// - Any I/O error raised while reading metadata or file contents.
/// - Any error returned by [`convert_bytes`] for the detected format.
#[cfg(not(target_arch = "wasm32"))]
pub fn convert_file(
    path: impl AsRef<Path>,
    options: &ConversionOptions,
) -> Result<ConversionResult, ConvertError> {
    let path = path.as_ref();

    // `Metadata::len` is a `u64`. A plain `as usize` cast would silently
    // truncate on 32-bit targets and could let an oversized file slip past
    // the limit, so saturate to `usize::MAX` instead.
    let size = usize::try_from(std::fs::metadata(path)?.len()).unwrap_or(usize::MAX);
    if size > options.max_input_bytes {
        return Err(ConvertError::InputTooLarge {
            size,
            limit: options.max_input_bytes,
        });
    }

    let data = std::fs::read(path)?;

    // Re-check against the bytes actually read: the metadata size is only a
    // cheap pre-filter and need not match what `read` returned.
    if data.len() > options.max_input_bytes {
        return Err(ConvertError::InputTooLarge {
            size: data.len(),
            limit: options.max_input_bytes,
        });
    }

    let format = detection::detect_format(path, &data);

    // For ZIP-based formats, introspect to find the specific type
    let (format, is_zip_magic) = match format {
        Some("zip") => (detection::detect_zip_format(&data), true),
        other => (other, false),
    };

    let extension = match format {
        // Code files: pass through the original extension for language detection
        Some("code") => path.extension().and_then(|e| e.to_str()).unwrap_or("code"),
        Some(fmt) => fmt,
        None if is_zip_magic => {
            // ZIP magic bytes detected but not a known OOXML format — reject
            return Err(ConvertError::UnsupportedFormat {
                extension: "zip".to_string(),
            });
        }
        // Nothing detected: fall back to the raw file extension (possibly
        // empty) and let `convert_bytes` decide whether it is supported.
        None => path.extension().and_then(|e| e.to_str()).unwrap_or(""),
    };

    convert_bytes(&data, extension, options)
}

/// Convert raw bytes to Markdown with an explicit format extension.
///
/// `extension` is normalized before dispatch (surrounding whitespace and
/// leading dots are removed, ASCII letters are lowercased), so `"csv"`,
/// `".CSV"` and `" csv "` are treated alike.
///
/// # Errors
///
/// - [`ConvertError::InputTooLarge`] if `data` exceeds `options.max_input_bytes`.
/// - [`ConvertError::FormatNotSupported`] for `pdf`, which is rejected on purpose.
/// - [`ConvertError::UnsupportedFormat`] if no converter accepts the input.
/// - [`ConvertError::MalformedDocument`] if `options.strict` is set and the
///   conversion produced at least one warning.
/// - Any error returned by the selected converter.
pub fn convert_bytes(
    data: &[u8],
    extension: &str,
    options: &ConversionOptions,
) -> Result<ConversionResult, ConvertError> {
    use converter::{
        code::CodeConverter, csv::CsvConverter, docx::DocxConverter, html::HtmlConverter,
        image::ImageConverter, ipynb::IpynbConverter, json::JsonConverter,
        plain_text::PlainTextConverter, pptx::PptxConverter, xlsx::XlsxConverter,
        xml::XmlConverter,
    };

    let input_len = data.len();
    if input_len > options.max_input_bytes {
        return Err(ConvertError::InputTooLarge {
            size: input_len,
            limit: options.max_input_bytes,
        });
    }

    let ext = normalize_extension(extension);

    if ext == "pdf" {
        return Err(ConvertError::FormatNotSupported {
            extension: "pdf".to_string(),
            reason: "PDF is intentionally unsupported — Gemini, ChatGPT, and Claude \
                     handle PDF natively"
                .to_string(),
        });
    }

    // Source code is dispatched ahead of the generic registry: the code
    // converter needs the extension to pick a language, and the `Converter`
    // trait's `convert()` is not given the extension.
    let code = CodeConverter;
    if code.can_convert(&ext, data) {
        let converted = code.convert_with_extension(data, &ext, options)?;
        return enforce_strict_mode(converted, options.strict);
    }

    // Order matters: the first converter that accepts the input wins.
    let registry: [&dyn Converter; 10] = [
        &DocxConverter,
        &PptxConverter,
        &XlsxConverter,
        &IpynbConverter,
        &JsonConverter,
        &XmlConverter,
        &CsvConverter,
        &HtmlConverter,
        &ImageConverter,
        &PlainTextConverter,
    ];

    match registry.iter().find(|conv| conv.can_convert(&ext, data)) {
        Some(conv) => {
            let converted = conv.convert(data, options)?;
            enforce_strict_mode(converted, options.strict)
        }
        None => Err(ConvertError::UnsupportedFormat { extension: ext }),
    }
}

/// Apply strict-mode semantics to a finished conversion.
///
/// When `strict` is `false`, or the result carries no warnings, the result is
/// returned unchanged. Otherwise the first warning is escalated into a
/// [`ConvertError::MalformedDocument`] whose reason contains the warning's
/// code, its message and, if present, its location in parentheses.
fn enforce_strict_mode(
    result: ConversionResult,
    strict: bool,
) -> Result<ConversionResult, ConvertError> {
    if !strict {
        return Ok(result);
    }

    // Only the first warning is reported; any later ones are discarded
    // together with the result.
    let Some(warning) = result.warnings.first() else {
        return Ok(result);
    };

    let location_suffix = match warning.location.as_deref() {
        Some(l) => format!(" ({l})"),
        None => String::new(),
    };
    let reason = format!(
        "strict mode: encountered warning [{:?}] {}{}",
        warning.code, warning.message, location_suffix
    );

    Err(ConvertError::MalformedDocument { reason })
}

/// Canonicalize a user-supplied format extension for converter lookup.
///
/// Surrounding whitespace is trimmed first, then every leading `.` is
/// removed, and finally ASCII letters are lowercased. Non-ASCII characters
/// and interior dots are left untouched.
fn normalize_extension(extension: &str) -> String {
    let trimmed = extension.trim();
    let without_leading_dots = trimmed.trim_start_matches('.');
    without_leading_dots.to_ascii_lowercase()
}

/// Convert a file at the given path to Markdown with async image description.
///
/// The format is auto-detected from magic bytes and file extension.
/// If an `async_image_describer` is set, all image descriptions are resolved
/// concurrently. The caller provides the async runtime.
///
/// Note that the file itself is read with blocking `std::fs` calls; only the
/// conversion step (via [`convert_bytes_async`]) is awaited.
///
/// Requires the `async` feature. Not available on WASM targets — use
/// [`convert_bytes_async`] instead.
///
/// # Errors
///
/// - [`ConvertError::InputTooLarge`] if the file is larger than
///   `options.base.max_input_bytes` (checked both before and after reading).
/// - [`ConvertError::UnsupportedFormat`] if the file has ZIP magic bytes but
///   is not recognised as one of the supported ZIP-based formats.
/// - Any I/O error raised while reading metadata or file contents.
/// - Any error returned by [`convert_bytes_async`] for the detected format.
#[cfg(feature = "async")]
#[cfg(not(target_arch = "wasm32"))]
pub async fn convert_file_async(
    path: impl AsRef<Path>,
    options: &converter::AsyncConversionOptions,
) -> Result<ConversionResult, ConvertError> {
    let path = path.as_ref();

    // `Metadata::len` is a `u64`. A plain `as usize` cast would silently
    // truncate on 32-bit targets and could let an oversized file slip past
    // the limit, so saturate to `usize::MAX` instead.
    let size = usize::try_from(std::fs::metadata(path)?.len()).unwrap_or(usize::MAX);
    if size > options.base.max_input_bytes {
        return Err(ConvertError::InputTooLarge {
            size,
            limit: options.base.max_input_bytes,
        });
    }

    let data = std::fs::read(path)?;

    // Re-check against the bytes actually read: the metadata size is only a
    // cheap pre-filter and need not match what `read` returned.
    if data.len() > options.base.max_input_bytes {
        return Err(ConvertError::InputTooLarge {
            size: data.len(),
            limit: options.base.max_input_bytes,
        });
    }

    let format = detection::detect_format(path, &data);

    // For ZIP-based formats, introspect to find the specific type
    let (format, is_zip_magic) = match format {
        Some("zip") => (detection::detect_zip_format(&data), true),
        other => (other, false),
    };

    let extension = match format {
        // Code files: pass through the original extension for language detection
        Some("code") => path.extension().and_then(|e| e.to_str()).unwrap_or("code"),
        Some(fmt) => fmt,
        None if is_zip_magic => {
            // ZIP magic bytes detected but no specific format identified — reject
            return Err(ConvertError::UnsupportedFormat {
                extension: "zip".to_string(),
            });
        }
        // Nothing detected: fall back to the raw file extension (possibly
        // empty) and let `convert_bytes_async` decide whether it is supported.
        None => path.extension().and_then(|e| e.to_str()).unwrap_or(""),
    };

    convert_bytes_async(&data, extension, options).await
}

/// Convert raw bytes to Markdown with async image description.
///
/// When `options.async_image_describer` is set and the normalized extension
/// is `docx`, `pptx`, `xlsx`, `xls`, or anything the image converter accepts,
/// the document is parsed with that converter's `convert_inner()` and any
/// pending image placeholders are then resolved through the async describer.
/// Every other case — including "no async describer configured" — is
/// delegated to the synchronous [`convert_bytes`].
///
/// Requires the `async` feature.
///
/// # Errors
///
/// - [`ConvertError::InputTooLarge`] if `data` exceeds `options.base.max_input_bytes`.
/// - [`ConvertError::FormatNotSupported`] for `pdf`, which is rejected on purpose.
/// - [`ConvertError::MalformedDocument`] if `options.base.strict` is set and
///   the conversion produced at least one warning.
/// - Any error returned by the selected converter or by [`convert_bytes`].
#[cfg(feature = "async")]
pub async fn convert_bytes_async(
    data: &[u8],
    extension: &str,
    options: &converter::AsyncConversionOptions,
) -> Result<ConversionResult, ConvertError> {
    let ext = normalize_extension(extension);

    let input_len = data.len();
    if input_len > options.base.max_input_bytes {
        return Err(ConvertError::InputTooLarge {
            size: input_len,
            limit: options.base.max_input_bytes,
        });
    }

    if ext == "pdf" {
        return Err(ConvertError::FormatNotSupported {
            extension: "pdf".to_string(),
            reason: "PDF is intentionally unsupported — Gemini, ChatGPT, and Claude \
                     handle PDF natively"
                .to_string(),
        });
    }

    if let Some(describer) = &options.async_image_describer {
        // Shared body for every image-bearing format: parse synchronously,
        // resolve pending image placeholders through the async describer,
        // then apply strict mode and return from the enclosing function.
        // A local macro is used so each converter keeps its own
        // `convert_inner` signature.
        macro_rules! convert_and_describe {
            ($conv:expr) => {{
                let (mut converted, pending) = $conv.convert_inner(data, &options.base)?;
                if !pending.infos.is_empty() {
                    converter::ooxml_utils::resolve_image_placeholders_async(
                        &mut converted.markdown,
                        &mut converted.plain_text,
                        &pending.infos,
                        &pending.bytes,
                        describer.as_ref(),
                        &mut converted.warnings,
                    )
                    .await;
                }
                return enforce_strict_mode(converted, options.base.strict);
            }};
        }

        match ext.as_str() {
            "docx" => convert_and_describe!(converter::docx::DocxConverter),
            "pptx" => convert_and_describe!(converter::pptx::PptxConverter),
            "xlsx" | "xls" => convert_and_describe!(converter::xlsx::XlsxConverter),
            image_ext if converter::image::ImageConverter.can_convert(image_ext, data) => {
                convert_and_describe!(converter::image::ImageConverter)
            }
            // Not an image-bearing format: handled by the sync path below.
            _ => {}
        }
    }

    // Fallback: use sync convert for non-image formats or when no async describer
    convert_bytes(data, &ext, &options.base)
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Scratch directory for file-based tests.
    ///
    /// The directory name includes the current process id so concurrent test
    /// processes do not share a path, and the directory is removed on drop so
    /// cleanup also happens when an assertion fails part-way through a test.
    #[cfg(not(target_arch = "wasm32"))]
    struct ScratchDir(std::path::PathBuf);

    #[cfg(not(target_arch = "wasm32"))]
    impl ScratchDir {
        /// Create (or reuse) a scratch directory named `<label>_<pid>` under
        /// the system temp dir. Each test passes its own label.
        fn new(label: &str) -> Self {
            let dir = std::env::temp_dir().join(format!("{label}_{}", std::process::id()));
            std::fs::create_dir_all(&dir).unwrap();
            Self(dir)
        }

        /// Write `contents` to `name` inside the scratch directory and return
        /// the full path of the new file.
        fn write_file(&self, name: &str, contents: &[u8]) -> std::path::PathBuf {
            let file_path = self.0.join(name);
            std::fs::write(&file_path, contents).unwrap();
            file_path
        }
    }

    #[cfg(not(target_arch = "wasm32"))]
    impl Drop for ScratchDir {
        fn drop(&mut self) {
            // Best-effort cleanup; a leftover temp dir must not fail a test.
            let _ = std::fs::remove_dir_all(&self.0);
        }
    }

    #[test]
    fn test_convert_bytes_input_too_large() {
        let data = vec![0u8; 1024];
        let options = ConversionOptions {
            max_input_bytes: 512,
            ..Default::default()
        };
        let err = convert_bytes(&data, "txt", &options).expect_err("expected InputTooLarge");
        assert!(
            format!("{err}").contains("input too large"),
            "error was: {err}"
        );
    }

    #[test]
    fn test_convert_bytes_at_exact_limit_succeeds() {
        let data = b"Hello, world!";
        let options = ConversionOptions {
            max_input_bytes: data.len(),
            ..Default::default()
        };
        let result = convert_bytes(data, "txt", &options);
        assert!(result.is_ok());
    }

    #[test]
    fn test_convert_bytes_strict_mode_escalates_warning() {
        // Non-UTF8 bytes trigger a recoverable decoding warning in txt converter.
        let data = b"caf\xe9";
        let options = ConversionOptions {
            strict: true,
            ..Default::default()
        };
        let err = convert_bytes(data, "txt", &options).expect_err("expected strict-mode error");
        assert!(
            format!("{err}").contains("strict mode"),
            "error should mention strict mode: {err}"
        );
    }

    #[test]
    fn test_convert_bytes_non_strict_keeps_warning() {
        let data = b"caf\xe9";
        let result = convert_bytes(data, "txt", &ConversionOptions::default()).unwrap();
        assert!(!result.warnings.is_empty(), "expected decoding warning");
    }

    #[test]
    fn test_convert_bytes_extension_case_insensitive() {
        // Also exercises trimming of surrounding whitespace.
        let result = convert_bytes(b"hello world", " TXT ", &ConversionOptions::default()).unwrap();
        assert!(result.markdown.contains("hello world"));
    }

    #[test]
    fn test_convert_bytes_extension_with_leading_dot() {
        let result = convert_bytes(b"hello world", ".txt", &ConversionOptions::default()).unwrap();
        assert!(result.markdown.contains("hello world"));
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_non_ooxml_zip_returns_unsupported() {
        // Create a minimal valid ZIP file that is NOT an OOXML format
        let mut buf = std::io::Cursor::new(Vec::new());
        {
            let mut zip_writer = zip::ZipWriter::new(&mut buf);
            let options = zip::write::SimpleFileOptions::default();
            zip_writer.start_file("hello.txt", options).unwrap();
            std::io::Write::write_all(&mut zip_writer, b"hello world").unwrap();
            zip_writer.finish().unwrap();
        }
        let zip_data = buf.into_inner();

        // Write to a temp file with .txt extension so the extension alone
        // would misroute it to the plain-text converter.
        let scratch = ScratchDir::new("anytomd_test_zip_misroute");
        let file_path = scratch.write_file("archive.txt", &zip_data);

        let err = convert_file(&file_path, &ConversionOptions::default())
            .expect_err("expected UnsupportedFormat error");
        assert!(
            format!("{err}").contains("unsupported format"),
            "error was: {err}"
        );
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_input_too_large() {
        // Self-contained input: the test never silently passes because an
        // external fixture happens to be missing.
        let contents: &[u8] = b"Name,Age\nAlice,30\nBob,25\n";
        let scratch = ScratchDir::new("anytomd_test_file_too_large");
        let file_path = scratch.write_file("sample.csv", contents);

        let options = ConversionOptions {
            // One byte below the file size.
            max_input_bytes: contents.len() - 1,
            ..Default::default()
        };
        let err = convert_file(&file_path, &options).expect_err("expected InputTooLarge");
        assert!(
            format!("{err}").contains("input too large"),
            "error was: {err}"
        );
    }

    #[test]
    fn test_convert_bytes_pdf_returns_descriptive_error() {
        let data = b"%PDF-1.7 fake pdf content";
        let options = ConversionOptions::default();
        let err = convert_bytes(data, "pdf", &options).expect_err("expected PDF rejection");
        let msg = format!("{err}");
        assert!(msg.contains("pdf"), "error should mention pdf: {msg}");
        assert!(
            msg.contains("intentionally unsupported"),
            "error should explain why: {msg}"
        );
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_pdf_returns_descriptive_error() {
        // Create a temp file with .pdf extension and PDF magic bytes
        let scratch = ScratchDir::new("anytomd_test_pdf_error");
        let file_path = scratch.write_file("test.pdf", b"%PDF-1.7 fake");

        let err = convert_file(&file_path, &ConversionOptions::default())
            .expect_err("expected PDF rejection");
        let msg = format!("{err}");
        assert!(
            msg.contains("intentionally unsupported"),
            "error should explain why: {msg}"
        );
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_unknown_ext_json_with_long_leading_whitespace() {
        // 40 spaces of padding before the JSON payload, under an extension
        // that says nothing about the format.
        let mut data = vec![b' '; 40];
        data.extend_from_slice(br#"{"k":1}"#);
        let scratch = ScratchDir::new("anytomd_test_json_whitespace_detect");
        let file_path = scratch.write_file("payload.dat", &data);

        let result = convert_file(&file_path, &ConversionOptions::default()).unwrap();
        assert!(
            result.markdown.contains("\"k\""),
            "expected JSON conversion, markdown was: {}",
            result.markdown
        );
    }

    #[cfg(not(target_arch = "wasm32"))]
    #[test]
    fn test_convert_file_unknown_ext_json_with_utf8_bom() {
        // UTF-8 byte-order mark followed directly by the JSON payload.
        let mut data = vec![0xEF, 0xBB, 0xBF];
        data.extend_from_slice(br#"{"k":1}"#);
        let scratch = ScratchDir::new("anytomd_test_json_bom_detect");
        let file_path = scratch.write_file("payload.dat", &data);

        let result = convert_file(&file_path, &ConversionOptions::default()).unwrap();
        assert!(
            result.markdown.contains("\"k\""),
            "expected JSON conversion, markdown was: {}",
            result.markdown
        );
    }
}