anytomd 1.2.2

Pure Rust library that converts various document formats into Markdown
Documentation
#![cfg(not(target_arch = "wasm32"))]

mod common;

use anytomd::{ConversionOptions, convert_file};
use common::normalize;
use std::io::Cursor;

/// Content coverage test: verify key elements are present in the converted output.
///
/// Fixture: tests/fixtures/sample.docx
/// Contains: H1 "Sample Document", body paragraph, H2 "Section One",
///           paragraph with hyperlink to example.com, Korean text, emoji, H3 "Subsection".
#[test]
fn test_docx_convert_file_sample() {
    let result = convert_file("tests/fixtures/sample.docx", &ConversionOptions::default()).unwrap();

    // Title should be extracted from first H1
    assert_eq!(result.title, Some("Sample Document".to_string()));

    // Headings
    assert!(result.markdown.contains("# Sample Document"));
    assert!(result.markdown.contains("## Section One"));
    assert!(result.markdown.contains("### Subsection"));

    // Body paragraphs
    assert!(result.markdown.contains("This is a simple paragraph."));
    assert!(
        result
            .markdown
            .contains("Final paragraph with mixed content.")
    );

    // Hyperlink
    assert!(result.markdown.contains("[Example](https://example.com)"));

    // Unicode: Korean
    assert!(result.markdown.contains("한국어 테스트"));

    // Emoji
    assert!(result.markdown.contains("🚀"));
    assert!(result.markdown.contains("✨"));
    assert!(result.markdown.contains("🌍"));
}

/// Golden test: normalized comparison against expected output file.
#[test]
fn test_docx_golden_sample() {
    let result = convert_file("tests/fixtures/sample.docx", &ConversionOptions::default()).unwrap();
    let expected = include_str!("fixtures/expected/sample.docx.md");
    assert_eq!(normalize(&result.markdown), normalize(expected));
}

/// Direct convert_bytes test: verify the converter works with explicit extension.
#[test]
fn test_docx_convert_bytes_direct() {
    let data = std::fs::read("tests/fixtures/sample.docx").unwrap();
    let result = anytomd::convert_bytes(&data, "docx", &ConversionOptions::default()).unwrap();
    assert!(result.markdown.contains("# Sample Document"));
    assert!(result.markdown.contains("한국어"));
}

/// Integration test: text box extraction via mc:AlternateContent > Fallback > w:pict.
///
/// Builds a DOCX in memory with a normal paragraph, a text box (via mc:AlternateContent),
/// and another normal paragraph. Verifies text box content is extracted and document flow
/// is preserved.
#[test]
fn test_docx_textbox_integration() {
    use std::io::Write;
    use zip::ZipWriter;
    use zip::write::SimpleFileOptions;

    let buf = Vec::new();
    let mut zip = ZipWriter::new(Cursor::new(buf));
    let opts = SimpleFileOptions::default();

    // [Content_Types].xml
    zip.start_file("[Content_Types].xml", opts).unwrap();
    zip.write_all(
        br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>"#,
    )
    .unwrap();

    // _rels/.rels
    zip.start_file("_rels/.rels", opts).unwrap();
    zip.write_all(
        br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#,
    )
    .unwrap();

    // word/document.xml — paragraph + text box via mc:AlternateContent + paragraph
    let document_xml = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:v="urn:schemas-microsoft-com:vml"><w:body><w:p><w:r><w:t>Normal paragraph before.</w:t></w:r></w:p><mc:AlternateContent><mc:Choice Requires="wps"><w:p><w:r><w:t>DrawingML (hidden)</w:t></w:r></w:p></mc:Choice><mc:Fallback><w:p><w:r><w:pict><v:shape><v:textbox><w:txbxContent><w:p><w:r><w:t>Text box content here</w:t></w:r></w:p></w:txbxContent></v:textbox></v:shape></w:pict></w:r></w:p></mc:Fallback></mc:AlternateContent><w:p><w:r><w:t>Normal paragraph after.</w:t></w:r></w:p></w:body></w:document>"#;
    zip.start_file("word/document.xml", opts).unwrap();
    zip.write_all(document_xml.as_bytes()).unwrap();

    let cursor = zip.finish().unwrap();
    let data = cursor.into_inner();

    let result = anytomd::convert_bytes(&data, "docx", &ConversionOptions::default()).unwrap();
    assert!(
        result.markdown.contains("Normal paragraph before."),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("Text box content here"),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("Normal paragraph after."),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        !result.markdown.contains("DrawingML (hidden)"),
        "Choice content should be skipped, markdown was: {}",
        result.markdown
    );
}

/// Build a DOCX fixture with text boxes for testing.
///
/// Contains:
/// - H1 "Text Box Test Document"
/// - Normal paragraph before text box
/// - mc:AlternateContent with text box in Fallback (bold + unicode + emoji)
/// - Normal paragraph after text box
/// - Direct w:pict text box (italic)
/// - Final paragraph
fn build_textbox_docx() -> Vec<u8> {
    use std::io::Write;
    use zip::ZipWriter;
    use zip::write::SimpleFileOptions;

    let buf = Vec::new();
    let mut zip = ZipWriter::new(Cursor::new(buf));
    let opts = SimpleFileOptions::default();

    zip.start_file("[Content_Types].xml", opts).unwrap();
    zip.write_all(
        br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>"#,
    ).unwrap();

    zip.start_file("_rels/.rels", opts).unwrap();
    zip.write_all(
        br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#,
    ).unwrap();

    let document_xml = concat!(
        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#,
        r#"<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main""#,
        r#" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships""#,
        r#" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006""#,
        r#" xmlns:v="urn:schemas-microsoft-com:vml""#,
        r#" xmlns:o="urn:schemas-microsoft-com:office:office">"#,
        r#"<w:body>"#,
        // H1
        r#"<w:p><w:pPr><w:pStyle w:val="Heading1"/></w:pPr><w:r><w:t>Text Box Test Document</w:t></w:r></w:p>"#,
        // Normal paragraph
        r#"<w:p><w:r><w:t xml:space="preserve">This paragraph comes before the text box.</w:t></w:r></w:p>"#,
        // mc:AlternateContent with text box in Fallback
        r#"<mc:AlternateContent>"#,
        r#"<mc:Choice Requires="wps"><w:p><w:r><w:t>DrawingML content that should be hidden</w:t></w:r></w:p></mc:Choice>"#,
        r#"<mc:Fallback><w:p><w:r><w:pict>"#,
        r#"<v:shape id="tb1" style="width:200pt;height:50pt"><v:textbox><w:txbxContent>"#,
        r#"<w:p><w:r><w:rPr><w:b/></w:rPr><w:t>Important notice in text box</w:t></w:r></w:p>"#,
        r#"<w:p><w:r><w:t xml:space="preserve">Second paragraph with 한국어 and 🚀 emoji.</w:t></w:r></w:p>"#,
        r#"</w:txbxContent></v:textbox></v:shape>"#,
        r#"</w:pict></w:r></w:p></mc:Fallback>"#,
        r#"</mc:AlternateContent>"#,
        // Normal paragraph after
        r#"<w:p><w:r><w:t xml:space="preserve">This paragraph comes after the text box.</w:t></w:r></w:p>"#,
        // Direct w:pict text box (italic)
        r#"<w:p><w:r><w:pict><v:rect style="width:150pt;height:30pt"><v:textbox><w:txbxContent>"#,
        r#"<w:p><w:r><w:rPr><w:i/></w:rPr><w:t>Direct VML text box content</w:t></w:r></w:p>"#,
        r#"</w:txbxContent></v:textbox></v:rect></w:pict></w:r></w:p>"#,
        // Final paragraph
        r#"<w:p><w:r><w:t>Final paragraph of the document.</w:t></w:r></w:p>"#,
        r#"</w:body></w:document>"#,
    );

    let styles_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:style w:type="paragraph" w:styleId="Heading1"><w:name w:val="heading 1"/></w:style></w:styles>"#;

    zip.start_file("word/document.xml", opts).unwrap();
    zip.write_all(document_xml.as_bytes()).unwrap();

    zip.start_file("word/styles.xml", opts).unwrap();
    zip.write_all(styles_xml.as_bytes()).unwrap();

    let cursor = zip.finish().unwrap();
    cursor.into_inner()
}

/// Integration test: text box extraction from a programmatic DOCX fixture.
///
/// Tests the full end-to-end pipeline via `convert_bytes` with a realistic DOCX
/// containing heading, normal paragraphs, mc:AlternateContent text boxes, direct
/// VML text boxes, bold/italic formatting, Unicode, and emoji.
#[test]
fn test_docx_textbox_convert_file() {
    let data = build_textbox_docx();
    let result = anytomd::convert_bytes(&data, "docx", &ConversionOptions::default()).unwrap();

    // Heading
    assert!(
        result.markdown.contains("# Text Box Test Document"),
        "markdown was: {}",
        result.markdown
    );

    // Normal paragraphs preserved
    assert!(
        result
            .markdown
            .contains("This paragraph comes before the text box."),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result
            .markdown
            .contains("This paragraph comes after the text box."),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("Final paragraph of the document."),
        "markdown was: {}",
        result.markdown
    );

    // Text box content (via mc:AlternateContent > Fallback)
    assert!(
        result.markdown.contains("**Important notice in text box**"),
        "bold text box content missing, markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("한국어"),
        "Korean text in text box missing, markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("🚀"),
        "emoji in text box missing, markdown was: {}",
        result.markdown
    );

    // Direct w:pict text box
    assert!(
        result.markdown.contains("*Direct VML text box content*"),
        "italic direct text box content missing, markdown was: {}",
        result.markdown
    );

    // Choice content must NOT appear
    assert!(
        !result
            .markdown
            .contains("DrawingML content that should be hidden"),
        "mc:Choice content should be hidden, markdown was: {}",
        result.markdown
    );

    // Title
    assert_eq!(result.title, Some("Text Box Test Document".to_string()),);
}