hwp2md 0.5.0

HWP/HWPX ↔ Markdown bidirectional converter
Documentation
use super::*;

// -----------------------------------------------------------------------
// Helper: unwrap the section and panic with a descriptive message on error.
// -----------------------------------------------------------------------

fn section(xml: &str) -> ir::Section {
    parse_section_xml(xml).expect("parse_section_xml must not fail")
}

// -----------------------------------------------------------------------
// parse_section_xml -- table
// -----------------------------------------------------------------------

#[test]
fn simple_table_two_rows_two_cols() {
    let xml = concat!(
        r#"<root><hp:tbl colCnt="2">"#,
        r#"<hp:tr><hp:tc><hp:p><hp:run><hp:t>A1</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"<hp:tc><hp:p><hp:run><hp:t>A2</hp:t></hp:run></hp:p></hp:tc></hp:tr>"#,
        r#"<hp:tr><hp:tc><hp:p><hp:run><hp:t>B1</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"<hp:tc><hp:p><hp:run><hp:t>B2</hp:t></hp:run></hp:p></hp:tc></hp:tr>"#,
        r#"</hp:tbl></root>"#,
    );
    let s = section(xml);
    assert_eq!(s.blocks.len(), 1);
    match &s.blocks[0] {
        ir::Block::Table { rows, col_count, .. } => {
            assert_eq!(*col_count, 2);
            assert_eq!(rows.len(), 2);
            // First row is always the header row.
            assert!(rows[0].is_header, "first row must be is_header=true");
            assert!(!rows[1].is_header, "second row must be is_header=false");
            // Verify cell text content.
            let text_of = |row: usize, col: usize| -> String {
                match &rows[row].cells[col].blocks[0] {
                    ir::Block::Paragraph { inlines } => inlines[0].text.clone(),
                    other => panic!("unexpected block {other:?}"),
                }
            };
            assert_eq!(text_of(0, 0), "A1");
            assert_eq!(text_of(0, 1), "A2");
            assert_eq!(text_of(1, 0), "B1");
            assert_eq!(text_of(1, 1), "B2");
        }
        other => panic!("expected Table, got {other:?}"),
    }
}

#[test]
fn table_col_count_from_colcnt_attribute() {
    // colCnt="3" but only 2 cells per row -- col_count must be max(3, 2) = 3.
    let xml = concat!(
        r#"<root><hp:tbl colCnt="3"><hp:tr>"#,
        r#"<hp:tc><hp:p><hp:run><hp:t>X</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"<hp:tc><hp:p><hp:run><hp:t>Y</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"</hp:tr></hp:tbl></root>"#,
    );
    let s = section(xml);
    match &s.blocks[0] {
        ir::Block::Table { col_count, .. } => {
            assert_eq!(*col_count, 3);
        }
        other => panic!("expected Table, got {other:?}"),
    }
}

#[test]
fn table_cell_colspan_from_celladdr() {
    // cellAddr is a self-closing child of tc that sets colspan/rowspan.
    let xml = concat!(
        r#"<root><hp:tbl colCnt="3"><hp:tr>"#,
        r#"<hp:tc><hp:cellAddr colSpan="2" rowSpan="1"/><hp:p><hp:run><hp:t>merged</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"<hp:tc><hp:p><hp:run><hp:t>single</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"</hp:tr></hp:tbl></root>"#,
    );
    let s = section(xml);
    match &s.blocks[0] {
        ir::Block::Table { rows, .. } => {
            let cells = &rows[0].cells;
            assert_eq!(cells[0].colspan, 2, "first cell must have colspan=2");
            assert_eq!(cells[0].rowspan, 1, "first cell must have rowspan=1");
            assert_eq!(
                cells[1].colspan, 1,
                "second cell must have default colspan=1"
            );
        }
        other => panic!("expected Table, got {other:?}"),
    }
}

#[test]
fn table_cell_rowspan_from_celladdr() {
    let xml = concat!(
        r#"<root><hp:tbl colCnt="1"><hp:tr>"#,
        r#"<hp:tc><hp:cellAddr colSpan="1" rowSpan="3"/><hp:p><hp:run><hp:t>tall</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"</hp:tr></hp:tbl></root>"#,
    );
    let s = section(xml);
    match &s.blocks[0] {
        ir::Block::Table { rows, .. } => {
            assert_eq!(rows[0].cells[0].rowspan, 3);
        }
        other => panic!("expected Table, got {other:?}"),
    }
}

#[test]
fn table_default_colspan_rowspan_without_celladdr() {
    // When no cellAddr is present the defaults must be colspan=1, rowspan=1.
    let xml = concat!(
        r#"<root><hp:tbl colCnt="1"><hp:tr>"#,
        r#"<hp:tc><hp:p><hp:run><hp:t>cell</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"</hp:tr></hp:tbl></root>"#,
    );
    let s = section(xml);
    match &s.blocks[0] {
        ir::Block::Table { rows, .. } => {
            assert_eq!(rows[0].cells[0].colspan, 1);
            assert_eq!(rows[0].cells[0].rowspan, 1);
        }
        other => panic!("expected Table, got {other:?}"),
    }
}

#[test]
fn nested_paragraph_inside_table_cell() {
    // Text inside a table cell must end up in cell blocks, not section blocks.
    let xml = concat!(
        r#"<root><hp:tbl colCnt="1"><hp:tr>"#,
        r#"<hp:tc><hp:p><hp:run><hp:t>cell content</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"</hp:tr></hp:tbl></root>"#,
    );
    let s = section(xml);
    assert_eq!(s.blocks.len(), 1);
    match &s.blocks[0] {
        ir::Block::Table { rows, .. } => {
            let cell = &rows[0].cells[0];
            assert_eq!(cell.blocks.len(), 1);
            match &cell.blocks[0] {
                ir::Block::Paragraph { inlines } => {
                    assert_eq!(inlines[0].text, "cell content");
                }
                other => panic!("expected Paragraph inside cell, got {other:?}"),
            }
        }
        other => panic!("expected Table, got {other:?}"),
    }
}

#[test]
fn image_inside_table_cell_goes_to_cell_blocks() {
    let xml = concat!(
        r#"<root><hp:tbl colCnt="1"><hp:tr>"#,
        r#"<hp:tc><hp:img src="fig.png" alt="figure"/></hp:tc>"#,
        r#"</hp:tr></hp:tbl></root>"#,
    );
    let s = section(xml);
    match &s.blocks[0] {
        ir::Block::Table { rows, .. } => {
            let cell = &rows[0].cells[0];
            assert_eq!(cell.blocks.len(), 1);
            match &cell.blocks[0] {
                ir::Block::Image { src, alt } => {
                    assert_eq!(src, "fig.png");
                    assert_eq!(alt, "figure");
                }
                other => panic!("expected Image inside cell, got {other:?}"),
            }
        }
        other => panic!("expected Table, got {other:?}"),
    }
}

#[test]
fn colspan_zero_defaults_to_one() {
    let xml = concat!(
        r#"<root><hp:tbl colCnt="1"><hp:tr>"#,
        r#"<hp:tc><hp:cellAddr colSpan="0" rowSpan="0"/><hp:p><hp:run><hp:t>x</hp:t></hp:run></hp:p></hp:tc>"#,
        r#"</hp:tr></hp:tbl></root>"#,
    );
    let s = section(xml);
    match &s.blocks[0] {
        ir::Block::Table { rows, .. } => {
            assert_eq!(rows[0].cells[0].colspan, 1, "colSpan=0 must default to 1");
            assert_eq!(rows[0].cells[0].rowspan, 1, "rowSpan=0 must default to 1");
        }
        other => panic!("expected Table, got {other:?}"),
    }
}

// -----------------------------------------------------------------------
// parse_section_xml -- list
// -----------------------------------------------------------------------

#[test]
fn ordered_list_without_li_produces_no_block() {
    // The current parser recognises <ol>/<ul> open/close but has no <li>
    // handler, so items will be empty.  The block is only pushed when
    // list_items is non-empty, so an empty ol must produce no block.
    // This test documents the current behaviour explicitly.
    let xml = r"<root><ol></ol></root>";
    let s = section(xml);
    assert!(
        s.blocks.is_empty(),
        "empty <ol> without <li> children must produce no block (current behaviour)"
    );
}

// -----------------------------------------------------------------------
// guess_mime_from_name -- all extensions
// -----------------------------------------------------------------------

#[test]
fn guess_mime_png() {
    assert_eq!(guess_mime_from_name("image.png"), "image/png");
}

#[test]
fn guess_mime_jpg() {
    assert_eq!(guess_mime_from_name("photo.jpg"), "image/jpeg");
}

#[test]
fn guess_mime_jpeg() {
    assert_eq!(guess_mime_from_name("photo.jpeg"), "image/jpeg");
}

#[test]
fn guess_mime_gif() {
    assert_eq!(guess_mime_from_name("anim.gif"), "image/gif");
}

#[test]
fn guess_mime_bmp() {
    assert_eq!(guess_mime_from_name("bitmap.bmp"), "image/bmp");
}

#[test]
fn guess_mime_svg() {
    assert_eq!(guess_mime_from_name("vector.svg"), "image/svg+xml");
}

#[test]
fn guess_mime_wmf() {
    assert_eq!(guess_mime_from_name("metafile.wmf"), "image/x-wmf");
}

#[test]
fn guess_mime_emf() {
    assert_eq!(guess_mime_from_name("enhanced.emf"), "image/x-emf");
}

#[test]
fn guess_mime_unknown_extension_falls_back_to_octet_stream() {
    assert_eq!(
        guess_mime_from_name("archive.xyz"),
        "application/octet-stream"
    );
}

#[test]
fn guess_mime_no_extension_falls_back_to_octet_stream() {
    assert_eq!(
        guess_mime_from_name("nodotfile"),
        "application/octet-stream"
    );
}

#[test]
fn guess_mime_case_insensitive_uppercase_png() {
    assert_eq!(guess_mime_from_name("PHOTO.PNG"), "image/png");
}

#[test]
fn guess_mime_case_insensitive_mixed_jpg() {
    assert_eq!(guess_mime_from_name("Photo.Jpg"), "image/jpeg");
}

#[test]
fn guess_mime_case_insensitive_svg() {
    assert_eq!(guess_mime_from_name("LOGO.SVG"), "image/svg+xml");
}

// -----------------------------------------------------------------------
// Page break (B-3) — `<hp:ctrl id="newPage"/>` produces Block::PageBreak
// -----------------------------------------------------------------------

#[test]
fn ctrl_newpage_produces_page_break_block() {
    let xml = r#"<root>
        <hp:p><hp:run><hp:t>before</hp:t></hp:run></hp:p>
        <hp:p><hp:run><hp:ctrl id="newPage"/></hp:run></hp:p>
        <hp:p><hp:run><hp:t>after</hp:t></hp:run></hp:p>
    </root>"#;
    let s = section(xml);
    let kinds: Vec<&'static str> = s
        .blocks
        .iter()
        .map(|b| match b {
            ir::Block::Paragraph { .. } => "para",
            ir::Block::PageBreak => "pb",
            _ => "other",
        })
        .collect();
    // The "before" paragraph, then PageBreak, then "after" paragraph.
    assert!(kinds.contains(&"pb"), "expected PageBreak block: {kinds:?}");
    let pb_idx = kinds.iter().position(|k| *k == "pb").unwrap();
    assert!(
        kinds[..pb_idx].contains(&"para") && kinds[pb_idx + 1..].contains(&"para"),
        "PageBreak must sit between the two paragraphs: {kinds:?}"
    );
}

#[test]
fn ctrl_pagebreak_id_alias_also_recognised() {
    let xml = r#"<root>
        <hp:p><hp:run><hp:ctrl id="pageBreak"/></hp:run></hp:p>
    </root>"#;
    let s = section(xml);
    assert!(
        s.blocks.iter().any(|b| matches!(b, ir::Block::PageBreak)),
        "id=pageBreak must produce PageBreak: {:?}",
        s.blocks
    );
}

#[test]
fn ctrl_unknown_id_does_not_produce_page_break() {
    // A ctrl with an unknown id (e.g. column break we don't model) must not
    // be silently treated as a page break.
    let xml = r#"<root>
        <hp:p><hp:run><hp:ctrl id="someOtherCtrl"/></hp:run></hp:p>
    </root>"#;
    let s = section(xml);
    assert!(
        !s.blocks.iter().any(|b| matches!(b, ir::Block::PageBreak)),
        "unknown ctrl id must not yield PageBreak: {:?}",
        s.blocks
    );
}

#[test]
fn ctrl_newpage_mid_paragraph_splits_into_three_ordered_blocks() {
    // Sprint 3 review HIGH: when the page-break ctrl appears between text
    // runs inside a single <hp:p>, the buffered "before" text must be
    // flushed as its own paragraph BEFORE the PageBreak block, and the
    // "after" text must end up in a separate paragraph after the break.
    let xml = r#"<root>
        <hp:p><hp:run>
            <hp:t>before</hp:t>
            <hp:ctrl id="newPage"/>
            <hp:t>after</hp:t>
        </hp:run></hp:p>
    </root>"#;
    let s = section(xml);
    let kinds: Vec<&'static str> = s
        .blocks
        .iter()
        .map(|b| match b {
            ir::Block::Paragraph { .. } => "para",
            ir::Block::PageBreak => "pb",
            _ => "other",
        })
        .collect();
    assert_eq!(
        kinds,
        vec!["para", "pb", "para"],
        "mid-paragraph page break must split into ordered [para, pb, para]; \
         got blocks: {:?}",
        s.blocks
    );

    // Verify the text on each side ended up in the correct paragraph.
    let texts: Vec<String> = s
        .blocks
        .iter()
        .filter_map(|b| match b {
            ir::Block::Paragraph { inlines } => {
                Some(inlines.iter().map(|i| i.text.as_str()).collect::<String>())
            }
            _ => None,
        })
        .collect();
    assert_eq!(
        texts,
        vec!["before".to_string(), "after".to_string()],
        "paragraph text must not cross the page-break boundary"
    );
}