//! DOCX (Office Open XML) to Markdown converter.
//!
//! Parses DOCX files directly from their OOXML ZIP structure using `zip` + `quick-xml`,
//! without intermediate HTML conversion. Extracts headings, paragraphs, tables,
//! bold/italic, hyperlinks, lists, embedded images, and text boxes (`w:pict` /
//! `v:textbox` / `w:txbxContent`). Text boxes wrapped in `mc:AlternateContent` are
//! handled by skipping the `mc:Choice` branch and processing `mc:Fallback` (VML).
use std::collections::HashMap;
use std::io::Cursor;
use quick_xml::Reader;
use quick_xml::events::Event;
use zip::ZipArchive;
use crate::converter::ooxml_utils::{
ImageInfo, PendingImageResolution, Relationship, parse_relationships,
resolve_image_placeholders, resolve_relative_to_file,
};
use crate::converter::{
ConversionOptions, ConversionResult, ConversionWarning, Converter, WarningCode,
};
use crate::error::ConvertError;
use crate::markdown::{
build_table, build_table_plain, format_heading, format_list_item, format_list_item_plain,
wrap_formatting,
};
use crate::zip_utils::{read_zip_bytes, read_zip_text};
/// Converts DOCX files to Markdown.
///
/// Stateless unit struct: the parsing entry point is the inherent
/// `convert_inner` method, and the public API is its `Converter` implementation.
pub struct DocxConverter;
// ---- Data types ----
/// The kind of block element a paragraph represents.
///
/// A paragraph starts out as `Normal`, may become a `Heading` when its
/// `pStyle` resolves to a heading style, and is turned into a `ListItem` at
/// paragraph end when both a numbering id and an indent level were seen.
#[derive(Debug, Clone, PartialEq)]
enum ParagraphKind {
    /// Plain body paragraph.
    Normal,
    /// Heading paragraph carrying its level.
    Heading(u8), // level 1..=6
    /// Paragraph that belongs to a numbered or bulleted list.
    ListItem {
        /// `true` when the numbering definition uses an ordered format.
        ordered: bool,
        /// Indent level parsed from the paragraph's `ilvl` element.
        level: u8,
        /// Numbering id parsed from the paragraph's `numId` element; together
        /// with `level` it keys the running list counters.
        num_id: String,
    }, // list item from numbering
}
/// A numbering level definition from numbering.xml.
///
/// Only the ordered/unordered distinction is retained; it is derived from the
/// level's `numFmt` value via `is_ordered_format`.
#[derive(Debug, Clone)]
struct NumberingLevel {
    /// `true` for numbered formats, `false` for everything else (e.g. bullets).
    ordered: bool,
}
// ---- Styles parsing ----
/// Build a lookup table from style ID to heading level out of `styles.xml`.
///
/// For each `<style>` element the `styleId` attribute is checked first; a
/// nested `<name val="…">` element may then supply (or override) the level.
/// Only styles that end up with a heading level are recorded. Reading stops
/// quietly at end of input or at the first XML error, returning whatever was
/// collected up to that point.
fn parse_styles(xml: &str) -> HashMap<String, u8> {
    let mut heading_styles = HashMap::new();
    let mut xml_reader = Reader::from_str(xml);

    // Id and heading level of the `<style>` element currently being read.
    let mut pending_id: Option<String> = None;
    let mut pending_level: Option<u8> = None;

    loop {
        let event = match xml_reader.read_event() {
            Ok(Event::Eof) | Err(_) => break,
            Ok(event) => event,
        };

        match event {
            Event::Start(ref tag) | Event::Empty(ref tag) => {
                let tag_local = tag.local_name();
                match std::str::from_utf8(tag_local.as_ref()).unwrap_or("") {
                    "style" => {
                        // A new style begins: forget anything left from the previous one.
                        pending_id = None;
                        pending_level = None;
                        for attribute in tag.attributes().flatten() {
                            let key_local = attribute.key.local_name();
                            let key = std::str::from_utf8(key_local.as_ref()).unwrap_or("");
                            if key != "styleId" {
                                continue;
                            }
                            let id = String::from_utf8_lossy(&attribute.value).to_string();
                            if let Some(level) = extract_heading_level_from_id(&id) {
                                pending_level = Some(level);
                            }
                            pending_id = Some(id);
                        }
                    }
                    "name" if pending_id.is_some() => {
                        for attribute in tag.attributes().flatten() {
                            let key_local = attribute.key.local_name();
                            let key = std::str::from_utf8(key_local.as_ref()).unwrap_or("");
                            if key != "val" {
                                continue;
                            }
                            let display_name = String::from_utf8_lossy(&attribute.value);
                            if let Some(level) = extract_heading_level_from_name(&display_name) {
                                pending_level = Some(level);
                            }
                        }
                    }
                    _ => {}
                }
            }
            Event::End(ref tag) => {
                let tag_local = tag.local_name();
                if std::str::from_utf8(tag_local.as_ref()).unwrap_or("") == "style" {
                    // `take()` on both sides also resets the pending state.
                    if let (Some(id), Some(level)) = (pending_id.take(), pending_level.take()) {
                        heading_styles.insert(id, level);
                    }
                }
            }
            _ => {}
        }
    }

    heading_styles
}
/// Extract heading level from a style ID like "Heading1", "Heading2", etc.
///
/// The `heading` prefix is matched ASCII case-insensitively and must be
/// followed by nothing but ASCII digits forming a number in `1..=9`.
/// Returns `None` for anything else (e.g. "Heading1Char", "Heading0", "Title").
///
/// The digit check is explicit because `u8::from_str` also accepts a leading
/// `+`, which would otherwise let an ID such as "Heading+1" pass as a heading.
fn extract_heading_level_from_id(style_id: &str) -> Option<u8> {
    const PREFIX: &str = "heading";

    // `get` returns `None` when the ID is too short or the cut would fall
    // inside a multi-byte character, so the slice below is always valid.
    let prefix = style_id.get(..PREFIX.len())?;
    if !prefix.eq_ignore_ascii_case(PREFIX) {
        return None;
    }

    let digits = &style_id[PREFIX.len()..];
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }

    digits
        .parse::<u8>()
        .ok()
        .filter(|level| (1..=9).contains(level))
}
/// Extract heading level from a style name like "heading 1", "Heading 2", etc.
///
/// Surrounding whitespace is ignored, the `heading` prefix is matched ASCII
/// case-insensitively, and the remainder (after trimming) must consist solely
/// of ASCII digits forming a number in `1..=9`.
///
/// The digit check is explicit because `u8::from_str` also accepts a leading
/// `+`, which would otherwise let a name such as "heading +1" pass as a heading.
fn extract_heading_level_from_name(name: &str) -> Option<u8> {
    const PREFIX: &str = "heading";

    let name = name.trim();
    // `get` returns `None` when the name is too short or the cut would fall
    // inside a multi-byte character, so the slice below is always valid.
    let prefix = name.get(..PREFIX.len())?;
    if !prefix.eq_ignore_ascii_case(PREFIX) {
        return None;
    }

    let digits = name[PREFIX.len()..].trim();
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }

    digits
        .parse::<u8>()
        .ok()
        .filter(|level| (1..=9).contains(level))
}
// ---- Numbering parsing ----
/// Read `numbering.xml` and flatten its two-step indirection into one map.
///
/// The file declares abstract numbering definitions (`abstractNum` → `lvl` →
/// `numFmt`) and concrete numbering instances (`num` → `abstractNumId`). The
/// result maps `(numId, level)` directly to a [`NumberingLevel`]. Reading
/// stops quietly at end of input or at the first XML error.
fn parse_numbering(xml: &str) -> HashMap<(String, u8), NumberingLevel> {
    let mut xml_reader = Reader::from_str(xml);

    // abstractNumId -> [(level, ordered)]
    let mut levels_by_abstract: HashMap<String, Vec<(u8, bool)>> = HashMap::new();
    // numId -> abstractNumId
    let mut abstract_by_num: HashMap<String, String> = HashMap::new();

    // Cursor state: which abstractNum / lvl / num element we are inside of.
    let mut open_abstract: Option<String> = None;
    let mut open_level: Option<u8> = None;
    let mut open_num: Option<String> = None;
    let mut inside_abstract = false;
    let mut inside_level = false;
    let mut inside_num = false;

    loop {
        let event = match xml_reader.read_event() {
            Ok(Event::Eof) | Err(_) => break,
            Ok(event) => event,
        };

        match event {
            Event::Start(ref tag) | Event::Empty(ref tag) => {
                let tag_local = tag.local_name();
                match std::str::from_utf8(tag_local.as_ref()).unwrap_or("") {
                    "abstractNum" => {
                        inside_abstract = true;
                        for attribute in tag.attributes().flatten() {
                            let key_local = attribute.key.local_name();
                            let key = std::str::from_utf8(key_local.as_ref()).unwrap_or("");
                            if key != "abstractNumId" {
                                continue;
                            }
                            let id = String::from_utf8_lossy(&attribute.value).to_string();
                            open_abstract = Some(id.clone());
                            // Register the definition even if it has no levels.
                            levels_by_abstract.entry(id).or_default();
                        }
                    }
                    "lvl" if inside_abstract => {
                        inside_level = true;
                        for attribute in tag.attributes().flatten() {
                            let key_local = attribute.key.local_name();
                            let key = std::str::from_utf8(key_local.as_ref()).unwrap_or("");
                            if key == "ilvl" {
                                open_level =
                                    String::from_utf8_lossy(&attribute.value).parse::<u8>().ok();
                            }
                        }
                    }
                    "numFmt" if inside_level => {
                        if let (Some(abstract_id), Some(level)) = (&open_abstract, open_level) {
                            for attribute in tag.attributes().flatten() {
                                let key_local = attribute.key.local_name();
                                let key = std::str::from_utf8(key_local.as_ref()).unwrap_or("");
                                if key != "val" {
                                    continue;
                                }
                                let format_name =
                                    String::from_utf8_lossy(&attribute.value).to_string();
                                let ordered = is_ordered_format(&format_name);
                                levels_by_abstract
                                    .entry(abstract_id.clone())
                                    .or_default()
                                    .push((level, ordered));
                            }
                        }
                    }
                    "num" => {
                        inside_num = true;
                        for attribute in tag.attributes().flatten() {
                            let key_local = attribute.key.local_name();
                            let key = std::str::from_utf8(key_local.as_ref()).unwrap_or("");
                            if key == "numId" {
                                open_num =
                                    Some(String::from_utf8_lossy(&attribute.value).to_string());
                            }
                        }
                    }
                    "abstractNumId" if inside_num => {
                        if let Some(ref num_id) = open_num {
                            for attribute in tag.attributes().flatten() {
                                let key_local = attribute.key.local_name();
                                let key = std::str::from_utf8(key_local.as_ref()).unwrap_or("");
                                if key != "val" {
                                    continue;
                                }
                                let abstract_id =
                                    String::from_utf8_lossy(&attribute.value).to_string();
                                abstract_by_num.insert(num_id.clone(), abstract_id);
                            }
                        }
                    }
                    _ => {}
                }
            }
            Event::End(ref tag) => {
                let tag_local = tag.local_name();
                match std::str::from_utf8(tag_local.as_ref()).unwrap_or("") {
                    "abstractNum" => {
                        inside_abstract = false;
                        open_abstract = None;
                    }
                    "lvl" => {
                        inside_level = false;
                        open_level = None;
                    }
                    "num" => {
                        inside_num = false;
                        open_num = None;
                    }
                    _ => {}
                }
            }
            _ => {}
        }
    }

    // Resolve numId -> abstractNumId -> levels into the flat (numId, level) map.
    abstract_by_num
        .iter()
        .filter_map(|(num_id, abstract_id)| {
            levels_by_abstract
                .get(abstract_id)
                .map(|levels| (num_id, levels))
        })
        .flat_map(|(num_id, levels)| {
            levels
                .iter()
                .map(move |&(level, ordered)| ((num_id.clone(), level), NumberingLevel { ordered }))
        })
        .collect()
}
/// Report whether a `numFmt` value denotes a numbered (ordered) list.
///
/// Matching is exact and case-sensitive; any value not in the table below
/// (for example `bullet`) is treated as unordered.
fn is_ordered_format(fmt: &str) -> bool {
    const ORDERED_FORMATS: [&str; 6] = [
        "decimal",
        "upperRoman",
        "lowerRoman",
        "upperLetter",
        "lowerLetter",
        "decimalZero",
    ];
    ORDERED_FORMATS.contains(&fmt)
}
// ---- Run segment merging ----
/// A segment of text within a run, with formatting info.
///
/// Segments are collected per paragraph (or per hyperlink) and later merged:
/// adjacent segments with equal `bold`/`italic` flags are concatenated before
/// formatting is applied.
#[derive(Debug, Clone)]
struct RunSegment {
    /// The segment's text; may also hold already-rendered link or image markdown.
    text: String,
    /// Whether the segment is rendered bold.
    bold: bool,
    /// Whether the segment is rendered italic.
    italic: bool,
}
/// Saved paragraph-level state for text box context save/restore.
///
/// When entering `<w:txbxContent>`, the current paragraph state is saved and reset
/// so that inner `<w:p>` elements can be processed normally. On exit, the state is
/// restored to continue the outer paragraph.
///
/// Each field mirrors the identically named local variable of `parse_document`.
#[derive(Debug, Clone)]
struct SavedParagraphState {
    // Element-nesting flags of the outer paragraph.
    in_paragraph: bool,
    in_run: bool,
    in_text: bool,
    in_run_properties: bool,
    // Kind and accumulated runs (markdown and plain-text variants) of the outer paragraph.
    current_para_kind: ParagraphKind,
    current_para_runs: Vec<RunSegment>,
    current_para_runs_plain: Vec<RunSegment>,
    // Formatting flags of the outer run.
    current_run_bold: bool,
    current_run_italic: bool,
    // Hyperlink being assembled in the outer paragraph, if any.
    in_hyperlink: bool,
    current_hyperlink_url: Option<String>,
    hyperlink_runs: Vec<RunSegment>,
    hyperlink_runs_plain: Vec<RunSegment>,
    // Paragraph-properties / numbering state used for list detection.
    in_para_properties: bool,
    in_num_pr: bool,
    current_num_id: Option<String>,
    current_ilvl: Option<u8>,
}
/// Render a sequence of run segments as markdown.
///
/// Consecutive segments sharing the same bold/italic flags are joined first,
/// so `wrap_formatting` is invoked exactly once per contiguous group rather
/// than once per segment. An empty slice yields an empty string.
fn merge_and_format_runs(runs: &[RunSegment]) -> String {
    let mut rendered = String::new();
    let mut remaining = runs;

    while let Some((head, tail)) = remaining.split_first() {
        // Length of the prefix of `tail` that shares `head`'s formatting.
        let group_len = tail
            .iter()
            .take_while(|seg| seg.bold == head.bold && seg.italic == head.italic)
            .count();
        let (same_style, rest) = tail.split_at(group_len);

        let mut group_text = head.text.clone();
        for seg in same_style {
            group_text.push_str(&seg.text);
        }

        rendered.push_str(&wrap_formatting(&group_text, head.bold, head.italic));
        remaining = rest;
    }

    rendered
}
/// Join the text of all run segments, ignoring their bold/italic flags.
fn merge_runs_plain(runs: &[RunSegment]) -> String {
    runs.iter().map(|seg| seg.text.as_str()).collect()
}
// ---- Document body parsing ----
/// Parse the main document.xml body and produce Markdown and plain text output.
///
/// Returns (markdown, plain_text, title, warnings, image_infos).
/// Images are emitted with unique placeholder alt text `__img_N__`.
/// `image_counter` is incremented for each image to ensure uniqueness.
fn parse_document(
xml: &str,
styles: &HashMap<String, u8>,
relationships: &HashMap<String, Relationship>,
numbering: &HashMap<(String, u8), NumberingLevel>,
image_counter: &mut usize,
) -> (
String,
String,
Option<String>,
Vec<ConversionWarning>,
Vec<ImageInfo>,
) {
let mut reader = Reader::from_str(xml);
let mut warnings = Vec::new();
let mut output = String::new();
let mut plain_output = String::new();
let mut title: Option<String> = None;
// Paragraph-level state
let mut in_body = false;
let mut in_paragraph = false;
let mut current_para_kind = ParagraphKind::Normal;
let mut current_para_runs: Vec<RunSegment> = Vec::new();
// Plain text counterpart: tracks text without markdown link/image syntax
let mut current_para_runs_plain: Vec<RunSegment> = Vec::new();
// Run-level state
let mut in_run = false;
let mut in_text = false;
// Run properties state (bold/italic)
let mut in_run_properties = false;
let mut current_run_bold = false;
let mut current_run_italic = false;
// Hyperlink state
let mut in_hyperlink = false;
let mut current_hyperlink_url: Option<String> = None;
let mut hyperlink_runs: Vec<RunSegment> = Vec::new();
let mut hyperlink_runs_plain: Vec<RunSegment> = Vec::new();
// Paragraph properties state (for list detection)
let mut in_para_properties = false;
let mut in_num_pr = false;
let mut current_num_id: Option<String> = None;
let mut current_ilvl: Option<u8> = None;
// List counter tracking: (numId, level) -> counter
let mut list_counters: HashMap<(String, u8), usize> = HashMap::new();
// Track if last paragraph was a list item (for single-newline separation)
let mut last_was_list = false;
// Table state
let mut in_table = false;
let mut in_table_row = false;
let mut in_table_cell = false;
let mut table_rows: Vec<Vec<String>> = Vec::new();
let mut current_row: Vec<String> = Vec::new();
let mut current_cell_text = String::new();
let mut cell_paragraph_count: usize = 0;
// Plain-text table state (no markdown formatting in cells)
let mut table_rows_plain: Vec<Vec<String>> = Vec::new();
let mut current_row_plain: Vec<String> = Vec::new();
let mut current_cell_text_plain = String::new();
// Drawing/Image state
let mut in_drawing = false;
let mut current_image_alt: Option<String> = None;
let mut current_image_rel_id: Option<String> = None;
// Image info tracking for placeholder-based replacement
let mut image_infos: Vec<ImageInfo> = Vec::new();
// mc:AlternateContent state: skip Choice, process Fallback
let mut in_mc_choice = false;
let mut mc_choice_depth: u32 = 0;
// Text box state: w:pict > v:shape > v:textbox > w:txbxContent
let mut in_pict = false;
let mut in_textbox_content = false;
let mut saved_paragraph_state: Option<SavedParagraphState> = None;
loop {
match reader.read_event() {
Ok(Event::Start(ref e)) => {
let local = e.local_name();
let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
// mc:AlternateContent handling: skip Choice, process Fallback
if in_mc_choice {
mc_choice_depth += 1;
continue;
}
match local_str {
"AlternateContent" => {
// Just a wrapper — content inside is either Choice or Fallback
continue;
}
"Choice" => {
in_mc_choice = true;
mc_choice_depth = 1;
continue;
}
"Fallback" => {
// Process Fallback content normally — just skip this tag
continue;
}
_ => {}
}
// Text box handling: w:pict > ... > w:txbxContent
match local_str {
"pict" if in_run => {
in_pict = true;
continue;
}
"txbxContent" if in_pict => {
// Save current paragraph state and reset for inner paragraphs
saved_paragraph_state = Some(SavedParagraphState {
in_paragraph,
in_run,
in_text,
in_run_properties,
current_para_kind: current_para_kind.clone(),
current_para_runs: current_para_runs.clone(),
current_para_runs_plain: current_para_runs_plain.clone(),
current_run_bold,
current_run_italic,
in_hyperlink,
current_hyperlink_url: current_hyperlink_url.clone(),
hyperlink_runs: hyperlink_runs.clone(),
hyperlink_runs_plain: hyperlink_runs_plain.clone(),
in_para_properties,
in_num_pr,
current_num_id: current_num_id.clone(),
current_ilvl,
});
// Reset paragraph-level state for text box content
in_paragraph = false;
in_run = false;
in_text = false;
in_run_properties = false;
current_para_kind = ParagraphKind::Normal;
current_para_runs.clear();
current_para_runs_plain.clear();
current_run_bold = false;
current_run_italic = false;
in_hyperlink = false;
current_hyperlink_url = None;
hyperlink_runs.clear();
hyperlink_runs_plain.clear();
in_para_properties = false;
in_num_pr = false;
current_num_id = None;
current_ilvl = None;
in_textbox_content = true;
continue;
}
// VML elements inside w:pict are transparent containers
"shape" | "rect" | "roundrect" | "textbox" | "group" if in_pict => {
continue;
}
_ => {}
}
match local_str {
"body" => {
in_body = true;
}
"tbl" if in_body => {
in_table = true;
table_rows.clear();
table_rows_plain.clear();
}
"tr" if in_table => {
in_table_row = true;
current_row.clear();
current_row_plain.clear();
}
"tc" if in_table_row => {
in_table_cell = true;
current_cell_text.clear();
current_cell_text_plain.clear();
cell_paragraph_count = 0;
}
"p" if in_body => {
in_paragraph = true;
current_para_kind = ParagraphKind::Normal;
current_para_runs.clear();
current_para_runs_plain.clear();
current_num_id = None;
current_ilvl = None;
}
"pPr" if in_paragraph => {
in_para_properties = true;
}
"pStyle" if in_para_properties => {
for attr in e.attributes().flatten() {
let local_name = attr.key.local_name();
let attr_local = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
if attr_local == "val" {
let val = String::from_utf8_lossy(&attr.value);
current_para_kind = resolve_paragraph_kind(&val, styles);
}
}
}
"numPr" if in_para_properties => {
in_num_pr = true;
}
"ilvl" if in_num_pr => {
for attr in e.attributes().flatten() {
let local_name = attr.key.local_name();
let attr_local = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
if attr_local == "val" {
current_ilvl =
String::from_utf8_lossy(&attr.value).parse::<u8>().ok();
}
}
}
"numId" if in_num_pr => {
for attr in e.attributes().flatten() {
let local_name = attr.key.local_name();
let attr_local = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
if attr_local == "val" {
let val = String::from_utf8_lossy(&attr.value).to_string();
// numId "0" means no numbering
if val != "0" {
current_num_id = Some(val);
}
}
}
}
"hyperlink" if in_paragraph => {
in_hyperlink = true;
hyperlink_runs.clear();
hyperlink_runs_plain.clear();
current_hyperlink_url = None;
for attr in e.attributes().flatten() {
let key = std::str::from_utf8(attr.key.as_ref()).unwrap_or("");
if key == "r:id" || key.ends_with(":id") {
let rid = String::from_utf8_lossy(&attr.value).to_string();
current_hyperlink_url =
resolve_hyperlink_url(&rid, relationships, &mut warnings);
}
}
}
"r" if in_paragraph => {
in_run = true;
current_run_bold = false;
current_run_italic = false;
}
"rPr" if in_run => {
in_run_properties = true;
}
"b" if in_run_properties => {
// Bold: <w:b/> or <w:b w:val="true"/>
// Check for explicit false
current_run_bold = !is_val_false(e);
}
"i" if in_run_properties => {
current_run_italic = !is_val_false(e);
}
"t" if in_run => {
in_text = true;
}
"drawing" if in_run => {
in_drawing = true;
current_image_alt = None;
current_image_rel_id = None;
}
"docPr" if in_drawing => {
// <wp:docPr descr="Alt text"/>
for attr in e.attributes().flatten() {
let local_name = attr.key.local_name();
let k = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
if k == "descr" {
let val = String::from_utf8_lossy(&attr.value).to_string();
if !val.is_empty() {
current_image_alt = Some(val);
}
}
}
}
"blip" if in_drawing => {
// <a:blip r:embed="rId5"/>
for attr in e.attributes().flatten() {
let key = std::str::from_utf8(attr.key.as_ref()).unwrap_or("");
if key == "r:embed" || key.ends_with(":embed") {
current_image_rel_id =
Some(String::from_utf8_lossy(&attr.value).to_string());
}
}
}
_ => {}
}
}
Ok(Event::Empty(ref e)) => {
if in_mc_choice {
continue;
}
let local = e.local_name();
let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
match local_str {
"pStyle" if in_para_properties => {
for attr in e.attributes().flatten() {
let local_name = attr.key.local_name();
let attr_local = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
if attr_local == "val" {
let val = String::from_utf8_lossy(&attr.value);
current_para_kind = resolve_paragraph_kind(&val, styles);
}
}
}
"ilvl" if in_num_pr => {
for attr in e.attributes().flatten() {
let local_name = attr.key.local_name();
let attr_local = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
if attr_local == "val" {
current_ilvl =
String::from_utf8_lossy(&attr.value).parse::<u8>().ok();
}
}
}
"numId" if in_num_pr => {
for attr in e.attributes().flatten() {
let local_name = attr.key.local_name();
let attr_local = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
if attr_local == "val" {
let val = String::from_utf8_lossy(&attr.value).to_string();
if val != "0" {
current_num_id = Some(val);
}
}
}
}
"b" if in_run_properties => {
current_run_bold = !is_val_false(e);
}
"i" if in_run_properties => {
current_run_italic = !is_val_false(e);
}
"br" if in_run => {
let seg = RunSegment {
text: "\n".to_string(),
bold: false,
italic: false,
};
if in_hyperlink {
hyperlink_runs.push(seg.clone());
hyperlink_runs_plain.push(seg);
} else {
current_para_runs.push(seg.clone());
current_para_runs_plain.push(seg);
}
}
"hyperlink" if in_paragraph => {
// Self-closing hyperlink (unlikely but handle gracefully)
}
"docPr" if in_drawing => {
for attr in e.attributes().flatten() {
let local_name = attr.key.local_name();
let k = std::str::from_utf8(local_name.as_ref()).unwrap_or("");
if k == "descr" {
let val = String::from_utf8_lossy(&attr.value).to_string();
if !val.is_empty() {
current_image_alt = Some(val);
}
}
}
}
"blip" if in_drawing => {
for attr in e.attributes().flatten() {
let key = std::str::from_utf8(attr.key.as_ref()).unwrap_or("");
if key == "r:embed" || key.ends_with(":embed") {
current_image_rel_id =
Some(String::from_utf8_lossy(&attr.value).to_string());
}
}
}
_ => {}
}
}
Ok(Event::Text(ref e)) => {
if in_mc_choice {
continue;
}
if in_text && in_run {
let text = e.unescape().unwrap_or_default().to_string();
let seg = RunSegment {
text,
bold: current_run_bold,
italic: current_run_italic,
};
if in_hyperlink {
hyperlink_runs.push(seg.clone());
hyperlink_runs_plain.push(seg);
} else {
current_para_runs.push(seg.clone());
current_para_runs_plain.push(seg);
}
}
}
Ok(Event::End(ref e)) => {
let local = e.local_name();
let local_str = std::str::from_utf8(local.as_ref()).unwrap_or("");
// mc:Choice depth tracking
if in_mc_choice {
mc_choice_depth -= 1;
if mc_choice_depth == 0 {
in_mc_choice = false;
}
continue;
}
// mc:AlternateContent and mc:Fallback end tags — just skip
if local_str == "AlternateContent" || local_str == "Fallback" {
continue;
}
// Text box end handling
if local_str == "txbxContent" && in_textbox_content {
// Flush any pending paragraph inside the text box
// (the normal "p" end handler will have already flushed it,
// but guard against edge cases)
in_textbox_content = false;
// Restore saved paragraph state
if let Some(saved) = saved_paragraph_state.take() {
in_paragraph = saved.in_paragraph;
in_run = saved.in_run;
in_text = saved.in_text;
in_run_properties = saved.in_run_properties;
current_para_kind = saved.current_para_kind;
current_para_runs = saved.current_para_runs;
current_para_runs_plain = saved.current_para_runs_plain;
current_run_bold = saved.current_run_bold;
current_run_italic = saved.current_run_italic;
in_hyperlink = saved.in_hyperlink;
current_hyperlink_url = saved.current_hyperlink_url;
hyperlink_runs = saved.hyperlink_runs;
hyperlink_runs_plain = saved.hyperlink_runs_plain;
in_para_properties = saved.in_para_properties;
in_num_pr = saved.in_num_pr;
current_num_id = saved.current_num_id;
current_ilvl = saved.current_ilvl;
}
continue;
}
if local_str == "pict" && in_pict {
in_pict = false;
continue;
}
// VML end tags inside w:pict are transparent
if in_pict
&& matches!(
local_str,
"shape" | "rect" | "roundrect" | "textbox" | "group"
)
{
continue;
}
match local_str {
"body" => {
in_body = false;
}
"tbl" if in_table => {
// Render table
if !table_rows.is_empty() {
let first_row = &table_rows[0];
let headers: Vec<&str> = first_row.iter().map(|s| s.as_str()).collect();
let data_rows: Vec<Vec<&str>> = table_rows[1..]
.iter()
.map(|row| row.iter().map(|s| s.as_str()).collect())
.collect();
let table_md = build_table(&headers, &data_rows);
output.push_str(&table_md);
output.push('\n');
// Use plain-text rows (no markdown formatting) for plain output
let first_row_plain = &table_rows_plain[0];
let headers_plain: Vec<&str> =
first_row_plain.iter().map(|s| s.as_str()).collect();
let data_rows_plain: Vec<Vec<&str>> = table_rows_plain[1..]
.iter()
.map(|row| row.iter().map(|s| s.as_str()).collect())
.collect();
let table_plain = build_table_plain(&headers_plain, &data_rows_plain);
plain_output.push_str(&table_plain);
plain_output.push('\n');
}
in_table = false;
table_rows.clear();
table_rows_plain.clear();
last_was_list = false;
}
"tr" if in_table_row => {
table_rows.push(current_row.clone());
current_row.clear();
table_rows_plain.push(current_row_plain.clone());
current_row_plain.clear();
in_table_row = false;
}
"tc" if in_table_cell => {
current_row.push(current_cell_text.trim().to_string());
current_cell_text.clear();
current_row_plain.push(current_cell_text_plain.trim().to_string());
current_cell_text_plain.clear();
in_table_cell = false;
}
"p" if in_paragraph => {
// Resolve list item kind from numPr
if let (Some(num_id), Some(ilvl)) = (¤t_num_id, current_ilvl) {
let key = (num_id.clone(), ilvl);
let ordered = numbering.get(&key).map(|nl| nl.ordered).unwrap_or(false); // default to bullet
current_para_kind = ParagraphKind::ListItem {
ordered,
level: ilvl,
num_id: num_id.clone(),
};
}
// Merge runs into final paragraph text (markdown with formatting)
let current_para_text = merge_and_format_runs(¤t_para_runs);
// Plain text: no bold/italic markers, no link/image syntax
let current_para_text_plain = merge_runs_plain(¤t_para_runs_plain);
if in_table_cell {
// In a table cell: accumulate text
if cell_paragraph_count > 0 && !current_para_text.is_empty() {
current_cell_text.push(' ');
}
current_cell_text.push_str(current_para_text.trim());
if cell_paragraph_count > 0 && !current_para_text_plain.is_empty() {
current_cell_text_plain.push(' ');
}
current_cell_text_plain.push_str(current_para_text_plain.trim());
cell_paragraph_count += 1;
} else {
// Normal paragraph finalization
let is_list =
matches!(current_para_kind, ParagraphKind::ListItem { .. });
finalize_paragraph(
¤t_para_kind,
¤t_para_text,
¤t_para_text_plain,
&mut output,
&mut plain_output,
&mut title,
&mut list_counters,
last_was_list,
);
last_was_list = is_list;
}
in_paragraph = false;
current_para_runs.clear();
current_para_runs_plain.clear();
current_num_id = None;
current_ilvl = None;
}
"pPr" => {
in_para_properties = false;
}
"numPr" => {
in_num_pr = false;
}
"hyperlink" if in_hyperlink => {
let link_text = merge_and_format_runs(&hyperlink_runs);
let link_text_plain = merge_runs_plain(&hyperlink_runs_plain);
let link_md = if let Some(ref url) = current_hyperlink_url {
format!("[{}]({})", link_text, url)
} else {
link_text
};
current_para_runs.push(RunSegment {
text: link_md,
bold: false,
italic: false,
});
// Plain text: just the link text, no URL
current_para_runs_plain.push(RunSegment {
text: link_text_plain,
bold: false,
italic: false,
});
in_hyperlink = false;
hyperlink_runs.clear();
hyperlink_runs_plain.clear();
current_hyperlink_url = None;
}
"rPr" => {
in_run_properties = false;
}
"r" => {
in_run = false;
in_text = false;
current_run_bold = false;
current_run_italic = false;
}
"t" => {
in_text = false;
}
"drawing" if in_drawing => {
// Emit image markdown with unique placeholder
if let Some(ref rel_id) = current_image_rel_id {
let filename = relationships
.get(rel_id)
.map(|r| {
// Extract just the filename from path
r.target.rsplit('/').next().unwrap_or(&r.target).to_string()
})
.unwrap_or_default();
if !filename.is_empty() {
let original_alt =
current_image_alt.as_deref().unwrap_or("").to_string();
let placeholder = format!("__img_{n}__", n = *image_counter);
*image_counter += 1;
image_infos.push(ImageInfo {
placeholder: placeholder.clone(),
original_alt,
filename: filename.clone(),
bytes_key: rel_id.clone(),
});
let img_md = format!("");
let seg = RunSegment {
text: img_md,
bold: false,
italic: false,
};
// Plain text: just the placeholder (no image markdown syntax)
let seg_plain = RunSegment {
text: placeholder,
bold: false,
italic: false,
};
if in_hyperlink {
hyperlink_runs.push(seg);
hyperlink_runs_plain.push(seg_plain);
} else {
current_para_runs.push(seg);
current_para_runs_plain.push(seg_plain);
}
} else {
warnings.push(ConversionWarning {
code: WarningCode::SkippedElement,
message: format!(
"image relationship '{rel_id}' not found in rels"
),
location: Some(rel_id.clone()),
});
}
}
in_drawing = false;
current_image_alt = None;
current_image_rel_id = None;
}
_ => {}
}
}
Ok(Event::Eof) => break,
Err(_) => break,
_ => {}
}
}
// Trim trailing newlines to a single trailing newline
let markdown = output.trim_end().to_string();
let markdown = if markdown.is_empty() {
markdown
} else {
format!("{}\n", markdown)
};
let plain_text = plain_output.trim_end().to_string();
let plain_text = if plain_text.is_empty() {
plain_text
} else {
format!("{}\n", plain_text)
};
(markdown, plain_text, title, warnings, image_infos)
}
/// Check if a `w:val` attribute on an element is explicitly false
/// ("0", "false", or "off").
///
/// Only the first attribute whose local name is `val` is considered. The
/// comparison with "false" and "off" is ASCII case-insensitive. An element
/// without a `val` attribute is not false, so a bare `<w:b/>` enables the
/// property.
///
/// "off" is included because the WordprocessingML on/off type (`ST_OnOff`)
/// permits `on`/`off` alongside `true`/`false` and `1`/`0`; without it,
/// `<w:b w:val="off"/>` would be read as bold.
fn is_val_false(e: &quick_xml::events::BytesStart) -> bool {
    e.attributes()
        .flatten()
        .find(|attr| {
            let key = attr.key.local_name();
            std::str::from_utf8(key.as_ref()).unwrap_or("") == "val"
        })
        .map(|attr| {
            let value = String::from_utf8_lossy(&attr.value);
            value == "0"
                || value.eq_ignore_ascii_case("false")
                || value.eq_ignore_ascii_case("off")
        })
        .unwrap_or(false)
}
/// Classify a paragraph by its `pStyle` value.
///
/// The style ID itself is tried first (e.g. "Heading2"); failing that, the
/// `styles` table built from styles.xml is consulted. Any heading level found
/// is clamped into `1..=6`; everything else is a normal paragraph.
fn resolve_paragraph_kind(style_val: &str, styles: &HashMap<String, u8>) -> ParagraphKind {
    extract_heading_level_from_id(style_val)
        .or_else(|| styles.get(style_val).copied())
        .map_or(ParagraphKind::Normal, |level| {
            ParagraphKind::Heading(level.clamp(1, 6))
        })
}
/// Look up the target URL for a hyperlink relationship ID.
///
/// Returns the relationship's target when `rid` is known. Otherwise a
/// `SkippedElement` warning is appended to `warnings` and `None` is returned.
fn resolve_hyperlink_url(
    rid: &str,
    relationships: &HashMap<String, Relationship>,
    warnings: &mut Vec<ConversionWarning>,
) -> Option<String> {
    if let Some(rel) = relationships.get(rid) {
        return Some(rel.target.clone());
    }

    warnings.push(ConversionWarning {
        code: WarningCode::SkippedElement,
        message: format!("hyperlink relationship '{rid}' not found in rels"),
        location: Some(rid.to_string()),
    });
    None
}
/// Append one finished paragraph to the markdown and plain-text buffers.
///
/// Paragraphs whose markdown text is blank after trimming are dropped.
/// Headings and normal paragraphs that directly follow a list item are
/// preceded by an extra newline. The first level-1 heading encountered also
/// becomes the document `title`. Ordered list items advance the counter
/// stored under `(num_id, level)` in `list_counters`.
// The paragraph context is threaded through as individual arguments.
#[allow(clippy::too_many_arguments)]
fn finalize_paragraph(
    kind: &ParagraphKind,
    text: &str,
    text_plain: &str,
    output: &mut String,
    plain_output: &mut String,
    title: &mut Option<String>,
    list_counters: &mut HashMap<(String, u8), usize>,
    last_was_list: bool,
) {
    let body = text.trim();
    if body.is_empty() {
        return;
    }
    let body_plain = text_plain.trim();

    // A non-list block right after a list item gets a separating newline.
    let is_list_item = matches!(kind, ParagraphKind::ListItem { .. });
    if last_was_list && !is_list_item {
        output.push('\n');
        plain_output.push('\n');
    }

    match kind {
        ParagraphKind::Heading(level) => {
            output.push_str(&format_heading(*level, body));
            output.push('\n');

            // Plain text carries the heading text without `#` markers.
            plain_output.push_str(body_plain);
            plain_output.push_str("\n\n");

            if title.is_none() && *level == 1 {
                *title = Some(body_plain.to_string());
            }
        }
        ParagraphKind::ListItem {
            ordered,
            level,
            num_id,
        } => {
            // Bullets always pass 1; ordered items use a running per-list counter.
            let ordinal = if *ordered {
                let slot = list_counters.entry((num_id.clone(), *level)).or_default();
                *slot += 1;
                *slot
            } else {
                1
            };

            output.push_str(&format_list_item(*level, *ordered, ordinal, body));
            output.push('\n');

            // Plain text: indented text without bullet/number markers.
            plain_output.push_str(&format_list_item_plain(*level, body_plain));
            plain_output.push('\n');
        }
        ParagraphKind::Normal => {
            output.push_str(body);
            output.push_str("\n\n");
            plain_output.push_str(body_plain);
            plain_output.push_str("\n\n");
        }
    }
}
// ---- Internal conversion (parse + image extraction, no resolution) ----
impl DocxConverter {
    /// Parse the document and extract images without resolving placeholders.
    ///
    /// Reads the optional `word/styles.xml`, `word/_rels/document.xml.rels` and
    /// `word/numbering.xml` parts, then the required `word/document.xml`, and
    /// passes them to `parse_document`. The returned conversion result still
    /// contains unresolved image placeholders; the accompanying
    /// `PendingImageResolution` carries the image infos and bytes needed to
    /// resolve them later (sync or async).
    ///
    /// Image bytes are loaded only when `options.extract_images` is set or an
    /// image describer is configured. Image relationships are visited in
    /// ascending relationship-ID order, so the order of `images` and the set of
    /// images that fit within `options.max_total_image_bytes` are the same on
    /// every run for the same input.
    ///
    /// # Errors
    ///
    /// Fails when `data` cannot be opened as a ZIP archive, when
    /// `validate_zip_budget` rejects the archive, when reading one of the XML
    /// parts fails, or with `ConvertError::MalformedDocument` when
    /// `word/document.xml` is missing.
    pub(crate) fn convert_inner(
        &self,
        data: &[u8],
        options: &ConversionOptions,
    ) -> Result<(ConversionResult, PendingImageResolution), ConvertError> {
        let cursor = Cursor::new(data);
        let mut archive = ZipArchive::new(cursor)?;
        crate::zip_utils::validate_zip_budget(&mut archive, options.max_uncompressed_zip_bytes)?;

        // 1. Parse styles.xml (optional)
        let styles = match read_zip_text(&mut archive, "word/styles.xml")? {
            Some(xml) => parse_styles(&xml),
            None => HashMap::new(),
        };

        // 2. Parse document.xml.rels (optional)
        let relationships = match read_zip_text(&mut archive, "word/_rels/document.xml.rels")? {
            Some(xml) => parse_relationships(&xml),
            None => HashMap::new(),
        };

        // 3. Parse numbering.xml (optional)
        let numbering = match read_zip_text(&mut archive, "word/numbering.xml")? {
            Some(xml) => parse_numbering(&xml),
            None => HashMap::new(),
        };

        // 4. Parse document.xml (required)
        let document_xml = read_zip_text(&mut archive, "word/document.xml")?.ok_or_else(|| {
            ConvertError::MalformedDocument {
                reason: "missing word/document.xml".to_string(),
            }
        })?;

        let mut image_counter: usize = 0;
        let (markdown, plain_text, title, mut warnings, image_infos) = parse_document(
            &document_xml,
            &styles,
            &relationships,
            &numbering,
            &mut image_counter,
        );

        // 5. Extract embedded images if requested or if describer needs them
        let need_image_bytes = options.extract_images || options.image_describer.is_some();
        let mut images: Vec<(String, Vec<u8>)> = Vec::new();
        let mut image_bytes_map: HashMap<String, Vec<u8>> = HashMap::new();
        if need_image_bytes {
            // HashMap iteration order is arbitrary. Sorting by relationship ID
            // keeps both the output order and the byte-budget cut-off stable.
            let mut image_rels: Vec<_> = relationships
                .iter()
                .filter(|(_, rel)| rel.rel_type.contains("image"))
                .collect();
            image_rels.sort_unstable_by(|a, b| a.0.cmp(b.0));

            let mut total_image_bytes: usize = 0;
            for (rel_id, rel) in image_rels {
                if total_image_bytes >= options.max_total_image_bytes {
                    break;
                }
                let image_path = resolve_relative_to_file("word/document.xml", &rel.target);
                // Best effort: image parts that are missing or unreadable are skipped.
                if let Ok(Some(img_data)) = read_zip_bytes(&mut archive, &image_path) {
                    // The rejected image's size is counted too, so the first
                    // overflow also stops the loop on the next iteration.
                    total_image_bytes = total_image_bytes.saturating_add(img_data.len());
                    if total_image_bytes <= options.max_total_image_bytes {
                        if options.extract_images {
                            // Only the last path segment is exposed as the file name.
                            let filename = image_path
                                .rsplit('/')
                                .next()
                                .unwrap_or(&image_path)
                                .to_string();
                            images.push((filename, img_data.clone()));
                        }
                        image_bytes_map.insert(rel_id.clone(), img_data);
                    } else {
                        warnings.push(ConversionWarning {
                            code: WarningCode::ResourceLimitReached,
                            message: format!(
                                "total image bytes exceeded limit ({})",
                                options.max_total_image_bytes
                            ),
                            location: Some(image_path),
                        });
                    }
                }
            }
        }

        let result = ConversionResult {
            markdown,
            plain_text,
            title,
            images,
            warnings,
        };
        let pending = PendingImageResolution {
            infos: image_infos,
            bytes: image_bytes_map,
        };
        Ok((result, pending))
    }
}
// ---- Converter trait impl ----
impl Converter for DocxConverter {
    /// File extensions this converter handles.
    fn supported_extensions(&self) -> &[&str] {
        &["docx"]
    }

    /// Convert DOCX bytes to Markdown.
    ///
    /// Runs `convert_inner` and then resolves the image placeholders in both
    /// the markdown and the plain-text output, using the image describer from
    /// `options` when one is configured.
    fn convert(
        &self,
        data: &[u8],
        options: &ConversionOptions,
    ) -> Result<ConversionResult, ConvertError> {
        let (mut converted, pending_images) = self.convert_inner(data, options)?;
        let describer = options.image_describer.as_deref();
        resolve_image_placeholders(
            &mut converted.markdown,
            &mut converted.plain_text,
            &pending_images.infos,
            &pending_images.bytes,
            describer,
            &mut converted.warnings,
        );
        Ok(converted)
    }
}
#[cfg(test)]
mod tests {
use super::*;
// ---- Helper: build minimal DOCX ZIP in memory ----
/// Build a minimal in-memory DOCX from document XML, optional styles XML and
/// optional relationships XML.
///
/// Delegates to `build_test_docx_with_numbering` with the numbering part omitted.
fn build_test_docx(
    document_xml: &str,
    styles_xml: Option<&str>,
    rels_xml: Option<&str>,
) -> Vec<u8> {
    let no_numbering: Option<&str> = None;
    build_test_docx_with_numbering(document_xml, styles_xml, rels_xml, no_numbering)
}
/// Build a minimal in-memory DOCX archive.
///
/// Always writes `[Content_Types].xml`, `_rels/.rels` and `word/document.xml`;
/// `word/styles.xml`, `word/_rels/document.xml.rels` and `word/numbering.xml`
/// are written, in that order, only when the matching argument is `Some`.
fn build_test_docx_with_numbering(
    document_xml: &str,
    styles_xml: Option<&str>,
    rels_xml: Option<&str>,
    numbering_xml: Option<&str>,
) -> Vec<u8> {
    use std::io::Write;
    use zip::ZipWriter;
    use zip::write::SimpleFileOptions;

    // [Content_Types].xml
    const CONTENT_TYPES: &str = concat!(
        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#,
        r#"<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">"#,
        r#"<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>"#,
        r#"<Default Extension="xml" ContentType="application/xml"/>"#,
        r#"<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>"#,
        "</Types>",
    );
    // _rels/.rels
    const ROOT_RELS: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#;

    let mut archive = ZipWriter::new(Cursor::new(Vec::new()));
    let entry_opts = SimpleFileOptions::default();
    let mut add_part = |name: &str, contents: &str| {
        archive.start_file(name, entry_opts).unwrap();
        archive.write_all(contents.as_bytes()).unwrap();
    };

    // Parts every test document carries.
    add_part("[Content_Types].xml", CONTENT_TYPES);
    add_part("_rels/.rels", ROOT_RELS);
    add_part("word/document.xml", document_xml);

    // Parts that are only written when the caller supplies them.
    let optional_parts = [
        ("word/styles.xml", styles_xml),
        ("word/_rels/document.xml.rels", rels_xml),
        ("word/numbering.xml", numbering_xml),
    ];
    for (name, contents) in optional_parts {
        if let Some(contents) = contents {
            add_part(name, contents);
        }
    }

    archive.finish().unwrap().into_inner()
}
/// Surround `body` with the XML declaration, a `w:document` root that declares
/// the namespaces used by the tests, and a `w:body` element.
fn wrap_body(body: &str) -> String {
    const DOCUMENT_OPEN: &str = concat!(
        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>"#,
        r#"<w:document"#,
        r#" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main""#,
        r#" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships""#,
        r#" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing""#,
        r#" xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main""#,
        r#" xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture""#,
        r#" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006""#,
        r#" xmlns:v="urn:schemas-microsoft-com:vml""#,
        r#" xmlns:o="urn:schemas-microsoft-com:office:office""#,
        r#"><w:body>"#,
    );
    const DOCUMENT_CLOSE: &str = "</w:body></w:document>";

    [DOCUMENT_OPEN, body, DOCUMENT_CLOSE].concat()
}
/// Build a one-run `w:p` element whose `w:t` carries `xml:space="preserve"`.
fn para(text: &str) -> String {
    const OPEN: &str = r#"<w:p><w:r><w:t xml:space="preserve">"#;
    const CLOSE: &str = "</w:t></w:r></w:p>";
    [OPEN, text, CLOSE].concat()
}
/// Build a `w:p` element whose paragraph style ID is `Heading{level}`.
fn heading_para(text: &str, level: u8) -> String {
    let style_id = format!("Heading{level}");
    let properties = format!(r#"<w:pPr><w:pStyle w:val="{style_id}"/></w:pPr>"#);
    let run = format!("<w:r><w:t>{text}</w:t></w:r>");
    format!("<w:p>{properties}{run}</w:p>")
}
/// Build a `w:p` element with a single run marked `w:b`.
fn bold_para(text: &str) -> String {
    let mut xml = String::from("<w:p><w:r><w:rPr><w:b/></w:rPr><w:t>");
    xml.push_str(text);
    xml.push_str("</w:t></w:r></w:p>");
    xml
}
/// Build a `w:p` element with a single run marked `w:i`.
fn italic_para(text: &str) -> String {
    let mut xml = String::from("<w:p><w:r><w:rPr><w:i/></w:rPr><w:t>");
    xml.push_str(text);
    xml.push_str("</w:t></w:r></w:p>");
    xml
}
/// Build a `w:p` element with a single run marked both `w:b` and `w:i`.
fn bold_italic_para(text: &str) -> String {
    let mut xml = String::from("<w:p><w:r><w:rPr><w:b/><w:i/></w:rPr><w:t>");
    xml.push_str(text);
    xml.push_str("</w:t></w:r></w:p>");
    xml
}
// ---- Existing tests (unchanged) ----
/// The converter advertises exactly the `docx` extension.
#[test]
fn test_docx_supported_extensions() {
    let expected: &[&str] = &["docx"];
    assert_eq!(DocxConverter.supported_extensions(), expected);
}
/// `can_convert` accepts `docx` and rejects other extensions.
#[test]
fn test_docx_can_convert() {
    let docx = DocxConverter;
    assert!(docx.can_convert("docx", &[]));
    for other in ["xlsx", "pdf", "txt"] {
        assert!(!docx.can_convert(other, &[]));
    }
}
/// Bytes that are not a ZIP archive produce an error instead of a panic.
#[test]
fn test_docx_invalid_data_returns_error() {
    let outcome = DocxConverter.convert(b"not a valid docx file", &ConversionOptions::default());
    assert!(outcome.is_err());
}
/// A document with one paragraph converts to exactly that paragraph's text.
#[test]
fn test_docx_single_paragraph() {
    let doc = wrap_body(&para("Hello, world!"));
    let data = build_test_docx(&doc, None, None);
    let converter = DocxConverter;
    let result = converter
        .convert(&data, &ConversionOptions::default())
        .unwrap();
    assert_eq!(result.markdown.trim(), "Hello, world!");
}
/// Consecutive paragraphs are emitted in order, separated by a blank line.
#[test]
fn test_docx_multiple_paragraphs() {
    let paragraphs = [para("First paragraph."), para("Second paragraph.")].concat();
    let docx = build_test_docx(&wrap_body(&paragraphs), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let md = converted.markdown.as_str();
    assert!(md.contains("First paragraph."));
    assert!(md.contains("Second paragraph."));
    assert!(md.contains("First paragraph.\n\nSecond paragraph."));
}
/// An empty `w:body` yields empty markdown.
#[test]
fn test_docx_empty_document() {
    let docx = build_test_docx(&wrap_body(""), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert_eq!(converted.markdown.as_str(), "");
}
/// Korean, Chinese and Japanese text appears in the converted markdown.
#[test]
fn test_docx_unicode_cjk() {
    let samples = ["한국어 테스트", "中文测试", "日本語テスト"];
    let body: String = samples.iter().map(|sample| para(sample)).collect();
    let docx = build_test_docx(&wrap_body(&body), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    for sample in samples {
        assert!(converted.markdown.contains(sample));
    }
}
/// Emoji in paragraph text appear in the converted markdown.
#[test]
fn test_docx_emoji() {
    let docx = build_test_docx(&wrap_body(&para("Rocket: 🚀 Stars: ✨ Earth: 🌍")), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    for emoji in ["🚀", "✨", "🌍"] {
        assert!(converted.markdown.contains(emoji));
    }
}
/// Style IDs `Heading1`..`Heading6` map to one to six leading `#` characters.
#[test]
fn test_docx_heading_levels() {
    let body: String = (1..=6u8)
        .map(|level| heading_para(&format!("Heading {level}"), level))
        .collect();
    let docx = build_test_docx(&wrap_body(&body), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    for level in 1..=6usize {
        let expected = format!("{} Heading {level}\n", "#".repeat(level));
        assert!(converted.markdown.contains(expected.as_str()));
    }
}
/// A custom style ID named "heading 1" in styles.xml renders as a level-1 heading.
#[test]
fn test_docx_heading_from_styles_xml() {
    let styles_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:style w:type="paragraph" w:styleId="CustomTitle"><w:name w:val="heading 1"/></w:style></w:styles>"#;
    let paragraph = r#"<w:p><w:pPr><w:pStyle w:val="CustomTitle"/></w:pPr><w:r><w:t>My Title</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), Some(styles_xml), None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("# My Title\n"));
}
/// The first level-1 heading becomes the result title, not a later one.
#[test]
fn test_docx_first_heading1_becomes_title() {
    let body = [
        heading_para("Document Title", 1),
        para("Some text."),
        heading_para("Another H1", 1),
    ]
    .concat();
    let docx = build_test_docx(&wrap_body(&body), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert_eq!(converted.title.as_deref(), Some("Document Title"));
}
/// Without styles.xml, direct `HeadingN` style IDs still work and no warnings are raised.
#[test]
fn test_docx_missing_styles_xml_graceful() {
    let body = [heading_para("Title", 1), para("Body text.")].concat();
    let docx = build_test_docx(&wrap_body(&body), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("# Title\n"));
    assert!(converted.markdown.contains("Body text."));
    assert!(converted.warnings.is_empty());
}
/// A `w:hyperlink` with a resolvable relationship renders as a markdown link.
#[test]
fn test_docx_hyperlink() {
    let rels_xml = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" Target="https://example.com" TargetMode="External"/></Relationships>"#;
    let paragraph =
        r#"<w:p><w:hyperlink r:id="rId1"><w:r><w:t>Example</w:t></w:r></w:hyperlink></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, Some(rels_xml));
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("[Example](https://example.com)"));
}
/// A hyperlink whose relationship is missing keeps its text, drops the link
/// syntax, and reports a `SkippedElement` warning first.
#[test]
fn test_docx_hyperlink_missing_rel() {
    let paragraph = r#"<w:p><w:hyperlink r:id="rId99"><w:r><w:t>Broken Link</w:t></w:r></w:hyperlink></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("Broken Link"));
    assert!(!converted.markdown.contains('['));
    let first_code = converted.warnings.first().map(|warning| &warning.code);
    assert!(first_code.is_some());
    assert_eq!(first_code, Some(&WarningCode::SkippedElement));
}
/// A `w:br` inside a run becomes a newline between the surrounding texts.
#[test]
fn test_docx_line_break() {
    let paragraph = r#"<w:p><w:r><w:t>Line one</w:t><w:br/><w:t>Line two</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("Line one\nLine two"));
}
/// Text from adjacent runs is concatenated without extra separators.
#[test]
fn test_docx_multiple_runs_joined() {
    let paragraph = r#"<w:p><w:r><w:t xml:space="preserve">Hello </w:t></w:r><w:r><w:t>world</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("Hello world"));
}
// ---- Bold/Italic tests ----
/// A bold run is wrapped in `**`.
#[test]
fn test_docx_bold_text() {
    let docx = build_test_docx(&wrap_body(&bold_para("Bold text")), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("**Bold text**"));
}
/// An italic run is wrapped in `*`.
#[test]
fn test_docx_italic_text() {
    let docx = build_test_docx(&wrap_body(&italic_para("Italic text")), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("*Italic text*"));
}
/// A run that is both bold and italic is wrapped in `***`.
#[test]
fn test_docx_bold_italic_nested() {
    let docx = build_test_docx(&wrap_body(&bold_italic_para("Bold and italic")), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("***Bold and italic***"));
}
/// `<w:b w:val="0"/>` switches bold off, so no `**` markers may appear.
#[test]
fn test_docx_bold_val_false_not_bold() {
    let paragraph = r#"<w:p><w:r><w:rPr><w:b w:val="0"/></w:rPr><w:t>Not bold</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("Not bold"));
    assert!(!converted.markdown.contains("**"));
}
/// Plain, bold and plain runs in one paragraph: only the middle run gets markers.
#[test]
fn test_docx_mixed_formatting_runs() {
    let paragraph = r#"<w:p><w:r><w:t xml:space="preserve">Normal </w:t></w:r><w:r><w:rPr><w:b/></w:rPr><w:t xml:space="preserve">bold </w:t></w:r><w:r><w:t>normal</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("Normal **bold** normal"));
}
/// Bold formatting inside a hyperlink is kept inside the link text.
#[test]
fn test_docx_bold_in_hyperlink() {
    let rels_xml = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" Target="https://example.com" TargetMode="External"/></Relationships>"#;
    let paragraph = r#"<w:p><w:hyperlink r:id="rId1"><w:r><w:rPr><w:b/></w:rPr><w:t>Bold Link</w:t></w:r></w:hyperlink></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, Some(rels_xml));
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let expected_link = "[**Bold Link**](https://example.com)";
    assert!(converted.markdown.contains(expected_link));
}
/// An empty bold run must not leave a bare `****` in the output.
#[test]
fn test_docx_empty_run_no_markers() {
    let paragraph =
        r#"<w:p><w:r><w:rPr><w:b/></w:rPr><w:t></w:t></w:r><w:r><w:t>text</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(!converted.markdown.contains("****"));
    assert!(converted.markdown.contains("text"));
}
/// Two consecutive bold runs share one pair of markers:
/// `**Hello World**`, not `**Hello** **World**`.
#[test]
fn test_docx_adjacent_bold_runs_merged() {
    let paragraph = r#"<w:p><w:r><w:rPr><w:b/></w:rPr><w:t xml:space="preserve">Hello </w:t></w:r><w:r><w:rPr><w:b/></w:rPr><w:t>World</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let md = converted.markdown.as_str();
    assert!(
        md.contains("**Hello World**"),
        "expected '**Hello World**' but markdown was: {}",
        md
    );
    assert!(!md.contains("** **"), "should not have separate markers");
}
/// Two consecutive italic runs share one pair of `*` markers.
#[test]
fn test_docx_adjacent_italic_runs_merged() {
    let paragraph = r#"<w:p><w:r><w:rPr><w:i/></w:rPr><w:t xml:space="preserve">Hello </w:t></w:r><w:r><w:rPr><w:i/></w:rPr><w:t>World</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let md = converted.markdown.as_str();
    assert!(
        md.contains("*Hello World*"),
        "expected '*Hello World*' but markdown was: {}",
        md
    );
}
/// A bold run followed by an italic run keeps separate markers (no merging).
#[test]
fn test_docx_formatting_change_between_runs() {
    let paragraph = r#"<w:p><w:r><w:rPr><w:b/></w:rPr><w:t xml:space="preserve">bold </w:t></w:r><w:r><w:rPr><w:i/></w:rPr><w:t>italic</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let md = converted.markdown.as_str();
    assert!(
        md.contains("**bold** *italic*"),
        "expected '**bold** *italic*' but markdown was: {}",
        md
    );
}
/// A word split across two bold runs (common with spell-check or revision
/// tracking) renders as `**Hello**`, not `**Hel****lo**`.
#[test]
fn test_docx_split_word_across_bold_runs() {
    let paragraph = r#"<w:p><w:r><w:rPr><w:b/></w:rPr><w:t>Hel</w:t></w:r><w:r><w:rPr><w:b/></w:rPr><w:t>lo</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let md = converted.markdown.as_str();
    assert!(
        md.contains("**Hello**"),
        "expected '**Hello**' but markdown was: {}",
        md
    );
    assert!(!md.contains("****"), "should not have adjacent markers");
}
// ---- Table tests ----
/// A 2x2 table renders as a header row, a separator row and a data row.
#[test]
fn test_docx_table_basic() {
    let table = r#"<w:tbl><w:tr><w:tc><w:p><w:r><w:t>H1</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>H2</w:t></w:r></w:p></w:tc></w:tr><w:tr><w:tc><w:p><w:r><w:t>A</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>B</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"#;
    let docx = build_test_docx(&wrap_body(table), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    for row in ["| H1 | H2 |", "|---|---|", "| A | B |"] {
        assert!(converted.markdown.contains(row));
    }
}
/// Empty cells do not prevent the non-empty cells from being rendered.
#[test]
fn test_docx_table_empty_cells() {
    let table = r#"<w:tbl><w:tr><w:tc><w:p><w:r><w:t>A</w:t></w:r></w:p></w:tc><w:tc><w:p></w:p></w:tc></w:tr><w:tr><w:tc><w:p></w:p></w:tc><w:tc><w:p><w:r><w:t>D</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"#;
    let docx = build_test_docx(&wrap_body(table), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("| A |"));
    assert!(converted.markdown.contains("| D |"));
}
/// Bold formatting inside a table cell is preserved.
#[test]
fn test_docx_table_with_formatting() {
    let table = r#"<w:tbl><w:tr><w:tc><w:p><w:r><w:rPr><w:b/></w:rPr><w:t>Bold</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>Normal</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"#;
    let docx = build_test_docx(&wrap_body(table), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("**Bold**"));
    assert!(converted.markdown.contains("Normal"));
}
/// A table placed between two paragraphs is rendered together with both of them.
#[test]
fn test_docx_table_between_paragraphs() {
    let mut body = para("Before table.");
    body.push_str(
        r#"<w:tbl><w:tr><w:tc><w:p><w:r><w:t>Cell</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"#,
    );
    body.push_str(&para("After table."));
    let docx = build_test_docx(&wrap_body(&body), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    for fragment in ["Before table.", "| Cell |", "After table."] {
        assert!(converted.markdown.contains(fragment));
    }
}
/// CJK text inside table cells appears in the converted markdown.
#[test]
fn test_docx_table_unicode() {
    let table = r#"<w:tbl><w:tr><w:tc><w:p><w:r><w:t>한국어</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>中文</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"#;
    let docx = build_test_docx(&wrap_body(table), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("한국어"));
    assert!(converted.markdown.contains("中文"));
}
/// A table using `gridSpan` (horizontal merge) must convert without panicking.
///
/// Merging itself is not handled: the first row has a single cell with
/// gridSpan=2, so build_table treats the table as one column wide (the header
/// dictates the column count) and the second row's cell "B" is truncated.
/// That is the expected current behavior.
#[test]
fn test_docx_table_merged_cells_no_panic() {
    let table = r#"<w:tbl><w:tr><w:tc><w:tcPr><w:gridSpan w:val="2"/></w:tcPr><w:p><w:r><w:t>Merged</w:t></w:r></w:p></w:tc></w:tr><w:tr><w:tc><w:p><w:r><w:t>A</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>B</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"#;
    let docx = build_test_docx(&wrap_body(table), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    // Reaching this point means no panic; the merged header and first cell survive.
    assert!(converted.markdown.contains("Merged"));
    assert!(converted.markdown.contains("A"));
}
// ---- List tests ----
/// Paragraphs numbered with a `bullet` format render as `- ` list items.
#[test]
fn test_docx_unordered_list() {
    let numbering_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:abstractNum w:abstractNumId="0"><w:lvl w:ilvl="0"><w:numFmt w:val="bullet"/></w:lvl></w:abstractNum><w:num w:numId="1"><w:abstractNumId w:val="0"/></w:num></w:numbering>"#;
    let items = r#"<w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>Item 1</w:t></w:r></w:p><w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>Item 2</w:t></w:r></w:p>"#;
    let docx = build_test_docx_with_numbering(&wrap_body(items), None, None, Some(numbering_xml));
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("- Item 1"));
    assert!(converted.markdown.contains("- Item 2"));
}
/// Paragraphs numbered with a `decimal` format render as `1.`, `2.`, ... items.
#[test]
fn test_docx_ordered_list() {
    let numbering_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:abstractNum w:abstractNumId="0"><w:lvl w:ilvl="0"><w:numFmt w:val="decimal"/></w:lvl></w:abstractNum><w:num w:numId="1"><w:abstractNumId w:val="0"/></w:num></w:numbering>"#;
    let items = r#"<w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>First</w:t></w:r></w:p><w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>Second</w:t></w:r></w:p>"#;
    let docx = build_test_docx_with_numbering(&wrap_body(items), None, None, Some(numbering_xml));
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("1. First"));
    assert!(converted.markdown.contains("2. Second"));
}
/// A level-1 list item is rendered beneath its level-0 parent.
#[test]
fn test_docx_nested_list() {
    let numbering_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:abstractNum w:abstractNumId="0"><w:lvl w:ilvl="0"><w:numFmt w:val="bullet"/></w:lvl><w:lvl w:ilvl="1"><w:numFmt w:val="bullet"/></w:lvl></w:abstractNum><w:num w:numId="1"><w:abstractNumId w:val="0"/></w:num></w:numbering>"#;
    let items = r#"<w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>Parent</w:t></w:r></w:p><w:p><w:pPr><w:numPr><w:ilvl w:val="1"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>Child</w:t></w:r></w:p>"#;
    let docx = build_test_docx_with_numbering(&wrap_body(items), None, None, Some(numbering_xml));
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("- Parent"));
    assert!(converted.markdown.contains(" - Child"));
}
/// A list item between two normal paragraphs is rendered together with both of them.
#[test]
fn test_docx_mixed_list_and_paragraph() {
    let numbering_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:abstractNum w:abstractNumId="0"><w:lvl w:ilvl="0"><w:numFmt w:val="bullet"/></w:lvl></w:abstractNum><w:num w:numId="1"><w:abstractNumId w:val="0"/></w:num></w:numbering>"#;
    let mut body = para("Before list.");
    body.push_str(
        r#"<w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>List item</w:t></w:r></w:p>"#,
    );
    body.push_str(&para("After list."));
    let docx = build_test_docx_with_numbering(&wrap_body(&body), None, None, Some(numbering_xml));
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    for fragment in ["Before list.", "- List item", "After list."] {
        assert!(converted.markdown.contains(fragment));
    }
}
/// Bold formatting inside a list item is kept after the bullet marker.
#[test]
fn test_docx_list_with_bold() {
    let numbering_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:abstractNum w:abstractNumId="0"><w:lvl w:ilvl="0"><w:numFmt w:val="bullet"/></w:lvl></w:abstractNum><w:num w:numId="1"><w:abstractNumId w:val="0"/></w:num></w:numbering>"#;
    let item = r#"<w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:rPr><w:b/></w:rPr><w:t>Bold item</w:t></w:r></w:p>"#;
    let docx = build_test_docx_with_numbering(&wrap_body(item), None, None, Some(numbering_xml));
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("- **Bold item**"));
}
/// Two ordered lists with different `numId`s, separated by a normal paragraph:
/// the second list must restart at 1 instead of continuing at 4.
#[test]
fn test_docx_two_separate_ordered_lists_restart_numbering() {
    let numbering_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:abstractNum w:abstractNumId="0"><w:lvl w:ilvl="0"><w:numFmt w:val="decimal"/></w:lvl></w:abstractNum><w:abstractNum w:abstractNumId="1"><w:lvl w:ilvl="0"><w:numFmt w:val="decimal"/></w:lvl></w:abstractNum><w:num w:numId="1"><w:abstractNumId w:val="0"/></w:num><w:num w:numId="2"><w:abstractNumId w:val="1"/></w:num></w:numbering>"#;

    // Level-0 list item belonging to the list identified by `num_id`.
    let list_item = |num_id: u8, text: &str| -> String {
        format!(
            r#"<w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="{num_id}"/></w:numPr></w:pPr><w:r><w:t>{text}</w:t></w:r></w:p>"#
        )
    };

    let mut body = String::new();
    // First ordered list (numId=1)
    for text in ["Alpha", "Beta", "Gamma"] {
        body.push_str(&list_item(1, text));
    }
    // Normal paragraph separating the lists
    body.push_str(&para("Separator paragraph."));
    // Second ordered list (numId=2)
    for text in ["First", "Second", "Third"] {
        body.push_str(&list_item(2, text));
    }

    let docx = build_test_docx_with_numbering(&wrap_body(&body), None, None, Some(numbering_xml));
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let md = converted.markdown.as_str();

    // First list: 1, 2, 3
    for expected in ["1. Alpha", "2. Beta", "3. Gamma"] {
        assert!(md.contains(expected), "markdown was: {}", md);
    }
    // Second list: should restart at 1, not continue at 4
    for expected in ["1. First", "2. Second", "3. Third"] {
        assert!(
            md.contains(expected),
            "expected '{}' but markdown was: {}",
            expected,
            md
        );
    }
}
/// With no numbering.xml in the archive, a `numPr` paragraph falls back to
/// an unordered (bullet) list item.
#[test]
fn test_docx_parse_numbering_missing_graceful() {
    let item = r#"<w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>Fallback item</w:t></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(item), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    assert!(converted.markdown.contains("- Fallback item"));
}
// ---- Image tests ----
/// An inline drawing with an image relationship produces markdown image
/// markup that references the image file.
#[test]
fn test_docx_inline_image() {
    let body = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr=""/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let doc = wrap_body(body);
    let rels = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/></Relationships>"#;
    let data = build_test_docx(&doc, None, Some(rels));
    let converter = DocxConverter;
    let result = converter
        .convert(&data, &ConversionOptions::default())
        .unwrap();
    // NOTE(review): the exact link target (bare file name vs. `media/` path) is
    // produced by placeholder resolution outside this test, so only the parts
    // common to both forms are pinned. Tighten to the full link once confirmed.
    assert!(
        result.markdown.contains("!["),
        "expected image markup but markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("image1.png)"),
        "expected a link to image1.png but markdown was: {}",
        result.markdown
    );
}
/// The `descr` attribute of `wp:docPr` is used as the image's alt text.
#[test]
fn test_docx_image_with_alt_text() {
    let body = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="A nice photo"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let doc = wrap_body(body);
    let rels = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/photo.jpg"/></Relationships>"#;
    let data = build_test_docx(&doc, None, Some(rels));
    let converter = DocxConverter;
    let result = converter
        .convert(&data, &ConversionOptions::default())
        .unwrap();
    // NOTE(review): the exact link target (bare file name vs. `media/` path) is
    // produced by placeholder resolution outside this test, so only the parts
    // common to both forms are pinned. Tighten to the full link once confirmed.
    assert!(
        result.markdown.contains("![A nice photo]("),
        "expected alt text 'A nice photo' but markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("photo.jpg)"),
        "expected a link to photo.jpg but markdown was: {}",
        result.markdown
    );
}
/// An image whose relationship is missing is skipped, and the first warning
/// mentions "not found".
#[test]
fn test_docx_image_missing_rel_graceful() {
    let paragraph = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="Missing"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId99"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(paragraph), None, None);
    let converted = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    // The image should be skipped with a warning.
    let first_warning = converted.warnings.first();
    assert!(first_warning.is_some());
    assert!(first_warning.unwrap().message.contains("not found"));
}
// ---- Numbering parser unit tests ----
/// A `bullet` number format is parsed as an unordered level for (numId "1", level 0).
#[test]
fn test_parse_numbering_bullet() {
    let numbering_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:abstractNum w:abstractNumId="0"><w:lvl w:ilvl="0"><w:numFmt w:val="bullet"/></w:lvl></w:abstractNum><w:num w:numId="1"><w:abstractNumId w:val="0"/></w:num></w:numbering>"#;
    let levels = parse_numbering(numbering_xml);
    let key = (String::from("1"), 0);
    let ordered = levels.get(&key).map(|level| level.ordered);
    assert_eq!(ordered, Some(false));
}
/// A `decimal` number format is parsed as an ordered level for (numId "1", level 0).
#[test]
fn test_parse_numbering_decimal() {
    let numbering_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:abstractNum w:abstractNumId="0"><w:lvl w:ilvl="0"><w:numFmt w:val="decimal"/></w:lvl></w:abstractNum><w:num w:numId="1"><w:abstractNumId w:val="0"/></w:num></w:numbering>"#;
    let levels = parse_numbering(numbering_xml);
    let key = (String::from("1"), 0);
    let ordered = levels.get(&key).map(|level| level.ordered);
    assert_eq!(ordered, Some(true));
}
// ---- Resource limit tests ----
/// A DOCX whose uncompressed size exceeds `max_uncompressed_zip_bytes` must be
/// rejected with an error whose message contains "input too large".
#[test]
fn test_docx_zip_budget_exceeded_returns_error() {
    // Borrow the paragraph markup produced by the `para` helper.
    let doc = wrap_body(&para("Hello"));
    let data = build_test_docx(&doc, None, None);
    let converter = DocxConverter;
    // Set budget to 1 byte — any real DOCX will exceed this
    let options = ConversionOptions {
        max_uncompressed_zip_bytes: 1,
        ..Default::default()
    };
    let result = converter.convert(&data, &options);
    assert!(result.is_err());
    let err = result.unwrap_err();
    assert!(
        format!("{err}").contains("input too large"),
        "error was: {err}"
    );
}
/// `parse_relationships` must keep the full `Type` URI of every relationship,
/// for both image and hyperlink entries.
#[test]
fn test_docx_relationship_type_captured() {
    let rels_xml = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" Target="https://example.com" TargetMode="External"/></Relationships>"#;
    let parsed = parse_relationships(rels_xml);

    // (relationship id, expected type URI)
    let expectations = [
        (
            "rId1",
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image",
        ),
        (
            "rId2",
            "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        ),
    ];
    for &(rel_id, expected_type) in expectations.iter() {
        let actual_type = parsed.get(rel_id).map(|rel| rel.rel_type.as_str());
        assert_eq!(actual_type, Some(expected_type));
    }
}
// ---- Image extraction tests ----
/// Helper: build a DOCX with an embedded image file.
fn build_test_docx_with_image(
document_xml: &str,
rels_xml: &str,
image_path: &str,
image_data: &[u8],
) -> Vec<u8> {
use std::io::Write;
use zip::ZipWriter;
use zip::write::SimpleFileOptions;
let buf = Vec::new();
let mut zip = ZipWriter::new(Cursor::new(buf));
let opts = SimpleFileOptions::default();
// [Content_Types].xml
let ct = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Default Extension="png" ContentType="image/png"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>"#;
zip.start_file("[Content_Types].xml", opts).unwrap();
zip.write_all(ct.as_bytes()).unwrap();
// _rels/.rels
zip.start_file("_rels/.rels", opts).unwrap();
zip.write_all(
br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#,
)
.unwrap();
// word/document.xml
zip.start_file("word/document.xml", opts).unwrap();
zip.write_all(document_xml.as_bytes()).unwrap();
// word/_rels/document.xml.rels
zip.start_file("word/_rels/document.xml.rels", opts)
.unwrap();
zip.write_all(rels_xml.as_bytes()).unwrap();
// Image file
zip.start_file(image_path, opts).unwrap();
zip.write_all(image_data).unwrap();
let cursor = zip.finish().unwrap();
cursor.into_inner()
}
/// With `extract_images` on, the embedded image must be returned under its
/// file name together with its exact bytes.
#[test]
fn test_docx_image_extraction_enabled() {
    let fake_png = b"fake-png-data-for-test";
    let drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="Test image"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/></Relationships>"#;
    let archive = build_test_docx_with_image(
        &wrap_body(drawing),
        relationships,
        "word/media/image1.png",
        fake_png,
    );

    let options = ConversionOptions {
        extract_images: true,
        ..Default::default()
    };
    let result = DocxConverter.convert(&archive, &options).unwrap();

    assert!(!result.images.is_empty(), "expected extracted images");
    let first = &result.images[0];
    assert_eq!(first.0, "image1.png");
    assert_eq!(first.1, fake_png);
}
/// Default options must not return any extracted image data.
#[test]
fn test_docx_image_extraction_disabled_by_default() {
    let drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="Test"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/></Relationships>"#;
    let archive = build_test_docx_with_image(
        &wrap_body(drawing),
        relationships,
        "word/media/image1.png",
        b"fake-png-data",
    );

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    assert!(result.images.is_empty());
}
/// An image larger than `max_total_image_bytes` must be left out of the result
/// and reported with a `ResourceLimitReached` warning.
#[test]
fn test_docx_image_extraction_respects_budget() {
    let drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="Big"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/></Relationships>"#;
    // 1 KB payload against a 100-byte budget.
    let oversized_image = vec![0u8; 1024];
    let archive = build_test_docx_with_image(
        &wrap_body(drawing),
        relationships,
        "word/media/image1.png",
        &oversized_image,
    );

    let options = ConversionOptions {
        extract_images: true,
        max_total_image_bytes: 100,
        ..Default::default()
    };
    let result = DocxConverter.convert(&archive, &options).unwrap();

    // Nothing is extracted once the budget would be exceeded...
    assert!(result.images.is_empty());
    // ...and the skip is surfaced as a resource-limit warning.
    let limit_reported = result
        .warnings
        .iter()
        .any(|warning| warning.code == WarningCode::ResourceLimitReached);
    assert!(
        limit_reported,
        "expected ResourceLimitReached warning, got: {:?}",
        result.warnings
    );
}
// ---- Image describer tests ----
use crate::converter::ImageDescriber;
use std::sync::Arc;
/// Test double for [`ImageDescriber`] that always succeeds with a fixed description.
struct MockDescriber {
    /// Text handed back for every image, regardless of its bytes or MIME type.
    description: String,
}

impl ImageDescriber for MockDescriber {
    /// Ignores all inputs and returns a copy of the configured description.
    fn describe(
        &self,
        _image_bytes: &[u8],
        _mime_type: &str,
        _prompt: &str,
    ) -> Result<String, ConvertError> {
        let canned = self.description.to_owned();
        Ok(canned)
    }
}
/// Test double for [`ImageDescriber`] that fails on every call.
struct FailingDescriber;

impl ImageDescriber for FailingDescriber {
    /// Always returns `ConvertError::ImageDescriptionError` with the reason "API error".
    fn describe(
        &self,
        _image_bytes: &[u8],
        _mime_type: &str,
        _prompt: &str,
    ) -> Result<String, ConvertError> {
        let reason = String::from("API error");
        Err(ConvertError::ImageDescriptionError { reason })
    }
}
/// With a describer configured and an empty `descr`, the generated description
/// becomes the image alt text; no image bytes are returned because
/// `extract_images` stays at its default.
#[test]
fn test_docx_image_describer_replaces_alt_text() {
    let drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr=""/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/></Relationships>"#;
    let archive = build_test_docx_with_image(
        &wrap_body(drawing),
        relationships,
        "word/media/image1.png",
        b"fake-png-data",
    );

    let describer = MockDescriber {
        description: "A beautiful sunset over the ocean".to_string(),
    };
    let options = ConversionOptions {
        image_describer: Some(Arc::new(describer)),
        ..Default::default()
    };
    let result = DocxConverter.convert(&archive, &options).unwrap();

    let md = &result.markdown;
    assert!(
        md.contains("![A beautiful sunset over the ocean](image1.png)"),
        "markdown was: {}",
        md
    );
    // Describing an image does not imply extracting it.
    assert!(result.images.is_empty());
}
/// Describer and `extract_images` together: the description is used as alt text
/// and the image bytes are returned as well.
#[test]
fn test_docx_image_describer_with_extract_images() {
    let drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr=""/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/></Relationships>"#;
    let archive = build_test_docx_with_image(
        &wrap_body(drawing),
        relationships,
        "word/media/image1.png",
        b"fake-png-data",
    );

    let describer = MockDescriber {
        description: "Described image".to_string(),
    };
    let options = ConversionOptions {
        extract_images: true,
        image_describer: Some(Arc::new(describer)),
        ..Default::default()
    };
    let result = DocxConverter.convert(&archive, &options).unwrap();

    let md = &result.markdown;
    assert!(md.contains("![Described image](image1.png)"));
    assert!(!result.images.is_empty());
}
/// A relationship target written as a package-absolute path
/// (`/word/media/image1.png`) must still resolve so the image can be described.
#[test]
fn test_docx_image_describer_absolute_target_path() {
    let drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="Original alt"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="/word/media/image1.png"/></Relationships>"#;
    let archive = build_test_docx_with_image(
        &wrap_body(drawing),
        relationships,
        "word/media/image1.png",
        b"fake-png-data",
    );

    let describer = MockDescriber {
        description: "Described image".to_string(),
    };
    let options = ConversionOptions {
        image_describer: Some(Arc::new(describer)),
        ..Default::default()
    };
    let result = DocxConverter.convert(&archive, &options).unwrap();

    let md = &result.markdown;
    assert!(
        md.contains("![Described image](image1.png)"),
        "markdown was: {}",
        md
    );
}
/// A relationship target with a leading `./` (`./media/image1.png`) must still
/// resolve so the image can be described.
#[test]
fn test_docx_image_describer_dot_slash_target_path() {
    let drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="Original alt"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="./media/image1.png"/></Relationships>"#;
    let archive = build_test_docx_with_image(
        &wrap_body(drawing),
        relationships,
        "word/media/image1.png",
        b"fake-png-data",
    );

    let describer = MockDescriber {
        description: "Described image".to_string(),
    };
    let options = ConversionOptions {
        image_describer: Some(Arc::new(describer)),
        ..Default::default()
    };
    let result = DocxConverter.convert(&archive, &options).unwrap();

    let md = &result.markdown;
    assert!(
        md.contains("![Described image](image1.png)"),
        "markdown was: {}",
        md
    );
}
/// Helper: build a DOCX with multiple embedded image files.
///
/// Writes, in order: `[Content_Types].xml`, `_rels/.rels`, `word/document.xml`,
/// `word/_rels/document.xml.rels`, then one entry per `(zip_path, data)` pair in
/// `images`. Returns the finished archive bytes.
fn build_test_docx_with_images(
    document_xml: &str,
    rels_xml: &str,
    images: &[(&str, &[u8])], // (zip_path, data)
) -> Vec<u8> {
    use std::io::Write;
    use zip::ZipWriter;
    use zip::write::SimpleFileOptions;

    const CONTENT_TYPES: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Default Extension="xml" ContentType="application/xml"/><Default Extension="png" ContentType="image/png"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>"#;
    const ROOT_RELS: &[u8] = br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>"#;

    let mut archive = ZipWriter::new(Cursor::new(Vec::new()));
    let entry_opts = SimpleFileOptions::default();

    // Start an entry and write its full contents in one step.
    let mut add_entry = |name: &str, bytes: &[u8]| {
        archive.start_file(name, entry_opts).unwrap();
        archive.write_all(bytes).unwrap();
    };

    add_entry("[Content_Types].xml", CONTENT_TYPES.as_bytes());
    add_entry("_rels/.rels", ROOT_RELS);
    add_entry("word/document.xml", document_xml.as_bytes());
    add_entry("word/_rels/document.xml.rels", rels_xml.as_bytes());
    for (zip_path, bytes) in images {
        add_entry(*zip_path, *bytes);
    }

    archive.finish().unwrap().into_inner()
}
/// A mock describer whose answer depends on the image bytes.
///
/// The bytes are decoded lossily as UTF-8 and searched for the keywords "cat"
/// and "dog", in that order; anything else is described as "Unknown image".
struct MockDescriberByContent;

impl ImageDescriber for MockDescriberByContent {
    fn describe(
        &self,
        image_bytes: &[u8],
        _mime_type: &str,
        _prompt: &str,
    ) -> Result<String, ConvertError> {
        // (keyword, description); the first matching keyword wins.
        const KEYWORDS: [(&str, &str); 2] = [
            ("cat", "A photo of a cat"),
            ("dog", "A photo of a dog"),
        ];

        let text = String::from_utf8_lossy(image_bytes);
        let description = KEYWORDS
            .iter()
            .find(|(keyword, _)| text.contains(*keyword))
            .map_or("Unknown image", |(_, label)| *label);
        Ok(description.to_string())
    }
}
/// Two drawings that resolve to the same image file must each get a description,
/// without disturbing the text between them.
#[test]
fn test_docx_duplicate_image_filenames_independent_descriptions() {
    // Setup: two drawings use different relationship ids (rId2, rId3), and both
    // relationships carry the same Target (media/image1.png). The markdown therefore
    // contains the same image filename twice, and each occurrence has to be
    // replaced with the describer's output.
    let first_drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="First image"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let second_drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="Second image"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId3"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let mut body = String::new();
    body.push_str(first_drawing);
    body.push_str(&para("Text between images"));
    body.push_str(second_drawing);
    let document = wrap_body(&body);

    // rId2 and rId3 share one target file.
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/></Relationships>"#;
    let archive = build_test_docx_with_images(
        &document,
        relationships,
        &[("word/media/image1.png", b"fake-cat-png-data")],
    );

    let options = ConversionOptions {
        image_describer: Some(Arc::new(MockDescriberByContent)),
        ..Default::default()
    };
    let result = DocxConverter.convert(&archive, &options).unwrap();
    let md = &result.markdown;
    let described = "![A photo of a cat](image1.png)";

    assert!(
        md.contains(described),
        "expected first image described, markdown was: {}",
        md
    );
    assert!(
        md.contains("Text between images"),
        "expected text between images preserved, markdown was: {}",
        md
    );
    // Both occurrences must be described, not just the first one.
    let count = md.matches(described).count();
    assert_eq!(
        count, 2,
        "expected 2 described images, found {} in: {}",
        count, md
    );
}
/// Two different image files that share a basename (`media/a/image1.png` and
/// `media/b/image1.png`) must each receive the description of their own bytes.
#[test]
fn test_docx_duplicate_basenames_different_paths_independent_descriptions() {
    let first_drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="First image"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let second_drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="Second image"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId3"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let mut body = String::new();
    body.push_str(first_drawing);
    body.push_str(&para("Text between images"));
    body.push_str(second_drawing);
    let document = wrap_body(&body);

    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/a/image1.png"/><Relationship Id="rId3" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/b/image1.png"/></Relationships>"#;
    // The "cat" / "dog" markers in the bytes drive MockDescriberByContent.
    let archive = build_test_docx_with_images(
        &document,
        relationships,
        &[
            ("word/media/a/image1.png", b"fake-cat-png-data"),
            ("word/media/b/image1.png", b"fake-dog-png-data"),
        ],
    );

    let options = ConversionOptions {
        image_describer: Some(Arc::new(MockDescriberByContent)),
        ..Default::default()
    };
    let result = DocxConverter.convert(&archive, &options).unwrap();
    let md = &result.markdown;

    assert!(
        md.contains("![A photo of a cat](image1.png)"),
        "expected cat description, markdown was: {}",
        md
    );
    assert!(
        md.contains("![A photo of a dog](image1.png)"),
        "expected dog description, markdown was: {}",
        md
    );
}
/// When the describer fails, the document's own alt text is kept and the failure
/// is reported as a `SkippedElement` warning.
#[test]
fn test_docx_image_describer_error_keeps_original_alt() {
    let drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="Original alt"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/image1.png"/></Relationships>"#;
    let archive = build_test_docx_with_image(
        &wrap_body(drawing),
        relationships,
        "word/media/image1.png",
        b"fake-png-data",
    );

    let options = ConversionOptions {
        image_describer: Some(Arc::new(FailingDescriber)),
        ..Default::default()
    };
    let result = DocxConverter.convert(&archive, &options).unwrap();

    let md = &result.markdown;
    assert!(
        md.contains("![Original alt](image1.png)"),
        "markdown was: {}",
        md
    );

    let failure_reported = result.warnings.iter().any(|warning| {
        warning.code == WarningCode::SkippedElement
            && warning.message.contains("image description failed")
    });
    assert!(
        failure_reported,
        "expected SkippedElement warning for image description failure"
    );
}
// ---- Plain text output tests ----
/// Headings keep their `#` markers in markdown, while `plain_text` carries only
/// the heading and paragraph text.
#[test]
fn test_docx_plain_text_paragraphs_and_headings() {
    let mut body = String::new();
    body.push_str(&heading_para("My Title", 1));
    body.push_str(&para("Normal paragraph."));
    body.push_str(&heading_para("Section", 2));
    let archive = build_test_docx(&wrap_body(&body), None, None);

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;
    let plain = &result.plain_text;

    // Markdown side: ATX heading markers are present.
    assert!(md.contains("# My Title"));
    assert!(md.contains("## Section"));

    // Plain-text side: markers are gone, the text is still there.
    assert!(
        !plain.contains('#'),
        "plain_text should not contain # heading markers, was: {}",
        plain
    );
    let expected_texts = [
        ("My Title", "heading text"),
        ("Normal paragraph.", "paragraph text"),
        ("Section", "second heading text"),
    ];
    for &(needle, what) in expected_texts.iter() {
        assert!(
            plain.contains(needle),
            "plain_text should contain {}, was: {}",
            what,
            plain
        );
    }
}
/// Bold/italic runs are wrapped in `*` markers in markdown and appear as bare
/// text in `plain_text`.
#[test]
fn test_docx_plain_text_no_bold_italic_markers() {
    let mut body = String::new();
    body.push_str(&bold_para("Bold text"));
    body.push_str(&italic_para("Italic text"));
    body.push_str(&bold_italic_para("Both"));
    let archive = build_test_docx(&wrap_body(&body), None, None);

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;
    let plain = &result.plain_text;

    // Markdown carries the emphasis markers.
    assert!(md.contains("**Bold text**"));
    assert!(md.contains("*Italic text*"));
    assert!(md.contains("***Both***"));

    // Plain text carries none of them.
    assert!(
        !plain.contains('*'),
        "plain_text should not contain * markers, was: {}",
        plain
    );
    assert!(plain.contains("Bold text"));
    assert!(plain.contains("Italic text"));
    assert!(plain.contains("Both"));
}
/// A hyperlink renders as `[text](url)` in markdown; `plain_text` keeps only the
/// link text, without brackets or the URL.
#[test]
fn test_docx_plain_text_hyperlink_no_markdown_syntax() {
    let link_paragraph = r#"<w:p><w:hyperlink r:id="rId1"><w:r><w:t>Example Link</w:t></w:r></w:hyperlink></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" Target="https://example.com" TargetMode="External"/></Relationships>"#;
    let archive = build_test_docx(&wrap_body(link_paragraph), None, Some(relationships));

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;
    let plain = &result.plain_text;

    // Markdown side: full link syntax.
    assert!(md.contains("[Example Link](https://example.com)"));

    // Plain-text side: text only.
    assert!(
        plain.contains("Example Link"),
        "plain_text should contain link text, was: {}",
        plain
    );
    assert!(
        !plain.contains('['),
        "plain_text should not contain [ bracket, was: {}",
        plain
    );
    assert!(
        !plain.contains("https://example.com"),
        "plain_text should not contain URL, was: {}",
        plain
    );
}
/// A 2x2 table becomes a pipe table in markdown and tab-separated rows in
/// `plain_text`, with no pipes and no separator row.
#[test]
fn test_docx_plain_text_table_tab_separated() {
    let table = r#"<w:tbl><w:tr><w:tc><w:p><w:r><w:t>H1</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>H2</w:t></w:r></w:p></w:tc></w:tr><w:tr><w:tc><w:p><w:r><w:t>A</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>B</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"#;
    let archive = build_test_docx(&wrap_body(table), None, None);

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;
    let plain = &result.plain_text;

    // Markdown side: header row plus separator row.
    assert!(md.contains("| H1 | H2 |"));
    assert!(md.contains("|---|---|"));

    // Plain-text side: cells joined by tabs, nothing else.
    assert!(
        plain.contains("H1\tH2"),
        "plain_text should have tab-separated headers, was: {}",
        plain
    );
    assert!(
        plain.contains("A\tB"),
        "plain_text should have tab-separated data, was: {}",
        plain
    );
    assert!(
        !plain.contains('|'),
        "plain_text should not contain pipe characters, was: {}",
        plain
    );
    assert!(
        !plain.contains("---"),
        "plain_text should not contain separator row, was: {}",
        plain
    );
}
/// An image appears as `![alt](file)` in markdown; `plain_text` must contain
/// neither the `![` marker nor the image filename.
#[test]
fn test_docx_plain_text_image_no_markdown_syntax() {
    let drawing = r#"<w:p><w:r><w:drawing><wp:inline><wp:docPr descr="A photo"/><a:graphic><a:graphicData><pic:pic><pic:blipFill><a:blip r:embed="rId2"/></pic:blipFill></pic:pic></a:graphicData></a:graphic></wp:inline></w:drawing></w:r></w:p>"#;
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/photo.jpg"/></Relationships>"#;
    let archive = build_test_docx(&wrap_body(drawing), None, Some(relationships));

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;
    let plain = &result.plain_text;

    // Markdown side: image syntax referencing the file.
    assert!(md.contains("photo.jpg"));
    assert!(md.contains("!["));

    // Plain-text side: no image syntax and no filename.
    assert!(
        !plain.contains("!["),
        "plain_text should not contain ![ image syntax, was: {}",
        plain
    );
    assert!(
        !plain.contains("photo.jpg"),
        "plain_text should not contain image filename, was: {}",
        plain
    );
}
/// Bullet list items get `- ` markers in markdown and none in `plain_text`.
#[test]
fn test_docx_plain_text_list_no_markers() {
    // numId 1 -> abstractNum 0, whose level 0 uses the `bullet` format.
    let numbering_xml = r#"<?xml version="1.0" encoding="UTF-8"?><w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:abstractNum w:abstractNumId="0"><w:lvl w:ilvl="0"><w:numFmt w:val="bullet"/></w:lvl></w:abstractNum><w:num w:numId="1"><w:abstractNumId w:val="0"/></w:num></w:numbering>"#;
    let list_items = r#"<w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>Item 1</w:t></w:r></w:p><w:p><w:pPr><w:numPr><w:ilvl w:val="0"/><w:numId w:val="1"/></w:numPr></w:pPr><w:r><w:t>Item 2</w:t></w:r></w:p>"#;
    let archive =
        build_test_docx_with_numbering(&wrap_body(list_items), None, None, Some(numbering_xml));

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;
    let plain = &result.plain_text;

    // Markdown side: one bullet per item.
    assert!(md.contains("- Item 1"));
    assert!(md.contains("- Item 2"));

    // Plain-text side: item text without the bullet prefix.
    assert!(
        !plain.contains("- "),
        "plain_text should not contain bullet markers, was: {}",
        plain
    );
    assert!(plain.contains("Item 1"));
    assert!(plain.contains("Item 2"));
}
/// Bold text inside a table cell keeps its `**` markers in markdown but must
/// appear unformatted in `plain_text`.
#[test]
fn test_docx_plain_text_table_bold_cells_no_formatting() {
    // Second row, first cell carries a bold run.
    let table = r#"<w:tbl><w:tr><w:tc><w:p><w:r><w:t>Header</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>Value</w:t></w:r></w:p></w:tc></w:tr><w:tr><w:tc><w:p><w:r><w:rPr><w:b/></w:rPr><w:t>Bold Cell</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>Normal</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"#;
    let archive = build_test_docx(&wrap_body(table), None, None);

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;
    let plain = &result.plain_text;

    assert!(
        md.contains("**Bold Cell**"),
        "markdown should contain bold markers, was: {}",
        md
    );
    assert!(
        !plain.contains("**"),
        "plain_text should not contain ** markers, was: {}",
        plain
    );
    assert!(
        plain.contains("Bold Cell"),
        "plain_text should contain the cell text"
    );
}
/// A hyperlink inside a table cell renders as `[text](url)` in markdown, while
/// `plain_text` keeps only the link text.
#[test]
fn test_docx_plain_text_table_hyperlink_cells_no_markdown() {
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" Target="https://example.com" TargetMode="External"/></Relationships>"#;
    let table = r#"<w:tbl><w:tr><w:tc><w:p><w:r><w:t>Name</w:t></w:r></w:p></w:tc></w:tr><w:tr><w:tc><w:p><w:hyperlink r:id="rId1"><w:r><w:t>Click Here</w:t></w:r></w:hyperlink></w:p></w:tc></w:tr></w:tbl>"#;
    let archive = build_test_docx(&wrap_body(table), None, Some(relationships));

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;
    let plain = &result.plain_text;

    // Markdown side: link syntax survives inside the cell.
    assert!(
        md.contains("[Click Here](https://example.com)"),
        "markdown should contain link syntax, was: {}",
        md
    );

    // Plain-text side: no bracket, no URL, text intact.
    assert!(
        !plain.contains('['),
        "plain_text should not contain [ from link syntax, was: {}",
        plain
    );
    assert!(
        !plain.contains("https://example.com"),
        "plain_text should not contain URL, was: {}",
        plain
    );
    assert!(
        plain.contains("Click Here"),
        "plain_text should contain the link text"
    );
}
/// The title taken from a bold Heading 1 must be plain text without `**` markers.
#[test]
fn test_docx_title_no_markdown_formatting() {
    let bold_heading = r#"<w:p><w:pPr><w:pStyle w:val="Heading1"/></w:pPr><w:r><w:rPr><w:b/></w:rPr><w:t>Bold Title</w:t></w:r></w:p>"#;
    let archive = build_test_docx(&wrap_body(bold_heading), None, None);

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let title = result.title.as_deref();

    assert_eq!(
        title,
        Some("Bold Title"),
        "title should not contain markdown bold markers, was: {:?}",
        result.title
    );
    assert!(
        !title.unwrap_or("").contains("**"),
        "title must not contain ** markers"
    );
}
/// The title taken from a hyperlinked Heading 1 must be plain text without
/// `[text](url)` syntax.
#[test]
fn test_docx_title_hyperlink_no_markdown_syntax() {
    let relationships = r#"<?xml version="1.0" encoding="UTF-8"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink" Target="https://example.com" TargetMode="External"/></Relationships>"#;
    let linked_heading = r#"<w:p><w:pPr><w:pStyle w:val="Heading1"/></w:pPr><w:hyperlink r:id="rId1"><w:r><w:t>Linked Title</w:t></w:r></w:hyperlink></w:p>"#;
    let archive = build_test_docx(&wrap_body(linked_heading), None, Some(relationships));

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let title = result.title.as_deref();

    assert_eq!(
        title,
        Some("Linked Title"),
        "title should not contain markdown link syntax, was: {:?}",
        result.title
    );
    assert!(
        !title.unwrap_or("").contains('['),
        "title must not contain [ from link syntax"
    );
}
// ---- mc:AlternateContent tests ----
/// For `mc:AlternateContent` with both branches, only the `mc:Fallback` text
/// may appear in the output; the `mc:Choice` text must be dropped.
#[test]
fn test_docx_alternate_content_fallback_used() {
    let alternate_content = r#"<mc:AlternateContent><mc:Choice Requires="wps"><w:p><w:r><w:t>Choice text (should be hidden)</w:t></w:r></w:p></mc:Choice><mc:Fallback><w:p><w:r><w:t>Fallback text visible</w:t></w:r></w:p></mc:Fallback></mc:AlternateContent>"#;
    let archive = build_test_docx(&wrap_body(alternate_content), None, None);

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;

    assert!(md.contains("Fallback text visible"), "markdown was: {}", md);
    assert!(
        !md.contains("Choice text"),
        "Choice text should be skipped, markdown was: {}",
        md
    );
}
/// `mc:AlternateContent` with only an `mc:Choice` branch contributes nothing,
/// and the paragraphs around it are still emitted.
#[test]
fn test_docx_alternate_content_choice_skipped() {
    let body = r#"<w:p><w:r><w:t>Before AC</w:t></w:r></w:p><mc:AlternateContent><mc:Choice Requires="wps"><w:p><w:r><w:t>Hidden</w:t></w:r></w:p></mc:Choice></mc:AlternateContent><w:p><w:r><w:t>After AC</w:t></w:r></w:p>"#;
    let archive = build_test_docx(&wrap_body(body), None, None);

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;

    // Surrounding paragraphs survive...
    assert!(md.contains("Before AC"));
    assert!(md.contains("After AC"));
    // ...while the Choice-only content does not.
    assert!(
        !md.contains("Hidden"),
        "Choice content should be skipped, markdown was: {}",
        md
    );
}
// ---- Text box tests ----
/// Text nested as `w:pict > v:shape > v:textbox > w:txbxContent > w:p` must
/// show up in the markdown output.
#[test]
fn test_docx_textbox_basic() {
    let text_box = r#"<w:p><w:r><w:pict><v:shape><v:textbox><w:txbxContent><w:p><w:r><w:t>Text box content</w:t></w:r></w:p></w:txbxContent></v:textbox></v:shape></w:pict></w:r></w:p>"#;
    let archive = build_test_docx(&wrap_body(text_box), None, None);

    let result = DocxConverter
        .convert(&archive, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;

    assert!(md.contains("Text box content"), "markdown was: {}", md);
}
#[test]
fn test_docx_textbox_with_formatting() {
    // A run marked bold (w:b) inside a text box must come out as Markdown bold.
    let xml = r#"<w:p><w:r><w:pict><v:shape><v:textbox><w:txbxContent><w:p><w:r><w:rPr><w:b/></w:rPr><w:t>Bold in box</w:t></w:r></w:p></w:txbxContent></v:textbox></v:shape></w:pict></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(xml), None, None);
    let result = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;

    assert!(md.contains("**Bold in box**"), "markdown was: {}", md);
}
#[test]
fn test_docx_textbox_multiple_paragraphs() {
    // A single text box holding two paragraphs: both must be present in the output.
    let xml = r#"<w:p><w:r><w:pict><v:shape><v:textbox><w:txbxContent><w:p><w:r><w:t>First TB para</w:t></w:r></w:p><w:p><w:r><w:t>Second TB para</w:t></w:r></w:p></w:txbxContent></v:textbox></v:shape></w:pict></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(xml), None, None);
    let result = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;

    // Same check for each paragraph, in source order.
    for expected in ["First TB para", "Second TB para"] {
        assert!(md.contains(expected), "markdown was: {}", md);
    }
}
#[test]
fn test_docx_textbox_via_alternate_content() {
    // End-to-end path: mc:AlternateContent > mc:Fallback > w:pict > v:shape > v:textbox.
    // The VML text box in the Fallback is rendered; the Choice branch is not.
    let xml = r#"<mc:AlternateContent><mc:Choice Requires="wps"><w:p><w:r><w:t>DrawingML choice</w:t></w:r></w:p></mc:Choice><mc:Fallback><w:p><w:r><w:pict><v:shape><v:textbox><w:txbxContent><w:p><w:r><w:t>VML text box</w:t></w:r></w:p></w:txbxContent></v:textbox></v:shape></w:pict></w:r></w:p></mc:Fallback></mc:AlternateContent>"#;
    let docx = build_test_docx(&wrap_body(xml), None, None);
    let result = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;

    assert!(md.contains("VML text box"), "markdown was: {}", md);
    assert!(
        !md.contains("DrawingML choice"),
        "Choice content should be hidden, markdown was: {}",
        md
    );
}
#[test]
fn test_docx_textbox_between_paragraphs() {
    // Text box surrounded by normal paragraphs — verify document flow: all three pieces
    // of text must be present AND appear in the same order as in the source document.
    let body = format!(
        "{}{}{}",
        para("Before text box."),
        r#"<w:p><w:r><w:pict><v:shape><v:textbox><w:txbxContent><w:p><w:r><w:t>Inside box</w:t></w:r></w:p></w:txbxContent></v:textbox></v:shape></w:pict></w:r></w:p>"#,
        para("After text box.")
    );
    let doc = wrap_body(&body);
    let data = build_test_docx(&doc, None, None);
    let converter = DocxConverter;
    let result = converter
        .convert(&data, &ConversionOptions::default())
        .unwrap();
    let md = &result.markdown;

    // Byte offset of a fragment in the output; panics with the full Markdown when the
    // fragment is missing, which doubles as the presence check.
    let offset_of = |needle: &str| -> usize {
        md.find(needle)
            .unwrap_or_else(|| panic!("missing {:?}, markdown was: {}", needle, md))
    };
    let before = offset_of("Before text box.");
    let inside = offset_of("Inside box");
    let after = offset_of("After text box.");

    // Containment alone would also pass for reordered output, so pin the ordering.
    assert!(
        before < inside && inside < after,
        "text box content is out of document order, markdown was: {}",
        md
    );
}
#[test]
fn test_docx_textbox_unicode() {
    // CJK and emoji in a text box must pass through the conversion unchanged.
    let body = r#"<w:p><w:r><w:pict><v:shape><v:textbox><w:txbxContent><w:p><w:r><w:t>한국어 🚀 中文</w:t></w:r></w:p></w:txbxContent></v:textbox></v:shape></w:pict></w:r></w:p>"#;
    let doc = wrap_body(body);
    let data = build_test_docx(&doc, None, None);
    let converter = DocxConverter;
    let result = converter
        .convert(&data, &ConversionOptions::default())
        .unwrap();
    // Each fragment is checked separately, and every failure prints the produced
    // Markdown so that mangled characters are visible in the test output.
    assert!(
        result.markdown.contains("한국어"),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("🚀"),
        "markdown was: {}",
        result.markdown
    );
    assert!(
        result.markdown.contains("中文"),
        "markdown was: {}",
        result.markdown
    );
}
#[test]
fn test_docx_textbox_empty() {
    // A text box whose w:txbxContent has no children: conversion must succeed and
    // yield nothing but whitespace.
    let xml = r#"<w:p><w:r><w:pict><v:shape><v:textbox><w:txbxContent></w:txbxContent></v:textbox></v:shape></w:pict></w:r></w:p>"#;
    let docx = build_test_docx(&wrap_body(xml), None, None);
    let result = DocxConverter
        .convert(&docx, &ConversionOptions::default())
        .unwrap();

    // Compare after trimming, but report the untrimmed output on failure.
    let visible = result.markdown.trim();
    assert!(
        visible.is_empty(),
        "expected empty output, got: {}",
        result.markdown
    );
}
}