use std::collections::HashMap;
use crate::charts;
use crate::container::OoxmlContainer;
use crate::error::{Error, Result};
use crate::model::{
Block, Cell, CellAlignment, Document, ListInfo, ListType, Metadata, Paragraph, Resource,
ResourceType, RevisionType, Row, Section, Table, TextAlignment, TextRun, TextStyle,
VerticalAlignment,
};
use super::numbering::NumberingMap;
use super::styles::StyleMap;
/// Parser state for one DOCX package: the opened container plus the lookup
/// tables that are read from it when the parser is constructed.
pub struct DocxParser {
/// The OOXML package that all parts are read from.
container: OoxmlContainer,
/// Styles parsed from `word/styles.xml`; the default map when that part is absent.
styles: StyleMap,
/// Numbering definitions parsed from `word/numbering.xml`; default when absent.
numbering: NumberingMap,
/// Relationships read for the `word/document.xml` part.
relationships: crate::container::Relationships,
/// Footnote text keyed by the note's `w:id`, read from `word/footnotes.xml`.
footnotes: HashMap<String, String>,
/// Endnote text keyed by the note's `w:id`, read from `word/endnotes.xml`.
endnotes: HashMap<String, String>,
}
impl DocxParser {
/// Opens the OOXML package at `path` and builds a parser around it.
///
/// # Errors
/// Returns the error from opening the container, or from loading the
/// auxiliary parts in `from_container`.
pub fn open(path: impl AsRef<std::path::Path>) -> Result<Self> {
    Self::from_container(OoxmlContainer::open(path)?)
}
/// Builds a parser from an in-memory copy of the package.
///
/// # Errors
/// Returns the error from reading the container out of `data`, or from
/// loading the auxiliary parts in `from_container`.
pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
    Self::from_container(OoxmlContainer::from_bytes(data)?)
}
fn from_container(container: OoxmlContainer) -> Result<Self> {
let styles = match container.read_xml_optional("word/styles.xml")? {
Some(xml) => StyleMap::parse(&xml)?,
None => StyleMap::default(),
};
let numbering = match container.read_xml_optional("word/numbering.xml")? {
Some(xml) => NumberingMap::parse(&xml)?,
None => NumberingMap::default(),
};
let relationships = container.read_optional_relationships_for_part("word/document.xml")?;
let footnotes = match container.read_xml_optional("word/footnotes.xml")? {
Some(xml) => parse_notes_xml(&xml, b"w:footnote"),
None => HashMap::new(),
};
let endnotes = match container.read_xml_optional("word/endnotes.xml")? {
Some(xml) => parse_notes_xml(&xml, b"w:endnote"),
None => HashMap::new(),
};
Ok(Self {
container,
styles,
numbering,
relationships,
footnotes,
endnotes,
})
}
pub fn parse(&mut self) -> Result<Document> {
let mut doc = Document::new();
doc.metadata = self.parse_metadata()?;
let mut main_section = self.parse_document_xml()?;
let chart_tables = self.parse_charts()?;
for table in chart_tables {
main_section.add_block(Block::Table(table));
}
if !self.footnotes.is_empty() {
let mut ids: Vec<&String> = self.footnotes.keys().collect();
ids.sort_by(|a, b| {
a.parse::<u64>()
.unwrap_or(u64::MAX)
.cmp(&b.parse::<u64>().unwrap_or(u64::MAX))
});
for id in ids {
if let Some(text) = self.footnotes.get(id) {
let para = Paragraph::with_text(format!("[^{}]: {}", id, text));
main_section.add_block(Block::Paragraph(para));
}
}
}
if !self.endnotes.is_empty() {
let mut ids: Vec<&String> = self.endnotes.keys().collect();
ids.sort_by(|a, b| {
a.parse::<u64>()
.unwrap_or(u64::MAX)
.cmp(&b.parse::<u64>().unwrap_or(u64::MAX))
});
for id in ids {
if let Some(text) = self.endnotes.get(id) {
let para = Paragraph::with_text(format!("[^e{}]: {}", id, text));
main_section.add_block(Block::Paragraph(para));
}
}
}
doc.add_section(main_section);
self.extract_resources(&mut doc)?;
Ok(doc)
}
/// Reads the document metadata by delegating to the container's
/// `parse_core_metadata`.
fn parse_metadata(&self) -> Result<Metadata> {
self.container.parse_core_metadata()
}
/// Converts every chart referenced from the main document part into a table.
///
/// Relationship types containing `"chart"` are followed. A target starting
/// with `/` is taken relative to the package root, anything else relative to
/// `word/`. Charts without data are skipped. When a chart has a non-empty
/// title, it is appended in parentheses to the text of the first cell of the
/// first row.
///
/// # Errors
/// Fails if a chart part cannot be read or parsed.
fn parse_charts(&self) -> Result<Vec<Table>> {
    let mut tables = Vec::new();
    for (rel_type, rels) in &self.relationships.by_type {
        if !rel_type.contains("chart") {
            continue;
        }
        for rel in rels {
            let chart_path = match rel.target.strip_prefix('/') {
                Some(from_root) => from_root.to_string(),
                None => format!("word/{}", rel.target),
            };
            let chart_xml = self.container.read_xml(&chart_path)?;
            let chart_data = charts::parse_chart_xml(&chart_xml)?;
            if chart_data.is_empty() {
                continue;
            }
            let mut table = chart_data.to_table();
            if let Some(title) = chart_data.title.as_ref().filter(|t| !t.is_empty()) {
                let corner = table
                    .rows
                    .first_mut()
                    .and_then(|row| row.cells.first_mut());
                if let Some(first_cell) = corner {
                    let labelled = format!("{} ({})", first_cell.plain_text(), title);
                    first_cell.content.clear();
                    first_cell.content.push(Paragraph::with_text(labelled));
                }
            }
            tables.push(table);
        }
    }
    Ok(tables)
}
/// Streams `word/document.xml` and assembles the main [`Section`].
///
/// Each top-level paragraph and table inside `w:body` is re-serialized into a
/// scratch string and handed to `parse_paragraph` / `parse_table` when its
/// closing tag is reached. Text-box paragraphs found inside a paragraph are
/// added right after that paragraph. A paragraph or table that fails to
/// parse is dropped rather than aborting the document.
///
/// The `default` header and footer references of a body-level `w:sectPr`
/// are resolved after the scan and stored on the section when non-empty.
///
/// # Errors
/// Fails if the part cannot be read, if the XML reader reports an error, or
/// if a referenced header/footer part cannot be read.
fn parse_document_xml(&mut self) -> Result<Section> {
    use quick_xml::events::{BytesStart, Event};

    /// Appends `e` to `out` as an opening (or self-closing) tag, copying the
    /// attribute values exactly as they appear in the input.
    fn push_tag(out: &mut String, e: &BytesStart<'_>, self_closing: bool) {
        out.push('<');
        out.push_str(&String::from_utf8_lossy(e.name().as_ref()));
        for attr in e.attributes().flatten() {
            out.push_str(&format!(
                " {}=\"{}\"",
                String::from_utf8_lossy(attr.key.as_ref()),
                String::from_utf8_lossy(&attr.value)
            ));
        }
        out.push_str(if self_closing { "/>" } else { ">" });
    }

    /// Appends the closing tag for `name` to `out`.
    fn push_end_tag(out: &mut String, name: &[u8]) {
        out.push_str("</");
        out.push_str(&String::from_utf8_lossy(name));
        out.push('>');
    }

    let xml = self.container.read_xml("word/document.xml")?;
    let mut section = Section::new(0);
    let mut reader = quick_xml::Reader::from_str(&xml);
    reader.config_mut().trim_text(false);
    let mut buf = Vec::new();
    let mut in_body = false;
    // Scratch buffers holding the paragraph / table currently being captured.
    let mut p_xml = String::new();
    let mut tbl_xml = String::new();
    let mut in_paragraph = false;
    // Number of `w:p` elements currently open inside the captured paragraph.
    let mut para_depth: u32 = 0;
    // Number of `w:tbl` elements currently open at body level.
    let mut table_depth: u32 = 0;
    let mut in_sect_pr = false;
    let mut default_header_rid: Option<String> = None;
    let mut default_footer_rid: Option<String> = None;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.name();
                match name.as_ref() {
                    b"w:body" => in_body = true,
                    b"w:p" if in_body && table_depth == 0 && !in_paragraph => {
                        in_paragraph = true;
                        p_xml.clear();
                        push_tag(&mut p_xml, e, false);
                    }
                    b"w:sectPr" if in_body && !in_paragraph && table_depth == 0 => {
                        in_sect_pr = true;
                    }
                    // A table opened inside a captured paragraph (for example
                    // in a text box) belongs to that paragraph's XML, so it
                    // must not start a body-level table.
                    b"w:tbl" if in_body && !in_paragraph => {
                        if table_depth == 0 {
                            tbl_xml.clear();
                        }
                        table_depth += 1;
                        tbl_xml.push_str("<w:tbl>");
                    }
                    _ if in_paragraph => {
                        if name.as_ref() == b"w:p" {
                            para_depth += 1;
                        }
                        push_tag(&mut p_xml, e, false);
                    }
                    _ if table_depth > 0 => push_tag(&mut tbl_xml, e, false),
                    _ => {}
                }
            }
            Ok(Event::Empty(ref e)) => {
                if in_sect_pr {
                    let name = e.name();
                    let is_header = name.as_ref() == b"w:headerReference";
                    if is_header || name.as_ref() == b"w:footerReference" {
                        let mut ref_type = String::new();
                        let mut r_id = String::new();
                        for attr in e.attributes().flatten() {
                            match attr.key.as_ref() {
                                b"w:type" => {
                                    ref_type = String::from_utf8_lossy(&attr.value).to_string();
                                }
                                b"r:id" => {
                                    r_id = String::from_utf8_lossy(&attr.value).to_string();
                                }
                                _ => {}
                            }
                        }
                        if ref_type == "default" && !r_id.is_empty() {
                            if is_header {
                                default_header_rid = Some(r_id);
                            } else {
                                default_footer_rid = Some(r_id);
                            }
                        }
                    }
                } else if in_paragraph {
                    push_tag(&mut p_xml, e, true);
                } else if table_depth > 0 {
                    push_tag(&mut tbl_xml, e, true);
                }
            }
            Ok(Event::Text(ref e)) => {
                // Text is decoded and then re-escaped before it is copied.
                if in_paragraph {
                    let text = crate::decode::decode_text_lossy(e);
                    p_xml.push_str(&escape_xml(&text));
                } else if table_depth > 0 {
                    let text = crate::decode::decode_text_lossy(e);
                    tbl_xml.push_str(&escape_xml(&text));
                }
            }
            Ok(Event::End(ref e)) => {
                let name = e.name();
                match name.as_ref() {
                    b"w:body" => in_body = false,
                    b"w:sectPr" if in_sect_pr => in_sect_pr = false,
                    b"w:p" if in_paragraph && table_depth == 0 && para_depth == 0 => {
                        p_xml.push_str("</w:p>");
                        let textbox_paras = self.extract_textbox_paragraphs(&p_xml);
                        if let Ok(para) = self.parse_paragraph(&p_xml) {
                            section.add_block(Block::Paragraph(para));
                        }
                        for tb_para in textbox_paras {
                            section.add_block(Block::Paragraph(tb_para));
                        }
                        in_paragraph = false;
                    }
                    b"w:tbl" if table_depth > 0 => {
                        tbl_xml.push_str("</w:tbl>");
                        table_depth -= 1;
                        if table_depth == 0 {
                            if let Ok(table) = self.parse_table(&tbl_xml) {
                                section.add_block(Block::Table(table));
                            }
                        }
                    }
                    _ if in_paragraph => {
                        if name.as_ref() == b"w:p" {
                            para_depth = para_depth.saturating_sub(1);
                        }
                        push_end_tag(&mut p_xml, name.as_ref());
                    }
                    _ if table_depth > 0 => push_end_tag(&mut tbl_xml, name.as_ref()),
                    _ => {}
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(Error::xml_parse_with_context(
                    e.to_string(),
                    "word/document.xml",
                ));
            }
            _ => {}
        }
        buf.clear();
    }
    if let Some(rid) = default_header_rid {
        if let Some(paragraphs) = self.parse_header_footer_by_rid(&rid)? {
            if !paragraphs.is_empty() {
                section.header = Some(paragraphs);
            }
        }
    }
    if let Some(rid) = default_footer_rid {
        if let Some(paragraphs) = self.parse_header_footer_by_rid(&rid)? {
            if !paragraphs.is_empty() {
                section.footer = Some(paragraphs);
            }
        }
    }
    Ok(section)
}
/// Resolves a header/footer relationship id and parses the referenced part.
///
/// Returns `Ok(None)` when `rid` is not a known relationship or when the
/// target part does not exist in the container.
///
/// # Errors
/// Propagates read failures from the container.
fn parse_header_footer_by_rid(&self, rid: &str) -> Result<Option<Vec<Paragraph>>> {
    match self.relationships.get(rid) {
        None => Ok(None),
        Some(rel) => {
            let part_path = OoxmlContainer::resolve_path("word/document.xml", &rel.target);
            let xml = self.container.read_xml_optional(&part_path)?;
            Ok(xml.map(|content| parse_header_footer_xml(&content)))
        }
    }
}
fn parse_paragraph(&mut self, xml: &str) -> Result<Paragraph> {
use crate::model::InlineImage;
let mut para = Paragraph::new();
let mut reader = quick_xml::Reader::from_str(xml);
reader.config_mut().trim_text(false);
let mut buf = Vec::new();
let mut in_ppr = false;
let mut in_rpr = false;
let mut in_run = false;
let mut in_text = false; let mut in_instr_text = false; let mut in_drawing = false; let mut in_ins = false; let mut in_del = false; let mut txbx_content_depth: u32 = 0; let mut mc_fallback_depth: u32 = 0; let mut current_style = TextStyle::default();
let mut current_hyperlink: Option<String> = None;
let mut current_image_alt: Option<String> = None;
loop {
match reader.read_event_into(&mut buf) {
Ok(quick_xml::events::Event::Start(ref e)) => match e.name().as_ref() {
b"mc:Fallback" => {
mc_fallback_depth += 1;
}
b"w:txbxContent" if mc_fallback_depth == 0 => {
txbx_content_depth += 1;
}
_ if mc_fallback_depth > 0 => {} _ if txbx_content_depth > 0 => {} b"w:pPr" => in_ppr = true,
b"w:rPr" => in_rpr = true,
b"w:r" => {
in_run = true;
current_style = TextStyle::default();
}
b"w:t" => in_text = true,
b"w:instrText" => in_instr_text = true,
b"w:drawing" => {
in_drawing = true;
current_image_alt = None;
}
b"w:ins" => in_ins = true,
b"w:del" => in_del = true,
b"w:hyperlink" => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"r:id" {
let rel_id = String::from_utf8_lossy(&attr.value);
if let Some(rel) = self.relationships.get(&rel_id) {
current_hyperlink = Some(rel.target.clone());
}
}
}
}
_ => {}
},
Ok(quick_xml::events::Event::Empty(ref e)) => match e.name().as_ref() {
_ if mc_fallback_depth > 0 || txbx_content_depth > 0 => {} b"w:pStyle" if in_ppr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
let style_id = String::from_utf8_lossy(&attr.value);
para.style_id = Some(style_id.to_string());
para.heading = self.styles.get_heading_level(&style_id);
if let Some(style) = self.styles.styles.get(style_id.as_ref()) {
if !style.name.is_empty() {
para.style_name = Some(style.name.clone());
}
}
}
}
}
b"w:jc" if in_ppr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
let val = String::from_utf8_lossy(&attr.value);
para.alignment = match val.as_ref() {
"center" => TextAlignment::Center,
"right" => TextAlignment::Right,
"both" | "distribute" => TextAlignment::Justify,
_ => TextAlignment::Left,
};
}
}
}
b"w:b" if in_rpr => {
let val = get_bool_attr(e, b"w:val");
current_style.bold = val.unwrap_or(true);
}
b"w:i" if in_rpr => {
let val = get_bool_attr(e, b"w:val");
current_style.italic = val.unwrap_or(true);
}
b"w:u" if in_rpr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
let val = String::from_utf8_lossy(&attr.value);
current_style.underline = val != "none";
}
}
}
b"w:strike" if in_rpr => {
let val = get_bool_attr(e, b"w:val");
current_style.strikethrough = val.unwrap_or(true);
}
b"w:vertAlign" if in_rpr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
let val = String::from_utf8_lossy(&attr.value);
match val.as_ref() {
"superscript" => current_style.superscript = true,
"subscript" => current_style.subscript = true,
_ => {}
}
}
}
}
b"w:sz" if in_rpr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
let val = String::from_utf8_lossy(&attr.value);
current_style.size = val.parse().ok();
}
}
}
b"w:color" if in_rpr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
let val = String::from_utf8_lossy(&attr.value);
if val != "auto" {
current_style.color = Some(val.to_string());
}
}
}
}
b"w:highlight" if in_rpr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
current_style.highlight =
Some(String::from_utf8_lossy(&attr.value).to_string());
}
}
}
b"w:rFonts" if in_rpr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:ascii" {
current_style.font =
Some(String::from_utf8_lossy(&attr.value).to_string());
break;
}
}
}
b"wp:docPr" if in_drawing => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"descr" {
current_image_alt =
Some(String::from_utf8_lossy(&attr.value).to_string());
}
}
}
b"a:blip" if in_drawing => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"r:embed" {
let rel_id = String::from_utf8_lossy(&attr.value).to_string();
let image = InlineImage {
resource_id: rel_id,
alt_text: current_image_alt.clone(),
width: None,
height: None,
};
para.images.push(image);
}
}
}
b"w:br" if in_run => {
let mut is_page_break = false;
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:type" {
let break_type = String::from_utf8_lossy(&attr.value);
is_page_break = break_type == "page";
}
}
let current_revision = if in_del {
RevisionType::Deleted
} else if in_ins {
RevisionType::Inserted
} else {
RevisionType::None
};
if is_page_break {
if let Some(last_run) = para.runs.last_mut() {
last_run.page_break = true;
} else {
para.runs.push(TextRun {
text: String::new(),
style: current_style.clone(),
hyperlink: None,
line_break: false,
page_break: true,
revision: current_revision,
});
}
} else {
if let Some(last_run) = para.runs.last_mut() {
last_run.line_break = true;
} else {
para.runs.push(TextRun {
text: String::new(),
style: current_style.clone(),
hyperlink: None,
line_break: true,
page_break: false,
revision: current_revision,
});
}
}
}
b"w:tab" if in_run => {
let current_revision = if in_del {
RevisionType::Deleted
} else if in_ins {
RevisionType::Inserted
} else {
RevisionType::None
};
para.runs.push(TextRun {
text: "\t".to_string(),
style: current_style.clone(),
hyperlink: current_hyperlink.clone(),
line_break: false,
page_break: false,
revision: current_revision,
});
}
b"w:cr" if in_run => {
if let Some(last_run) = para.runs.last_mut() {
last_run.line_break = true;
} else {
let current_revision = if in_del {
RevisionType::Deleted
} else if in_ins {
RevisionType::Inserted
} else {
RevisionType::None
};
para.runs.push(TextRun {
text: String::new(),
style: current_style.clone(),
hyperlink: None,
line_break: true,
page_break: false,
revision: current_revision,
});
}
}
b"w:noBreakHyphen" if in_run => {
let current_revision = if in_del {
RevisionType::Deleted
} else if in_ins {
RevisionType::Inserted
} else {
RevisionType::None
};
para.runs.push(TextRun {
text: "\u{2011}".to_string(), style: current_style.clone(),
hyperlink: current_hyperlink.clone(),
line_break: false,
page_break: false,
revision: current_revision,
});
}
b"w:softHyphen" if in_run => {
let current_revision = if in_del {
RevisionType::Deleted
} else if in_ins {
RevisionType::Inserted
} else {
RevisionType::None
};
para.runs.push(TextRun {
text: "\u{00AD}".to_string(), style: current_style.clone(),
hyperlink: current_hyperlink.clone(),
line_break: false,
page_break: false,
revision: current_revision,
});
}
b"w:noBreakSpace" if in_run => {
let current_revision = if in_del {
RevisionType::Deleted
} else if in_ins {
RevisionType::Inserted
} else {
RevisionType::None
};
para.runs.push(TextRun {
text: "\u{00A0}".to_string(), style: current_style.clone(),
hyperlink: current_hyperlink.clone(),
line_break: false,
page_break: false,
revision: current_revision,
});
}
b"w:footnoteReference" if in_run => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:id" {
let id = String::from_utf8_lossy(&attr.value).to_string();
if self.footnotes.contains_key(&id) {
para.runs.push(TextRun::plain(format!("[^{}]", id)));
}
}
}
}
b"w:endnoteReference" if in_run => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:id" {
let id = String::from_utf8_lossy(&attr.value).to_string();
if self.endnotes.contains_key(&id) {
para.runs.push(TextRun::plain(format!("[^e{}]", id)));
}
}
}
}
_ => {}
},
Ok(quick_xml::events::Event::Text(ref e))
if in_run
&& in_text
&& !in_instr_text
&& mc_fallback_depth == 0
&& txbx_content_depth == 0 =>
{
let text = crate::decode::decode_text_lossy(e);
if !text.is_empty() {
let current_revision = if in_del {
RevisionType::Deleted
} else if in_ins {
RevisionType::Inserted
} else {
RevisionType::None
};
let run = TextRun {
text,
style: current_style.clone(),
hyperlink: current_hyperlink.clone(),
line_break: false,
page_break: false,
revision: current_revision,
};
para.runs.push(run);
}
}
Ok(quick_xml::events::Event::End(ref e)) => match e.name().as_ref() {
b"mc:Fallback" if mc_fallback_depth > 0 => {
mc_fallback_depth -= 1;
}
b"w:txbxContent" if txbx_content_depth > 0 => {
txbx_content_depth -= 1;
}
_ if mc_fallback_depth > 0 || txbx_content_depth > 0 => {} b"w:pPr" => in_ppr = false,
b"w:rPr" => in_rpr = false,
b"w:r" => in_run = false,
b"w:t" => in_text = false,
b"w:instrText" => in_instr_text = false,
b"w:hyperlink" => current_hyperlink = None,
b"w:drawing" => {
in_drawing = false;
current_image_alt = None;
}
b"w:ins" => in_ins = false,
b"w:del" => in_del = false,
_ => {}
},
Ok(quick_xml::events::Event::Eof) => break,
Err(e) => return Err(Error::xml_parse_with_context(e.to_string(), "paragraph")),
_ => {}
}
buf.clear();
}
para.list_info = self.parse_list_info(xml);
Ok(para)
}
/// Collects the paragraphs that sit inside text boxes of a serialized paragraph.
///
/// Every `w:p` found inside a `w:txbxContent` that is not part of an
/// `mc:Fallback` branch is re-serialized and parsed with `parse_paragraph`.
/// Paragraphs that fail to parse or whose plain text is empty are dropped.
/// Scanning is best-effort: a reader error simply ends the scan.
fn extract_textbox_paragraphs(&mut self, xml: &str) -> Vec<Paragraph> {
    use quick_xml::events::{BytesStart, Event};

    /// Appends `e` to `out` as an opening (or self-closing) tag, copying the
    /// attribute values exactly as they appear in the input.
    fn push_tag(out: &mut String, e: &BytesStart<'_>, self_closing: bool) {
        out.push('<');
        out.push_str(&String::from_utf8_lossy(e.name().as_ref()));
        for attr in e.attributes().flatten() {
            out.push_str(&format!(
                " {}=\"{}\"",
                String::from_utf8_lossy(attr.key.as_ref()),
                String::from_utf8_lossy(&attr.value)
            ));
        }
        out.push_str(if self_closing { "/>" } else { ">" });
    }

    let mut paragraphs = Vec::new();
    let mut reader = quick_xml::Reader::from_str(xml);
    reader.config_mut().trim_text(false);
    let mut buf = Vec::new();
    let mut mc_fallback_depth: u32 = 0;
    let mut txbx_content_depth: u32 = 0;
    let mut in_txbx_para = false;
    let mut txbx_para_xml = String::new();
    // Number of elements currently open inside the captured paragraph.
    let mut txbx_para_depth: u32 = 0;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) => {
                let name = e.name();
                match name.as_ref() {
                    b"mc:Fallback" => mc_fallback_depth += 1,
                    b"w:txbxContent" if mc_fallback_depth == 0 => txbx_content_depth += 1,
                    b"w:p" if txbx_content_depth > 0 && !in_txbx_para => {
                        in_txbx_para = true;
                        txbx_para_depth = 0;
                        txbx_para_xml.clear();
                        push_tag(&mut txbx_para_xml, e, false);
                    }
                    _ if in_txbx_para => {
                        txbx_para_depth += 1;
                        push_tag(&mut txbx_para_xml, e, false);
                    }
                    _ => {}
                }
            }
            Ok(Event::Empty(ref e)) if in_txbx_para => {
                push_tag(&mut txbx_para_xml, e, true);
            }
            Ok(Event::Text(ref e)) if in_txbx_para => {
                let text = crate::decode::decode_text_lossy(e);
                txbx_para_xml.push_str(&escape_xml(&text));
            }
            Ok(Event::End(ref e)) => {
                let name = e.name();
                match name.as_ref() {
                    b"mc:Fallback" if mc_fallback_depth > 0 => mc_fallback_depth -= 1,
                    // Mirrors the start-tag guard so the depth stays balanced
                    // when a fallback branch contains its own text box.
                    b"w:txbxContent" if mc_fallback_depth == 0 && txbx_content_depth > 0 => {
                        txbx_content_depth -= 1;
                    }
                    b"w:p" if in_txbx_para && txbx_para_depth == 0 => {
                        txbx_para_xml.push_str("</w:p>");
                        if let Ok(para) = self.parse_paragraph(&txbx_para_xml) {
                            if !para.plain_text().is_empty() {
                                paragraphs.push(para);
                            }
                        }
                        in_txbx_para = false;
                    }
                    _ if in_txbx_para => {
                        txbx_para_depth = txbx_para_depth.saturating_sub(1);
                        txbx_para_xml.push_str("</");
                        txbx_para_xml.push_str(&String::from_utf8_lossy(name.as_ref()));
                        txbx_para_xml.push('>');
                    }
                    _ => {}
                }
            }
            Ok(Event::Eof) | Err(_) => break,
            _ => {}
        }
        buf.clear();
    }
    paragraphs
}
/// Extracts list membership from a serialized paragraph.
///
/// Reads `w:numId` and `w:ilvl` (default level 0, also used when the value
/// does not parse) from inside `w:numPr` and looks the pair up in the
/// numbering map. Returns `None` when the paragraph has no `w:numId` or the
/// numbering map has no entry for it. The item number is only reported for
/// numbered lists.
///
/// Scanning is best-effort: a reader error ends the scan instead of being
/// polled past, matching the other best-effort scanners in this file.
fn parse_list_info(&mut self, xml: &str) -> Option<ListInfo> {
    use quick_xml::events::Event;

    let mut reader = quick_xml::Reader::from_str(xml);
    reader.config_mut().trim_text(true);
    let mut buf = Vec::new();
    let mut num_id: Option<String> = None;
    let mut level: u8 = 0;
    let mut in_num_pr = false;
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(ref e)) if e.name().as_ref() == b"w:numPr" => {
                in_num_pr = true;
            }
            Ok(Event::Empty(ref e)) if in_num_pr => {
                let val = e
                    .attributes()
                    .flatten()
                    .find(|attr| attr.key.as_ref() == b"w:val")
                    .map(|attr| String::from_utf8_lossy(&attr.value).into_owned());
                match (e.name().as_ref(), val) {
                    (b"w:numId", Some(v)) => num_id = Some(v),
                    (b"w:ilvl", Some(v)) => level = v.parse().unwrap_or(0),
                    _ => {}
                }
            }
            Ok(Event::End(ref e)) if e.name().as_ref() == b"w:numPr" => {
                in_num_pr = false;
            }
            Ok(Event::Eof) | Err(_) => break,
            _ => {}
        }
        buf.clear();
    }
    let nid = num_id?;
    let (list_type, number) = self.numbering.get_list_info(&nid, level)?;
    Some(ListInfo {
        list_type,
        level,
        number: if list_type == ListType::Numbered {
            Some(number)
        } else {
            None
        },
    })
}
#[allow(clippy::only_used_in_recursion)] fn parse_table(&self, xml: &str) -> Result<Table> {
use crate::model::InlineImage;
let mut table = Table::new();
let mut reader = quick_xml::Reader::from_str(xml);
reader.config_mut().trim_text(false);
let mut buf = Vec::new();
let mut in_row = false;
let mut in_cell = false;
let mut in_paragraph = false;
let mut in_run = false;
let mut in_rpr = false; let mut in_text = false; let mut in_instr_text = false; let mut in_drawing = false; let mut current_image_alt: Option<String> = None;
let mut current_row: Option<Row> = None;
let mut cell_paragraphs: Vec<Paragraph> = Vec::new();
let mut cell_nested_tables: Vec<Table> = Vec::new();
let mut current_paragraph: Option<Paragraph> = None;
let mut current_style = TextStyle::default();
let mut is_header_row = false;
let mut col_span = 1u32;
let mut row_span = 1u32;
let mut cell_alignment = CellAlignment::Left;
let mut in_tc_pr = false;
let mut nested_table_depth: u32 = 0;
let mut nested_table_xml = String::new();
loop {
match reader.read_event_into(&mut buf) {
Ok(quick_xml::events::Event::Start(ref e)) => {
let name = e.name();
if nested_table_depth > 0 {
nested_table_xml.push('<');
nested_table_xml.push_str(&String::from_utf8_lossy(name.as_ref()));
for attr in e.attributes().flatten() {
nested_table_xml.push_str(&format!(
" {}=\"{}\"",
String::from_utf8_lossy(attr.key.as_ref()),
String::from_utf8_lossy(&attr.value)
));
}
nested_table_xml.push('>');
if name.as_ref() == b"w:tbl" {
nested_table_depth += 1;
}
continue;
}
match name.as_ref() {
b"w:tbl" if in_cell => {
nested_table_depth = 1;
nested_table_xml.clear();
nested_table_xml.push_str("<w:tbl>");
}
b"w:tr" => {
in_row = true;
current_row = Some(Row {
cells: Vec::new(),
is_header: false,
height: None,
});
is_header_row = false;
}
b"w:tc" => {
in_cell = true;
cell_paragraphs.clear();
cell_nested_tables.clear();
col_span = 1;
row_span = 1;
cell_alignment = CellAlignment::Left;
}
b"w:tcPr" if in_cell => {
in_tc_pr = true;
}
b"w:p" if in_cell => {
in_paragraph = true;
current_paragraph = Some(Paragraph::new());
}
b"w:r" if in_paragraph => {
in_run = true;
current_style = TextStyle::default();
}
b"w:rPr" if in_run => in_rpr = true,
b"w:t" => in_text = true,
b"w:instrText" => in_instr_text = true,
b"w:drawing" => {
in_drawing = true;
current_image_alt = None;
}
_ => {}
}
}
Ok(quick_xml::events::Event::Empty(ref e)) => {
let name = e.name();
if nested_table_depth > 0 {
nested_table_xml.push('<');
nested_table_xml.push_str(&String::from_utf8_lossy(name.as_ref()));
for attr in e.attributes().flatten() {
nested_table_xml.push_str(&format!(
" {}=\"{}\"",
String::from_utf8_lossy(attr.key.as_ref()),
String::from_utf8_lossy(&attr.value)
));
}
nested_table_xml.push_str("/>");
continue;
}
match name.as_ref() {
b"w:tblHeader" if in_row => {
is_header_row = true;
}
b"w:gridSpan" if in_cell => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
let val = String::from_utf8_lossy(&attr.value);
col_span = val.parse().unwrap_or(1);
}
}
}
b"w:vMerge" if in_cell => {
let mut has_val = false;
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
has_val = true;
}
}
if !has_val {
row_span = 0;
}
}
b"w:jc" if in_tc_pr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
let val = String::from_utf8_lossy(&attr.value);
cell_alignment = match val.as_ref() {
"center" => CellAlignment::Center,
"right" | "end" => CellAlignment::Right,
_ => CellAlignment::Left,
};
}
}
}
b"w:b" if in_rpr => {
let val = get_bool_attr(e, b"w:val");
current_style.bold = val.unwrap_or(true);
}
b"w:i" if in_rpr => {
let val = get_bool_attr(e, b"w:val");
current_style.italic = val.unwrap_or(true);
}
b"w:u" if in_rpr => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"w:val" {
let val = String::from_utf8_lossy(&attr.value);
current_style.underline = val != "none";
}
}
}
b"w:strike" if in_rpr => {
let val = get_bool_attr(e, b"w:val");
current_style.strikethrough = val.unwrap_or(true);
}
b"wp:docPr" if in_drawing => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"descr" {
current_image_alt =
Some(String::from_utf8_lossy(&attr.value).to_string());
}
}
}
b"a:blip" if in_drawing => {
for attr in e.attributes().flatten() {
if attr.key.as_ref() == b"r:embed" {
let rel_id = String::from_utf8_lossy(&attr.value).to_string();
let image = InlineImage {
resource_id: rel_id,
alt_text: current_image_alt.clone(),
width: None,
height: None,
};
if let Some(ref mut para) = current_paragraph {
para.images.push(image);
}
}
}
}
_ => {}
}
}
Ok(quick_xml::events::Event::Text(ref e)) => {
if nested_table_depth > 0 {
let text = crate::decode::decode_text_lossy(e);
nested_table_xml.push_str(&escape_xml(&text));
continue;
}
if in_run && in_text && !in_instr_text {
let text = crate::decode::decode_text_lossy(e);
if !text.is_empty() {
if let Some(ref mut para) = current_paragraph {
let run = TextRun {
text,
style: current_style.clone(),
hyperlink: None,
line_break: false,
page_break: false,
revision: RevisionType::None,
};
para.runs.push(run);
}
}
}
}
Ok(quick_xml::events::Event::End(ref e)) => {
let name = e.name();
if nested_table_depth > 0 {
if name.as_ref() == b"w:tbl" {
nested_table_xml.push_str("</w:tbl>");
nested_table_depth -= 1;
if nested_table_depth == 0 {
if let Ok(nested_table) = self.parse_table(&nested_table_xml) {
cell_nested_tables.push(nested_table);
}
}
} else {
nested_table_xml.push_str("</");
nested_table_xml.push_str(&String::from_utf8_lossy(name.as_ref()));
nested_table_xml.push('>');
}
continue;
}
match name.as_ref() {
b"w:tr" => {
if let Some(mut row) = current_row.take() {
row.is_header = is_header_row;
table.add_row(row);
}
in_row = false;
}
b"w:tcPr" => {
in_tc_pr = false;
}
b"w:tc" => {
if row_span > 0 {
let content = if cell_paragraphs.is_empty() {
vec![Paragraph::new()]
} else {
let paragraphs = std::mem::take(&mut cell_paragraphs);
deduplicate_paragraph_block(paragraphs)
};
let cell = Cell {
content,
nested_tables: std::mem::take(&mut cell_nested_tables),
col_span,
row_span,
alignment: cell_alignment,
vertical_alignment: VerticalAlignment::default(),
is_header: is_header_row,
background: None,
};
if let Some(ref mut row) = current_row {
row.cells.push(cell);
}
}
in_cell = false;
}
b"w:p" if in_cell => {
if let Some(para) = current_paragraph.take() {
if !para.is_empty() {
let is_duplicate = cell_paragraphs
.last()
.map(|last| last.plain_text() == para.plain_text())
.unwrap_or(false);
if !is_duplicate {
cell_paragraphs.push(para);
}
}
}
in_paragraph = false;
}
b"w:r" => {
in_run = false;
}
b"w:rPr" => in_rpr = false,
b"w:t" => in_text = false,
b"w:instrText" => in_instr_text = false,
b"w:drawing" => {
in_drawing = false;
current_image_alt = None;
}
_ => {}
}
}
Ok(quick_xml::events::Event::Eof) => break,
Err(e) => return Err(Error::xml_parse_with_context(e.to_string(), "table")),
_ => {}
}
buf.clear();
}
Ok(table)
}
/// Copies every embedded image referenced from the main document part into
/// `doc.resources`, keyed by relationship id.
///
/// Only relationships whose type contains `"/image"` and that are not
/// external are considered. An image whose bytes cannot be read is skipped.
/// Width, height and alt text are left unset here.
fn extract_resources(&self, doc: &mut Document) -> Result<()> {
    for (id, rel) in &self.relationships.by_id {
        if rel.external || !rel.rel_type.contains("/image") {
            continue;
        }
        let path = OoxmlContainer::resolve_path("word/document.xml", &rel.target);
        let Ok(data) = self.container.read_binary(&path) else {
            continue;
        };
        let part = std::path::Path::new(&path);
        let ext = part.extension().and_then(|e| e.to_str()).unwrap_or("");
        let filename = part
            .file_name()
            .unwrap_or_default()
            .to_string_lossy()
            .to_string();
        let size = data.len();
        doc.resources.insert(
            id.clone(),
            Resource {
                resource_type: ResourceType::from_extension(ext),
                filename: Some(filename),
                mime_type: guess_mime_type(&path),
                data,
                size,
                width: None,
                height: None,
                alt_text: None,
            },
        );
    }
    Ok(())
}
/// Returns a shared reference to the underlying OOXML container.
pub fn container(&self) -> &OoxmlContainer {
&self.container
}
}
/// Collects the text of every note in a footnotes/endnotes part.
///
/// `note_tag` is the qualified element name of one note (`w:footnote` or
/// `w:endnote`). The text of all `w:t` elements inside a note is
/// concatenated, trimmed, and stored under the note's `w:id`. Notes of type
/// `separator` or `continuationSeparator`, notes without an id, and notes
/// whose trimmed text is empty are left out. A reader error ends the scan
/// and whatever was collected so far is returned.
fn parse_notes_xml(xml: &str, note_tag: &[u8]) -> HashMap<String, String> {
    use quick_xml::events::Event;

    let mut reader = quick_xml::Reader::from_str(xml);
    reader.config_mut().trim_text(false);
    let mut collected = HashMap::new();
    let mut event_buf = Vec::new();
    let mut open_id: Option<String> = None;
    let mut body = String::new();
    let mut inside_note = false;
    let mut inside_text = false;
    loop {
        match reader.read_event_into(&mut event_buf) {
            Ok(Event::Eof) | Err(_) => break,
            Ok(Event::Start(ref start)) => {
                let tag = start.name();
                if tag.as_ref() == note_tag {
                    let mut id = None;
                    let mut kind = None;
                    for attr in start.attributes().flatten() {
                        let value = String::from_utf8_lossy(&attr.value).to_string();
                        match attr.key.as_ref() {
                            b"w:id" => id = Some(value),
                            b"w:type" => kind = Some(value),
                            _ => {}
                        }
                    }
                    let is_separator = matches!(
                        kind.as_deref(),
                        Some("separator") | Some("continuationSeparator")
                    );
                    if !is_separator {
                        if let Some(found) = id {
                            inside_note = true;
                            open_id = Some(found);
                            body.clear();
                        }
                    }
                } else if inside_note && tag.as_ref() == b"w:t" {
                    inside_text = true;
                }
            }
            Ok(Event::Text(ref chunk)) if inside_note && inside_text => {
                body.push_str(&crate::decode::decode_text_lossy(chunk));
            }
            Ok(Event::End(ref end)) => {
                let tag = end.name();
                if tag.as_ref() == note_tag {
                    if inside_note {
                        if let Some(id) = open_id.take() {
                            let cleaned = body.trim();
                            if !cleaned.is_empty() {
                                collected.insert(id, cleaned.to_string());
                            }
                        }
                        inside_note = false;
                    }
                } else if tag.as_ref() == b"w:t" {
                    inside_text = false;
                }
            }
            _ => {}
        }
        event_buf.clear();
    }
    collected
}
/// Extracts the plain-text paragraphs of a header or footer part.
///
/// For each `w:p` the text of its `w:t` elements is concatenated and
/// trimmed; paragraphs that end up empty are skipped. A reader error ends
/// the scan and the paragraphs collected so far are returned.
fn parse_header_footer_xml(xml: &str) -> Vec<Paragraph> {
    use quick_xml::events::Event;

    let mut reader = quick_xml::Reader::from_str(xml);
    reader.config_mut().trim_text(false);
    let mut found = Vec::new();
    let mut event_buf = Vec::new();
    let mut line = String::new();
    let mut inside_para = false;
    let mut inside_text = false;
    loop {
        match reader.read_event_into(&mut event_buf) {
            Ok(Event::Eof) | Err(_) => break,
            Ok(Event::Start(ref start)) => {
                let tag = start.name();
                if tag.as_ref() == b"w:p" {
                    inside_para = true;
                    line.clear();
                } else if inside_para && tag.as_ref() == b"w:t" {
                    inside_text = true;
                }
            }
            Ok(Event::Text(ref chunk)) if inside_para && inside_text => {
                line.push_str(&crate::decode::decode_text_lossy(chunk));
            }
            Ok(Event::End(ref end)) => {
                let tag = end.name();
                if inside_para && tag.as_ref() == b"w:p" {
                    let cleaned = line.trim();
                    if !cleaned.is_empty() {
                        found.push(Paragraph::with_text(cleaned.to_string()));
                    }
                    inside_para = false;
                } else if tag.as_ref() == b"w:t" {
                    inside_text = false;
                }
            }
            _ => {}
        }
        event_buf.clear();
    }
    found
}
/// Reads attribute `key` from element `e` and interprets it as an on/off flag.
///
/// Returns `None` when the element has no attribute with that exact name.
/// Otherwise returns `Some(false)` for the literals `"0"`, `"false"` and
/// `"off"`, and `Some(true)` for any other value. Attributes that fail to
/// parse are skipped, and the first attribute whose name matches wins.
fn get_bool_attr(e: &quick_xml::events::BytesStart, key: &[u8]) -> Option<bool> {
    e.attributes()
        .flatten()
        .find(|attr| attr.key.as_ref() == key)
        .map(|attr| {
            let value = String::from_utf8_lossy(&attr.value);
            // The OOXML on/off type also permits "on"/"off"; "off" must not be
            // read as an enabled flag.
            !matches!(&*value, "0" | "false" | "off")
        })
}
/// Escapes the five characters that have predefined entities in XML.
///
/// `&`, `<`, `>`, `"` and `'` are replaced by `&amp;`, `&lt;`, `&gt;`,
/// `&quot;` and `&apos;` respectively; every other character is copied
/// unchanged. The input is scanned once, so the ampersands introduced by the
/// replacements are never escaped a second time.
fn escape_xml(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            '\'' => escaped.push_str("&apos;"),
            other => escaped.push(other),
        }
    }
    escaped
}
/// Maps a file path to an image MIME type based on its extension.
///
/// The extension is compared case-insensitively. Returns `None` when the path
/// has no extension, when the extension is not valid UTF-8, or when it is not
/// one of the image formats listed below.
fn guess_mime_type(path: &str) -> Option<String> {
    let extension = std::path::Path::new(path)
        .extension()?
        .to_str()?
        .to_lowercase();

    let mime = match extension.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "bmp" => "image/bmp",
        "tiff" | "tif" => "image/tiff",
        "svg" => "image/svg+xml",
        "emf" => "image/x-emf",
        "wmf" => "image/x-wmf",
        _ => return None,
    };

    Some(mime.to_string())
}
fn deduplicate_paragraph_block(paragraphs: Vec<Paragraph>) -> Vec<Paragraph> {
let len = paragraphs.len();
if len < 2 {
return paragraphs;
}
if len.is_multiple_of(2) {
let half = len / 2;
let first_half = ¶graphs[..half];
let second_half = ¶graphs[half..];
let is_duplicate = first_half
.iter()
.zip(second_half.iter())
.all(|(a, b)| a.plain_text() == b.plain_text());
if is_duplicate {
return paragraphs.into_iter().take(half).collect();
}
}
paragraphs
}
#[cfg(test)]
mod tests {
use super::*;
/// Opening the sample document must succeed. Does nothing when the sample
/// file is not present on disk.
#[test]
fn test_open_docx() {
    let path = "test-files/file-sample_1MB.docx";
    if !std::path::Path::new(path).exists() {
        return;
    }
    assert!(DocxParser::open(path).is_ok());
}
/// Parses the sample document and checks that it has at least one section and
/// non-empty text containing "Lorem ipsum". Does nothing when the sample file
/// is not present on disk.
#[test]
fn test_parse_docx() {
    let path = "test-files/file-sample_1MB.docx";
    if !std::path::Path::new(path).exists() {
        return;
    }

    let mut parser = DocxParser::open(path).unwrap();
    let doc = parser.parse().unwrap();
    assert!(!doc.sections.is_empty());

    let text = doc.plain_text();
    assert!(!text.is_empty());
    assert!(text.contains("Lorem ipsum"));
}
/// The first section of the sample document must contain at least one heading
/// paragraph. Does nothing when the sample file is not present on disk.
#[test]
fn test_parse_headings() {
    let path = "test-files/file-sample_1MB.docx";
    if !std::path::Path::new(path).exists() {
        return;
    }

    let mut parser = DocxParser::open(path).unwrap();
    let doc = parser.parse().unwrap();
    let has_heading = doc.sections[0]
        .content
        .iter()
        .any(|block| matches!(block, Block::Paragraph(p) if p.is_heading()));
    assert!(has_heading);
}
/// If the sample document yields any resources, the first one iterated must be
/// an image. Does nothing when the sample file is not present on disk.
#[test]
fn test_extract_resources() {
    let path = "test-files/file-sample_1MB.docx";
    if !std::path::Path::new(path).exists() {
        return;
    }

    let mut parser = DocxParser::open(path).unwrap();
    let doc = parser.parse().unwrap();
    if let Some(resource) = doc.resources.values().next() {
        assert!(resource.is_image());
    }
}
/// Whitespace-only runs between words must survive paragraph parsing.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertions may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_whitespace_preserved_between_runs() {
    let xml = r#"<w:p>
<w:r><w:t>DATE</w:t></w:r>
<w:r><w:t xml:space="preserve"> </w:t></w:r>
<w:r><w:t>OF</w:t></w:r>
<w:r><w:t xml:space="preserve"> </w:t></w:r>
<w:r><w:t>BIRTH</w:t></w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let text = para.plain_text();
    let has_all_words = text.contains("DATE") && text.contains("OF") && text.contains("BIRTH");
    assert!(
        has_all_words,
        "Expected 'DATE OF BIRTH' with spaces, got: '{}'",
        text
    );
    assert!(
        text.contains(' '),
        "Expected spaces between words, got: '{}'",
        text
    );
}
/// Leading whitespace inside an `xml:space="preserve"` text element must be
/// kept in the paragraph text.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertion may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_whitespace_leading_trailing_preserved() {
    let xml = r#"<w:p>
<w:r><w:t xml:space="preserve"> Hello World </w:t></w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let text = para.plain_text();
    let keeps_leading_space = text.starts_with(" ") || text.contains(" Hello");
    assert!(
        keeps_leading_space,
        "Expected leading spaces, got: '{}'",
        text
    );
}
/// A `<w:tab/>` run between two text runs must show up as a tab character in
/// the paragraph text, and both text runs must be kept.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertions may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_tab_character_handling() {
    let xml = r#"<w:p>
<w:r>
<w:t>Column1</w:t>
</w:r>
<w:r>
<w:tab/>
</w:r>
<w:r>
<w:t>Column2</w:t>
</w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let text = para.plain_text();
    assert!(
        text.contains('\t'),
        "Expected tab character between columns, got: '{}'",
        text
    );
    let has_both_columns = text.contains("Column1") && text.contains("Column2");
    assert!(
        has_both_columns,
        "Expected both column texts, got: '{}'",
        text
    );
}
/// Spaces inside an `xml:space="preserve"` text element must be kept.
///
/// NOTE(review): the failure message speaks of 5 consecutive spaces, but the
/// fixture and the assertion as written contain a single space — confirm
/// whether the intended whitespace was collapsed at some point.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertion may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_multiple_spaces_preserved() {
    let xml = r#"<w:p>
<w:r><w:t xml:space="preserve">Word1 Word2</w:t></w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let text = para.plain_text();
    let has_space = text.contains(" ");
    assert!(
        has_space,
        "Expected 5 consecutive spaces, got: '{}'",
        text
    );
}
/// A `<w:cr/>` element inside a run must mark at least one parsed run as a
/// line break.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertion may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_carriage_return_handling() {
    let xml = r#"<w:p>
<w:r>
<w:t>Line1</w:t>
<w:cr/>
<w:t>Line2</w:t>
</w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    assert!(
        para.runs.iter().any(|run| run.line_break),
        "Expected line break from <w:cr/>"
    );
}
/// A `<w:noBreakHyphen/>` element must be rendered as U+2011 in the paragraph
/// text, with the text on both sides kept.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertions may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_non_breaking_hyphen() {
    let xml = r#"<w:p>
<w:r>
<w:t>non</w:t>
<w:noBreakHyphen/>
<w:t>breaking</w:t>
</w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let text = para.plain_text();
    let has_both_parts = text.contains("non") && text.contains("breaking");
    assert!(
        has_both_parts,
        "Expected 'non' and 'breaking' text, got: '{}'",
        text
    );
    assert!(
        text.contains('\u{2011}'),
        "Expected non-breaking hyphen U+2011, got: '{}'",
        text
    );
}
/// Runs wrapped in `<w:ins>` must be tagged `RevisionType::Inserted` and carry
/// the inserted text.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertions may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_tracked_changes_insertion() {
    let xml = r#"<w:p>
<w:r><w:t>Original </w:t></w:r>
<w:ins>
<w:r><w:t>inserted </w:t></w:r>
</w:ins>
<w:r><w:t>text</w:t></w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let inserted_runs: Vec<_> = para
        .runs
        .iter()
        .filter(|run| run.revision == RevisionType::Inserted)
        .collect();
    assert!(
        !inserted_runs.is_empty(),
        "Expected to find inserted revision"
    );

    let inserted_text: String = inserted_runs.iter().map(|run| run.text.as_str()).collect();
    assert!(
        inserted_text.contains("inserted"),
        "Expected 'inserted' text in revision, got: '{}'",
        inserted_text
    );
}
/// Runs wrapped in `<w:del>` must be tagged `RevisionType::Deleted` and carry
/// the deleted text.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertions may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_tracked_changes_deletion() {
    let xml = r#"<w:p>
<w:r><w:t>Keep this </w:t></w:r>
<w:del>
<w:r><w:t>deleted </w:t></w:r>
</w:del>
<w:r><w:t>text</w:t></w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let deleted_runs: Vec<_> = para
        .runs
        .iter()
        .filter(|run| run.revision == RevisionType::Deleted)
        .collect();
    assert!(
        !deleted_runs.is_empty(),
        "Expected to find deleted revision"
    );

    let deleted_text: String = deleted_runs.iter().map(|run| run.text.as_str()).collect();
    assert!(
        deleted_text.contains("deleted"),
        "Expected 'deleted' text in revision, got: '{}'",
        deleted_text
    );
}
/// Separator and continuation-separator footnotes are skipped; ordinary
/// footnotes are returned keyed by their id.
#[test]
fn test_parse_footnotes_xml() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:footnotes xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:footnote w:id="0" w:type="separator">
<w:p><w:r><w:t>___</w:t></w:r></w:p>
</w:footnote>
<w:footnote w:id="1" w:type="continuationSeparator">
<w:p><w:r><w:t>---</w:t></w:r></w:p>
</w:footnote>
<w:footnote w:id="2">
<w:p><w:r><w:t>This is footnote two.</w:t></w:r></w:p>
</w:footnote>
<w:footnote w:id="3">
<w:p><w:r><w:t>Another footnote.</w:t></w:r></w:p>
</w:footnote>
</w:footnotes>"#;

    let notes = parse_notes_xml(xml, b"w:footnote");

    assert!(!notes.contains_key("0"), "separator should be skipped");
    assert!(
        !notes.contains_key("1"),
        "continuationSeparator should be skipped"
    );
    assert_eq!(
        notes.get("2").map(String::as_str),
        Some("This is footnote two.")
    );
    assert_eq!(
        notes.get("3").map(String::as_str),
        Some("Another footnote.")
    );
    assert_eq!(notes.len(), 2);
}
/// The same parsing rules apply to endnotes: the separator is skipped and the
/// ordinary endnote is returned keyed by its id.
#[test]
fn test_parse_endnotes_xml() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:endnotes xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:endnote w:id="0" w:type="separator">
<w:p><w:r><w:t>___</w:t></w:r></w:p>
</w:endnote>
<w:endnote w:id="1">
<w:p><w:r><w:t>Endnote content here.</w:t></w:r></w:p>
</w:endnote>
</w:endnotes>"#;

    let notes = parse_notes_xml(xml, b"w:endnote");

    assert!(!notes.contains_key("0"), "separator should be skipped");
    assert_eq!(
        notes.get("1").map(String::as_str),
        Some("Endnote content here.")
    );
    assert_eq!(notes.len(), 1);
}
/// Text from several runs of one footnote is joined in document order.
#[test]
fn test_footnote_multi_run_text() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:footnotes xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:footnote w:id="1">
<w:p>
<w:r><w:t>First </w:t></w:r>
<w:r><w:t>second </w:t></w:r>
<w:r><w:t>third.</w:t></w:r>
</w:p>
</w:footnote>
</w:footnotes>"#;

    let notes = parse_notes_xml(xml, b"w:footnote");

    assert_eq!(
        notes.get("1").map(String::as_str),
        Some("First second third.")
    );
}
/// An unknown entity reference inside footnote text is kept verbatim.
#[test]
fn test_parse_notes_xml_preserves_raw_malformed_entity() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:footnotes xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:footnote w:id="2">
<w:p><w:r><w:t>Footnote &bogus; text</w:t></w:r></w:p>
</w:footnote>
</w:footnotes>"#;

    let notes = parse_notes_xml(xml, b"w:footnote");

    assert_eq!(
        notes.get("2").map(String::as_str),
        Some("Footnote &bogus; text")
    );
}
/// A `<w:footnoteReference>` whose id is known to the parser must appear as a
/// `[^id]` marker in the paragraph text, next to the surrounding text.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertions may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_footnote_reference_in_paragraph() {
    let xml = r#"<w:p>
<w:r><w:t>Some text</w:t></w:r>
<w:r><w:footnoteReference w:id="2"/></w:r>
<w:r><w:t> more text</w:t></w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let footnotes = HashMap::from([("2".to_string(), "Footnote content".to_string())]);
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes,
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let text = para.plain_text();
    assert!(
        text.contains("[^2]"),
        "Expected footnote reference [^2], got: '{}'",
        text
    );
    assert!(
        text.contains("Some text"),
        "Expected original text, got: '{}'",
        text
    );
}
/// A `<w:endnoteReference>` whose id is known to the parser must appear as a
/// `[^e<id>]` marker in the paragraph text.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertion may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_endnote_reference_in_paragraph() {
    let xml = r#"<w:p>
<w:r><w:t>Text with endnote</w:t></w:r>
<w:r><w:endnoteReference w:id="1"/></w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let endnotes = HashMap::from([("1".to_string(), "Endnote content".to_string())]);
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes,
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let text = para.plain_text();
    assert!(
        text.contains("[^e1]"),
        "Expected endnote reference [^e1], got: '{}'",
        text
    );
}
/// A footnote reference whose id the parser does not know must not produce a
/// `[^…]` marker.
///
/// NOTE(review): the container is built from an empty byte vector and the test
/// returns early when that fails, so the assertion may never run — confirm
/// that `OoxmlContainer::from_bytes(Vec::new())` can succeed.
#[test]
fn test_footnote_reference_skipped_when_no_content() {
    let xml = r#"<w:p>
<w:r><w:t>Text</w:t></w:r>
<w:r><w:footnoteReference w:id="99"/></w:r>
</w:p>"#;
    let Ok(container) = crate::container::OoxmlContainer::from_bytes(Vec::new()) else {
        return;
    };
    let mut parser = DocxParser {
        container,
        styles: StyleMap::default(),
        numbering: NumberingMap::default(),
        relationships: crate::container::Relationships::default(),
        footnotes: HashMap::new(),
        endnotes: HashMap::new(),
    };

    let para = parser.parse_paragraph(xml).unwrap();
    let text = para.plain_text();
    let has_marker = text.contains("[^");
    assert!(
        !has_marker,
        "Should not insert marker for unknown footnote, got: '{}'",
        text
    );
}
/// A footnote whose text is only whitespace is left out of the result, while
/// a footnote with real content is kept.
#[test]
fn test_empty_footnote_skipped() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:footnotes xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:footnote w:id="1">
<w:p><w:r><w:t> </w:t></w:r></w:p>
</w:footnote>
<w:footnote w:id="2">
<w:p><w:r><w:t>Real content.</w:t></w:r></w:p>
</w:footnote>
</w:footnotes>"#;

    let notes = parse_notes_xml(xml, b"w:footnote");

    assert!(
        !notes.contains_key("1"),
        "Whitespace-only note should be skipped"
    );
    assert_eq!(notes.get("2").map(String::as_str), Some("Real content."));
}
/// Each non-empty header paragraph becomes one parsed paragraph, in order.
#[test]
fn test_parse_header_footer_xml_basic() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:hdr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:p>
<w:r><w:t>Company Name</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>Confidential</w:t></w:r>
</w:p>
</w:hdr>"#;

    let paragraphs = parse_header_footer_xml(xml);

    assert_eq!(paragraphs.len(), 2);
    let (first, second) = (&paragraphs[0], &paragraphs[1]);
    assert_eq!(first.plain_text(), "Company Name");
    assert_eq!(second.plain_text(), "Confidential");
}
/// Paragraphs with no text or only whitespace are dropped from the result.
#[test]
fn test_parse_header_footer_xml_empty_paragraphs_skipped() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:ftr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:p></w:p>
<w:p>
<w:r><w:t>Page 1</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t> </w:t></w:r>
</w:p>
</w:ftr>"#;

    let paragraphs = parse_header_footer_xml(xml);

    assert_eq!(paragraphs.len(), 1);
    let only = &paragraphs[0];
    assert_eq!(only.plain_text(), "Page 1");
}
/// Text from several runs of one paragraph is joined into a single paragraph.
#[test]
fn test_parse_header_footer_xml_multiple_runs() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:hdr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:p>
<w:r><w:t>Draft - </w:t></w:r>
<w:r><w:t>Do Not Distribute</w:t></w:r>
</w:p>
</w:hdr>"#;

    let paragraphs = parse_header_footer_xml(xml);

    assert_eq!(paragraphs.len(), 1);
    let only = &paragraphs[0];
    assert_eq!(only.plain_text(), "Draft - Do Not Distribute");
}
/// An unknown entity reference inside footer text is kept verbatim.
#[test]
fn test_parse_header_footer_xml_preserves_raw_malformed_entity() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:ftr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:p><w:r><w:t>Footer &bogus; text</w:t></w:r></w:p>
</w:ftr>"#;

    let paragraphs = parse_header_footer_xml(xml);

    assert_eq!(paragraphs.len(), 1);
    let only = &paragraphs[0];
    assert_eq!(only.plain_text(), "Footer &bogus; text");
}
/// A header with no paragraphs yields an empty list.
#[test]
fn test_parse_header_footer_xml_empty() {
    let xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:hdr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
</w:hdr>"#;

    assert!(parse_header_footer_xml(xml).is_empty());
}
/// Builds an in-memory DOCX archive around `document_xml`, paired with a
/// document relationships part that declares no relationships.
fn create_minimal_docx(document_xml: &str) -> Vec<u8> {
    let empty_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#;
    create_minimal_docx_with_document_rels(document_xml, Some(empty_rels))
}
/// Builds an in-memory DOCX archive containing `[Content_Types].xml`,
/// `_rels/.rels` and `word/document.xml` (filled with `document_xml`).
///
/// When `document_rels_xml` is `Some`, its text is written as
/// `word/_rels/document.xml.rels`; when it is `None`, that part is omitted
/// from the archive. All entries are stored uncompressed.
fn create_minimal_docx_with_document_rels(
document_xml: &str,
document_rels_xml: Option<&str>,
) -> Vec<u8> {
use std::io::{Cursor, Write};
let buf = Cursor::new(Vec::new());
let mut zip = zip::ZipWriter::new(buf);
let options = zip::write::SimpleFileOptions::default()
.compression_method(zip::CompressionMethod::Stored);
zip.start_file("[Content_Types].xml", options).unwrap();
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#).unwrap();
// Package-level relationships: point at the main document part.
zip.start_file("_rels/.rels", options).unwrap();
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#).unwrap();
// The document-level relationships part is optional for this helper.
if let Some(document_rels_xml) = document_rels_xml {
zip.start_file("word/_rels/document.xml.rels", options)
.unwrap();
zip.write_all(document_rels_xml.as_bytes()).unwrap();
}
zip.start_file("word/document.xml", options).unwrap();
zip.write_all(document_xml.as_bytes()).unwrap();
zip.finish().unwrap().into_inner()
}
/// Builds an in-memory DOCX archive with a minimal document body plus one
/// extra part at `extra_part_path` whose bytes are not valid UTF-8.
///
/// The archive has no `word/_rels/document.xml.rels` entry. All entries are
/// stored uncompressed.
fn create_minimal_docx_with_malformed_optional_part(extra_part_path: &str) -> Vec<u8> {
use std::io::{Cursor, Write};
let buf = Cursor::new(Vec::new());
let mut zip = zip::ZipWriter::new(buf);
let options = zip::write::SimpleFileOptions::default()
.compression_method(zip::CompressionMethod::Stored);
zip.start_file("[Content_Types].xml", options).unwrap();
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#).unwrap();
zip.start_file("_rels/.rels", options).unwrap();
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#).unwrap();
zip.start_file("word/document.xml", options).unwrap();
zip.write_all(
br#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p/></w:body>
</w:document>"#,
)
.unwrap();
zip.start_file(extra_part_path, options).unwrap();
// The lone 0xE9 byte followed by '<' is not a valid UTF-8 sequence.
zip.write_all(b"<?xml version=\"1.0\"?><root>Caf\xe9</root>")
.unwrap();
zip.finish().unwrap().into_inner()
}
/// Opening an archive whose styles, numbering, footnotes or endnotes part is
/// not valid UTF-8 must fail with `Error::Encoding` rather than succeed.
#[test]
fn test_docx_non_utf8_optional_parts_surface_encoding_error() {
    let optional_parts = [
        "word/styles.xml",
        "word/numbering.xml",
        "word/footnotes.xml",
        "word/endnotes.xml",
    ];
    for part_path in optional_parts {
        let data = create_minimal_docx_with_malformed_optional_part(part_path);
        let Err(err) = DocxParser::from_bytes(data) else {
            panic!("malformed {part_path} must surface Error::Encoding")
        };
        assert!(
            matches!(err, Error::Encoding(_)),
            "expected Error::Encoding for {part_path}, got {err:?}"
        );
    }
}
/// Returns a parser opened on a minimal archive whose body holds a single
/// empty paragraph. Panics if that archive cannot be opened.
fn empty_test_parser() -> DocxParser {
    let document_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p/></w:body>
</w:document>"#;
    let archive = create_minimal_docx(document_xml);
    DocxParser::from_bytes(archive).unwrap()
}
/// Text inside a DrawingML text box (`wps:txbx` / `w:txbxContent`) must show
/// up in the document's plain text alongside ordinary paragraph text.
#[test]
fn test_textbox_content_extracted() {
let doc_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
<w:body>
<w:p>
<w:r>
<w:t>Normal paragraph</w:t>
</w:r>
</w:p>
<w:p>
<w:r>
<w:drawing>
<wps:wsp>
<wps:txbx>
<w:txbxContent>
<w:p>
<w:r><w:t>Text box content here</w:t></w:r>
</w:p>
</w:txbxContent>
</wps:txbx>
</wps:wsp>
</w:drawing>
</w:r>
</w:p>
</w:body>
</w:document>"#;
let data = create_minimal_docx(doc_xml);
let mut parser = DocxParser::from_bytes(data).unwrap();
let doc = parser.parse().unwrap();
let text = doc.plain_text();
assert!(
text.contains("Normal paragraph"),
"Should contain normal paragraph text"
);
assert!(
text.contains("Text box content here"),
"Should contain text box content, got: {}",
text
);
}
/// An archive without `word/_rels/document.xml.rels` must still open and
/// parse when the document does not reference any relationship.
#[test]
fn test_docx_allows_missing_document_relationships_when_unused() {
    let doc_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>Hello</w:t></w:r></w:p></w:body>
</w:document>"#;

    let archive = create_minimal_docx_with_document_rels(doc_xml, None);
    let mut parser = DocxParser::from_bytes(archive)
        .expect("missing document relationships should be optional");
    let parsed = parser.parse().unwrap();

    assert_eq!(parsed.plain_text(), "Hello");
}
/// A truncated document relationships part must make opening fail with an
/// XML parse error that names the relationships part as its location.
#[test]
fn test_docx_rejects_malformed_document_relationships() {
    let doc_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>Hello</w:t></w:r></w:p></w:body>
</w:document>"#;

    let archive = create_minimal_docx_with_document_rels(doc_xml, Some("<Relationships"));
    let Err(failure) = DocxParser::from_bytes(archive) else {
        panic!("malformed document relationships should fail")
    };

    match failure {
        Error::XmlParseWithContext { location, .. } => {
            assert_eq!(location, "word/_rels/document.xml.rels")
        }
        other => panic!("expected malformed document rels error, got {other:?}"),
    }
}
/// An unknown entity reference in body text is kept verbatim in the parsed
/// document's plain text.
#[test]
fn test_docx_body_malformed_entity_preserves_raw_text() {
    let doc_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>Hello &bogus; body</w:t></w:r></w:p></w:body>
</w:document>"#;

    let archive = create_minimal_docx(doc_xml);
    let mut parser = DocxParser::from_bytes(archive).unwrap();
    let parsed = parser.parse().unwrap();

    assert_eq!(parsed.plain_text(), "Hello &bogus; body");
}
/// An unknown entity reference inside text-box content is kept verbatim in
/// the parsed document's plain text.
#[test]
fn test_docx_textbox_malformed_entity_preserves_raw_text() {
let doc_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
<w:body>
<w:p>
<w:r>
<w:drawing>
<wps:wsp>
<wps:txbx>
<w:txbxContent>
<w:p><w:r><w:t>Box &bogus; text</w:t></w:r></w:p>
</w:txbxContent>
</wps:txbx>
</wps:wsp>
</w:drawing>
</w:r>
</w:p>
</w:body>
</w:document>"#;
let data = create_minimal_docx(doc_xml);
let mut parser = DocxParser::from_bytes(data).unwrap();
let doc = parser.parse().unwrap();
assert!(doc.plain_text().contains("Box &bogus; text"));
}
/// An unknown entity reference inside a table nested in another table's cell
/// is kept verbatim in the inner cell's plain text.
#[test]
fn test_docx_nested_table_malformed_entity_preserves_raw_text() {
    let xml = r#"<w:tbl xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:tr>
<w:tc>
<w:tbl>
<w:tr>
<w:tc>
<w:p><w:r><w:t>Inner &bogus; table</w:t></w:r></w:p>
</w:tc>
</w:tr>
</w:tbl>
</w:tc>
</w:tr>
</w:tbl>"#;

    let parser = empty_test_parser();
    let outer = parser.parse_table(xml).unwrap();
    let inner = &outer.rows[0].cells[0].nested_tables[0];

    assert_eq!(inner.rows[0].cells[0].plain_text(), "Inner &bogus; table");
}
/// When a text box is present in both the `mc:Choice` and the `mc:Fallback`
/// branch of an `mc:AlternateContent` element, its text must appear exactly
/// once in the document's plain text.
#[test]
fn test_textbox_mc_alternate_content_no_duplication() {
let doc_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape"
xmlns:v="urn:schemas-microsoft-com:vml">
<w:body>
<w:p>
<w:r>
<mc:AlternateContent>
<mc:Choice>
<w:drawing>
<wps:wsp>
<wps:txbx>
<w:txbxContent>
<w:p>
<w:r><w:t>Unique text box</w:t></w:r>
</w:p>
</w:txbxContent>
</wps:txbx>
</wps:wsp>
</w:drawing>
</mc:Choice>
<mc:Fallback>
<w:pict>
<v:shape>
<v:textbox>
<w:txbxContent>
<w:p>
<w:r><w:t>Unique text box</w:t></w:r>
</w:p>
</w:txbxContent>
</v:textbox>
</v:shape>
</w:pict>
</mc:Fallback>
</mc:AlternateContent>
</w:r>
</w:p>
</w:body>
</w:document>"#;
let data = create_minimal_docx(doc_xml);
let mut parser = DocxParser::from_bytes(data).unwrap();
let doc = parser.parse().unwrap();
let text = doc.plain_text();
// Count occurrences rather than checking containment, so duplication fails.
let count = text.matches("Unique text box").count();
assert_eq!(
count, 1,
"Text box content should appear exactly once, not duplicated. Full text: {}",
text
);
}
/// Every paragraph inside a text box must show up in the document's plain
/// text.
#[test]
fn test_textbox_multiple_paragraphs() {
let doc_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape">
<w:body>
<w:p>
<w:r>
<w:drawing>
<wps:wsp>
<wps:txbx>
<w:txbxContent>
<w:p>
<w:r><w:t>First text box paragraph</w:t></w:r>
</w:p>
<w:p>
<w:r><w:t>Second text box paragraph</w:t></w:r>
</w:p>
</w:txbxContent>
</wps:txbx>
</wps:wsp>
</w:drawing>
</w:r>
</w:p>
</w:body>
</w:document>"#;
let data = create_minimal_docx(doc_xml);
let mut parser = DocxParser::from_bytes(data).unwrap();
let doc = parser.parse().unwrap();
let text = doc.plain_text();
assert!(
text.contains("First text box paragraph"),
"Should contain first text box paragraph"
);
assert!(
text.contains("Second text box paragraph"),
"Should contain second text box paragraph"
);
}
/// A referenced chart part whose numeric cache holds a non-numeric value must
/// make `parse` fail with `Error::InvalidData` mentioning the invalid chart
/// numeric value, instead of being silently ignored.
#[test]
fn test_docx_chart_invalid_numeric_value_propagates_error() {
use std::io::{Cursor, Write};
use zip::write::SimpleFileOptions;
// Chart part: one series whose only value point is "not-a-number".
let chart_xml = r#"<?xml version="1.0"?>
<c:chartSpace xmlns:c="http://schemas.openxmlformats.org/drawingml/2006/chart">
<c:chart><c:plotArea><c:lineChart>
<c:ser>
<c:tx><c:strRef><c:strCache><c:pt idx="0"><c:v>S</c:v></c:pt></c:strCache></c:strRef></c:tx>
<c:cat><c:strRef><c:strCache><c:pt idx="0"><c:v>Q1</c:v></c:pt></c:strCache></c:strRef></c:cat>
<c:val><c:numRef><c:numCache><c:pt idx="0"><c:v>not-a-number</c:v></c:pt></c:numCache></c:numRef></c:val>
</c:ser>
</c:lineChart></c:plotArea></c:chart>
</c:chartSpace>"#;
// Document body: a drawing that references the chart through `rIdChart`.
let document_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:c="http://schemas.openxmlformats.org/drawingml/2006/chart">
<w:body>
<w:p><w:r><w:drawing>
<a:graphic><a:graphicData>
<c:chart r:id="rIdChart"/>
</a:graphicData></a:graphic>
</w:drawing></w:r></w:p>
</w:body>
</w:document>"#;
let document_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rIdChart" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart" Target="charts/chart1.xml"/>
</Relationships>"#;
let buf = Cursor::new(Vec::new());
let mut zip = zip::ZipWriter::new(buf);
let options =
SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
zip.start_file("[Content_Types].xml", options).unwrap();
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#).unwrap();
zip.start_file("_rels/.rels", options).unwrap();
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#).unwrap();
zip.start_file("word/document.xml", options).unwrap();
zip.write_all(document_xml.as_bytes()).unwrap();
zip.start_file("word/_rels/document.xml.rels", options)
.unwrap();
zip.write_all(document_rels.as_bytes()).unwrap();
zip.start_file("word/charts/chart1.xml", options).unwrap();
zip.write_all(chart_xml.as_bytes()).unwrap();
let data = zip.finish().unwrap().into_inner();
// Opening succeeds; the failure is expected from `parse`.
let mut parser = DocxParser::from_bytes(data).unwrap();
let err = parser
.parse()
.expect_err("invalid chart numeric value must surface");
match err {
Error::InvalidData(msg) => assert!(
msg.contains("invalid chart numeric value"),
"unexpected msg: {msg}"
),
other => panic!("expected InvalidData, got {other:?}"),
}
}
/// A chart relationship whose target part is absent from the archive must
/// make `parse` fail with `Error::MissingComponent` naming the missing path.
#[test]
fn test_docx_missing_chart_part_propagates_error() {
use std::io::{Cursor, Write};
use zip::write::SimpleFileOptions;
// Document body: a drawing that references a chart through `rIdChart`.
let document_xml = r#"<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
xmlns:c="http://schemas.openxmlformats.org/drawingml/2006/chart">
<w:body>
<w:p><w:r><w:drawing>
<a:graphic><a:graphicData>
<c:chart r:id="rIdChart"/>
</a:graphicData></a:graphic>
</w:drawing></w:r></w:p>
</w:body>
</w:document>"#;
let document_rels = r#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rIdChart" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/chart" Target="charts/chart1.xml"/>
</Relationships>"#;
let buf = Cursor::new(Vec::new());
let mut zip = zip::ZipWriter::new(buf);
let options =
SimpleFileOptions::default().compression_method(zip::CompressionMethod::Stored);
zip.start_file("[Content_Types].xml", options).unwrap();
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#).unwrap();
zip.start_file("_rels/.rels", options).unwrap();
zip.write_all(br#"<?xml version="1.0" encoding="UTF-8"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#).unwrap();
zip.start_file("word/document.xml", options).unwrap();
zip.write_all(document_xml.as_bytes()).unwrap();
zip.start_file("word/_rels/document.xml.rels", options)
.unwrap();
zip.write_all(document_rels.as_bytes()).unwrap();
// No `word/charts/chart1.xml` entry is written: the relationship dangles.
let data = zip.finish().unwrap().into_inner();
let mut parser = DocxParser::from_bytes(data).unwrap();
let err = parser
.parse()
.expect_err("missing referenced chart part must surface");
match err {
Error::MissingComponent(path) => assert_eq!(path, "word/charts/chart1.xml"),
other => panic!("expected MissingComponent, got {other:?}"),
}
}
/// A body run that mixes a legitimate XML entity (`&amp;`) with an unknown one
/// (`&bogus;`) is expected to yield plain text in which the legitimate entity
/// is decoded to `&` while the unknown reference is kept verbatim.
#[test]
fn test_docx_body_mixed_entities_preserve_legitimate_and_malformed() {
    use std::io::Write;
    let mut buf = Vec::new();
    {
        // The writer borrows `buf`; the inner scope ends that borrow before
        // `buf` is handed to the parser.
        let cursor = std::io::Cursor::new(&mut buf);
        let mut zip = zip::ZipWriter::new(cursor);
        let options = zip::write::SimpleFileOptions::default()
            .compression_method(zip::CompressionMethod::Stored);
        zip.start_file("[Content_Types].xml", options).unwrap();
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#).unwrap();
        zip.start_file("_rels/.rels", options).unwrap();
        zip.write_all(br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#).unwrap();
        zip.start_file("word/document.xml", options).unwrap();
        // `&amp;` is the legitimate (predefined) entity; `&bogus;` is not
        // declared anywhere and is therefore the malformed reference.
        zip.write_all(
            br#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>A &amp; B &bogus; C</w:t></w:r></w:p>
</w:body>
</w:document>"#,
        )
        .unwrap();
        zip.finish().unwrap();
    }
    let mut parser = DocxParser::from_bytes(buf).expect("parser opens");
    let doc = parser.parse().expect("document parses");
    let text = doc.plain_text();
    assert!(
        text.contains("A & B &bogus; C"),
        "expected legitimate entity decoded and malformed preserved; got {text:?}"
    );
    // Check for the *escaped* form here: the decoded "A & B" is a prefix of the
    // string required above, so negating it could never hold.
    assert!(
        !text.contains("A &amp; B"),
        "legitimate entity must not remain escaped; got {text:?}"
    );
}
}