use std::collections::HashMap;
use crate::config::ConvertOptions;
use crate::error::{ConvertError, ConvertWarning};
/// Depth limit of 64 for tables. Nothing in this file's visible code reads it.
/// NOTE(review): presumably the nesting cap consulted by the table conversion in
/// `docx_tables.rs` (callers here pass a starting depth of 0 to `convert_table`) — confirm.
const MAX_TABLE_DEPTH: usize = 64;
use crate::ir::{
Alignment, Block, BorderLineStyle, BorderSide, CellBorder, CellVerticalAlign, Color,
ColumnLayout, Document, FloatingImage, FloatingTextBox, ImageData, ImageFormat, Insets,
LineSpacing, Page, Paragraph, ParagraphStyle, Run, StyleSheet, TabAlignment, TabLeader,
TabStop, Table, TableCell, TableRow, TextDirection, TextStyle, VerticalTextAlign,
};
use crate::parser::Parser;
#[cfg(test)]
use self::contexts::scan_table_headers;
use self::contexts::{
BidiContext, ChartContext, DocxConversionContext, DrawingTextBoxContext, DrawingTextBoxInfo,
MathContext, NoteContext, SmallCapsContext, TableHeaderContext, VmlTextBoxContext,
VmlTextBoxInfo, WrapContext, build_chart_context_from_xml, build_math_context_from_xml,
build_note_context_from_xml, build_wrap_context_from_xml,
extract_column_layout_from_section_property, is_note_reference_run, read_zip_text,
scan_column_layouts,
};
use self::lists::{
NumberingMap, TaggedElement, build_numbering_map, extract_num_info, group_into_lists,
};
use self::media::{
extract_drawing_image, extract_drawing_text_box_blocks, extract_shape_image,
extract_vml_shape_text_box,
};
#[cfg(test)]
use self::sections::extract_page_size;
use self::sections::{
HeaderFooterAssets, build_flow_page_from_section, build_header_footer_assets,
};
use self::styles::{
DOC_DEFAULT_STYLE_ID, ResolvedStyle, StyleMap, TabStopOverride, apply_tab_stop_overrides,
build_style_map, get_paragraph_style_id, merge_paragraph_style, merge_text_style,
};
use self::tables::convert_table;
use self::text::{
extract_doc_default_text_style, extract_paragraph_style, extract_run_style, extract_run_text,
extract_run_text_skip_column_breaks, extract_tab_stop_overrides, is_column_break,
parse_hex_color, resolve_hyperlink_url,
};
#[cfg(test)]
use self::text::{extract_tab_stops, resolve_highlight_color};
#[path = "docx_contexts.rs"]
mod contexts;
#[path = "docx_lists.rs"]
mod lists;
#[path = "docx_media.rs"]
mod media;
#[path = "docx_sections.rs"]
mod sections;
#[path = "docx_styles.rs"]
mod styles;
#[path = "docx_tables.rs"]
mod tables;
#[path = "docx_text.rs"]
mod text;
/// Stateless DOCX parser. Its `Parser` implementation turns raw DOCX bytes into the
/// crate's `Document` IR together with the warnings collected during conversion.
pub struct DocxParser;
/// Lookup filled by `build_image_map`: image id -> the bytes of that entry's `png` payload.
type ImageMap = HashMap<String, Vec<u8>>;
/// Lookup filled by `build_hyperlink_map`: hyperlink relationship id -> target URL.
type HyperlinkMap = HashMap<String, String>;
fn build_hyperlink_map(docx: &docx_rs::Docx) -> HyperlinkMap {
docx.hyperlinks
.iter()
.map(|(rid, url, _type)| (rid.clone(), url.clone()))
.collect()
}
fn build_image_map(docx: &docx_rs::Docx) -> ImageMap {
docx.images
.iter()
.map(|(id, _path, _image, png)| (id.clone(), png.0.clone()))
.collect()
}
/// Everything `build_zip_preparse_assets` gathers directly from the DOCX zip archive,
/// bundled so `DocxParser::parse` can destructure it in one step.
struct ZipPreParseAssets {
/// Metadata extracted from the archive (the default value when the zip cannot be opened).
metadata: crate::ir::Metadata,
/// Per-document lookup contexts (notes, wraps, text boxes, table headers, bidi,
/// small caps) handed to the paragraph/table converters.
ctx: DocxConversionContext,
/// Math equations; `parse` takes them per body-child index and emits `Block::MathEquation`.
math: MathContext,
/// Charts; `parse` takes them per body-child index and emits `Block::Chart`.
chart_ctx: ChartContext,
/// Column layouts scanned from `word/document.xml`, indexed by section order in `parse`.
column_layouts: Vec<Option<ColumnLayout>>,
/// Header/footer data passed to `build_flow_page_from_section` for every page.
header_footer_assets: HeaderFooterAssets,
}
/// Runs the raw-zip pre-pass over the DOCX bytes and bundles the results.
///
/// If the archive cannot be opened, the error is discarded and an all-empty bundle is
/// returned (default metadata, empty contexts, no column layouts, default header/footer
/// assets). Otherwise `word/document.xml` is read once and fed to each context builder;
/// builders that need other archive entries also receive the archive itself.
fn build_zip_preparse_assets(data: &[u8]) -> ZipPreParseAssets {
    let Ok(mut archive) = crate::parser::open_zip(data) else {
        // Best-effort fallback: every context is built as if no XML were available.
        return ZipPreParseAssets {
            metadata: crate::ir::Metadata::default(),
            ctx: DocxConversionContext {
                notes: NoteContext::empty(),
                wraps: WrapContext::empty(),
                drawing_text_boxes: DrawingTextBoxContext::from_xml(None),
                table_headers: TableHeaderContext::from_xml(None),
                vml_text_boxes: VmlTextBoxContext::from_xml(None),
                bidi: BidiContext::from_xml(None),
                small_caps: SmallCapsContext::from_xml(None),
            },
            math: MathContext::empty(),
            chart_ctx: ChartContext::empty(),
            column_layouts: Vec::new(),
            header_footer_assets: HeaderFooterAssets::default(),
        };
    };

    let metadata = crate::parser::metadata::extract_metadata_from_zip(&mut archive);
    let doc_xml = read_zip_text(&mut archive, "word/document.xml");
    // Borrowed view of the main document XML, shared by all builders below.
    let xml = doc_xml.as_deref();

    // The builders run in a fixed order; several of them read from `archive`.
    let notes = build_note_context_from_xml(xml, &mut archive);
    let wraps = build_wrap_context_from_xml(xml);
    let drawing_text_boxes = DrawingTextBoxContext::from_xml(xml);
    let table_headers = TableHeaderContext::from_xml(xml);
    let vml_text_boxes = VmlTextBoxContext::from_xml(xml);
    let math = build_math_context_from_xml(xml);
    let chart_ctx = build_chart_context_from_xml(xml, &mut archive);
    let column_layouts = xml.map(scan_column_layouts).unwrap_or_default();
    let bidi = BidiContext::from_xml(xml);
    let small_caps = SmallCapsContext::from_xml(xml);
    let header_footer_assets = build_header_footer_assets(&mut archive);

    ZipPreParseAssets {
        metadata,
        ctx: DocxConversionContext {
            notes,
            wraps,
            drawing_text_boxes,
            table_headers,
            vml_text_boxes,
            bidi,
            small_caps,
        },
        math,
        chart_ctx,
        column_layouts,
        header_footer_assets,
    }
}
impl Parser for DocxParser {
fn parse(
&self,
data: &[u8],
_options: &ConvertOptions,
) -> Result<(Document, Vec<ConvertWarning>), ConvertError> {
let ZipPreParseAssets {
metadata,
mut ctx,
mut math,
mut chart_ctx,
column_layouts,
header_footer_assets,
} = build_zip_preparse_assets(data);
let docx = docx_rs::read_docx(data).map_err(|e| {
crate::parser::parse_err(format!("Failed to parse DOCX (docx-rs): {e}"))
})?;
ctx.notes.populate_style_ids(&docx.styles);
let images = build_image_map(&docx);
let hyperlinks = build_hyperlink_map(&docx);
let numberings = build_numbering_map(&docx.numberings);
let style_map = build_style_map(&docx.styles);
let mut warnings: Vec<ConvertWarning> = Vec::new();
let mut elements: Vec<TaggedElement> = Vec::new();
let mut pages: Vec<Page> = Vec::new();
let mut section_layout_index: usize = 0;
for (idx, child) in docx.document.children.iter().enumerate() {
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| match child {
docx_rs::DocumentChild::Paragraph(para) => {
let mut tagged = vec![convert_paragraph_element(
para,
&images,
&hyperlinks,
&style_map,
&ctx,
)];
let eqs = math.take(idx);
for eq in eqs {
tagged.push(TaggedElement::Plain(vec![Block::MathEquation(eq)]));
}
let chs = chart_ctx.take(idx);
for ch in chs {
tagged.push(TaggedElement::Plain(vec![Block::Chart(ch)]));
}
tagged
}
docx_rs::DocumentChild::Table(table) => {
vec![TaggedElement::Plain(vec![Block::Table(convert_table(
table,
&images,
&hyperlinks,
&style_map,
&ctx,
0,
))])]
}
docx_rs::DocumentChild::StructuredDataTag(sdt) => {
convert_sdt_children(sdt, &images, &hyperlinks, &style_map, &ctx)
}
_ => vec![TaggedElement::Plain(vec![])],
}));
match result {
Ok(elems) => elements.extend(elems),
Err(panic_info) => {
let detail = if let Some(s) = panic_info.downcast_ref::<String>() {
s.clone()
} else if let Some(s) = panic_info.downcast_ref::<&str>() {
(*s).to_string()
} else {
"unknown panic".to_string()
};
warnings.push(ConvertWarning::ParseSkipped {
format: "DOCX".to_string(),
reason: format!(
"upstream panic caught (docx-rs): element at index {idx}: {detail}"
),
});
}
}
if let docx_rs::DocumentChild::Paragraph(para) = child
&& let Some(section_prop) = para.property.section_property.as_ref()
{
let column_layout = match column_layouts.get(section_layout_index) {
Some(layout) => layout.clone(),
None => extract_column_layout_from_section_property(section_prop),
};
pages.push(Page::Flow(build_flow_page_from_section(
section_prop,
std::mem::take(&mut elements),
&numberings,
&header_footer_assets,
column_layout,
&mut warnings,
)));
section_layout_index += 1;
}
}
let final_column_layout = match column_layouts.get(section_layout_index) {
Some(layout) => layout.clone(),
None => extract_column_layout_from_section_property(&docx.document.section_property),
};
pages.push(Page::Flow(build_flow_page_from_section(
&docx.document.section_property,
elements,
&numberings,
&header_footer_assets,
final_column_layout,
&mut warnings,
)));
Ok((
Document {
metadata,
pages,
styles: StyleSheet::default(),
},
warnings,
))
}
}
/// Flattens a structured data tag into the tagged elements of its children.
///
/// Paragraphs go through `convert_paragraph_element`, tables are converted with a
/// starting depth of 0 and wrapped as plain table blocks, nested tags are flattened
/// recursively, and every other child kind is dropped. Children are processed in
/// document order.
fn convert_sdt_children(
    sdt: &docx_rs::StructuredDataTag,
    images: &ImageMap,
    hyperlinks: &HyperlinkMap,
    style_map: &StyleMap,
    ctx: &DocxConversionContext,
) -> Vec<TaggedElement> {
    sdt.children
        .iter()
        .flat_map(|child| -> Vec<TaggedElement> {
            match child {
                docx_rs::StructuredDataTagChild::Paragraph(para) => {
                    vec![convert_paragraph_element(
                        para, images, hyperlinks, style_map, ctx,
                    )]
                }
                docx_rs::StructuredDataTagChild::Table(table) => {
                    let converted =
                        convert_table(table, images, hyperlinks, style_map, ctx, 0);
                    vec![TaggedElement::Plain(vec![Block::Table(converted)])]
                }
                docx_rs::StructuredDataTagChild::StructuredDataTag(nested) => {
                    convert_sdt_children(nested, images, hyperlinks, style_map, ctx)
                }
                _ => Vec::new(),
            }
        })
        .collect()
}
/// Converts one paragraph into a [`TaggedElement`].
///
/// * Without numbering info, every block produced for the paragraph is returned as
///   `TaggedElement::Plain`.
/// * With numbering info and exactly one produced block that is a paragraph, the result
///   is `TaggedElement::ListParagraph`.
/// * With numbering info but additional blocks (page/column breaks, images, text boxes,
///   a second paragraph), only one element can be returned, so the list tagging is
///   dropped and all blocks are returned as `Plain` **in document order**. If none of
///   them is a paragraph, an empty default paragraph is appended.
/// * With numbering info and no blocks at all, an empty `Plain` is returned.
///
/// Earlier revisions pulled the first paragraph out and re-appended it after all other
/// blocks, which moved text that precedes a column break or text box to after it.
fn convert_paragraph_element(
    para: &docx_rs::Paragraph,
    images: &ImageMap,
    hyperlinks: &HyperlinkMap,
    style_map: &StyleMap,
    ctx: &DocxConversionContext,
) -> TaggedElement {
    let num_info = extract_num_info(para);
    let mut blocks = Vec::new();
    convert_paragraph_blocks(para, &mut blocks, images, hyperlinks, style_map, ctx);

    let Some(info) = num_info else {
        return TaggedElement::Plain(blocks);
    };

    if blocks.is_empty() {
        return TaggedElement::Plain(blocks);
    }

    // The only shape that can be represented as a list item: a lone paragraph.
    if blocks.len() == 1 {
        match blocks.pop() {
            Some(Block::Paragraph(paragraph)) => {
                return TaggedElement::ListParagraph { info, paragraph };
            }
            // A lone non-paragraph block: put it back and fall through to `Plain`.
            Some(other) => blocks.push(other),
            None => {}
        }
    }

    // Plain fallback keeps the blocks exactly as emitted; a paragraph is guaranteed to
    // be present, as before.
    let has_paragraph = blocks
        .iter()
        .any(|block| matches!(block, Block::Paragraph(_)));
    if !has_paragraph {
        blocks.push(Block::Paragraph(Paragraph {
            style: ParagraphStyle::default(),
            runs: Vec::new(),
        }));
    }
    TaggedElement::Plain(blocks)
}
/// Builds an IR text run, or `None` when `text` is empty.
///
/// The run's explicit style is extracted from `run_property`, `small_caps` is forced to
/// `Some(true)` when `is_small_caps` is set, and the result is merged with
/// `resolved_style`. The produced run never carries a footnote.
fn build_text_run(
    text: String,
    run_property: &docx_rs::RunProperty,
    is_small_caps: bool,
    resolved_style: Option<&ResolvedStyle>,
    href: Option<String>,
) -> Option<Run> {
    (!text.is_empty()).then(|| {
        let mut run_style = extract_run_style(run_property);
        if is_small_caps {
            run_style.small_caps = Some(true);
        }
        let style = merge_text_style(&run_style, resolved_style);
        Run {
            text,
            style,
            href,
            footnote: None,
        }
    })
}
/// Result of scanning one run's children in `extract_run_children_media`.
/// Images found during the scan are not stored here; they are pushed into the
/// caller-supplied `inline_images` vector instead.
struct RunChildrenMedia {
/// True when at least one break child of the run satisfied `is_column_break`.
has_column_break: bool,
/// Blocks produced from drawing and VML shape text boxes, in child order.
text_box_blocks: Vec<Block>,
}
/// Scans a run's children for media and column breaks.
///
/// * Drawing: an extracted image (if any) is pushed to `inline_images`, then the
///   drawing's text-box blocks are collected.
/// * Shape: the next VML text-box entry is consumed from `ctx` for every shape; it
///   becomes a floating text box when `extract_vml_shape_text_box` succeeds, otherwise
///   its own blocks are collected. An extracted shape image (if any) is then pushed to
///   `inline_images`.
/// * Break: sets the column-break flag when `is_column_break` says so.
///
/// All other children are ignored here.
fn extract_run_children_media(
    run: &docx_rs::Run,
    images: &ImageMap,
    hyperlinks: &HyperlinkMap,
    style_map: &StyleMap,
    ctx: &DocxConversionContext,
    inline_images: &mut Vec<Block>,
) -> RunChildrenMedia {
    let mut scan = RunChildrenMedia {
        has_column_break: false,
        text_box_blocks: Vec::new(),
    };

    for run_child in &run.children {
        match run_child {
            docx_rs::RunChild::Drawing(drawing) => {
                if let Some(image) = extract_drawing_image(drawing, images, &ctx.wraps) {
                    inline_images.push(image);
                }
                scan.text_box_blocks.extend(extract_drawing_text_box_blocks(
                    drawing, images, hyperlinks, style_map, ctx,
                ));
            }
            docx_rs::RunChild::Shape(shape) => {
                let vml_text_box: VmlTextBoxInfo = ctx.vml_text_boxes.consume_next();
                match extract_vml_shape_text_box(shape, &vml_text_box) {
                    Some(floating_text_box) => scan
                        .text_box_blocks
                        .push(Block::FloatingTextBox(floating_text_box)),
                    None => scan.text_box_blocks.extend(vml_text_box.into_blocks()),
                }
                if let Some(image) = extract_shape_image(shape, images) {
                    inline_images.push(image);
                }
            }
            docx_rs::RunChild::Break(br) if is_column_break(br) => {
                scan.has_column_break = true;
            }
            _ => {}
        }
    }

    scan
}
/// Appends the text runs of a hyperlink to `runs`, each carrying the resolved URL.
///
/// Only run children are considered. For each one the small-caps state is read from
/// `ctx` before the text is extracted; runs with empty text are skipped (but still
/// consume their small-caps entry).
fn process_hyperlink_runs(
    hyperlink: &docx_rs::Hyperlink,
    hyperlinks: &HyperlinkMap,
    resolved_style: Option<&ResolvedStyle>,
    ctx: &DocxConversionContext,
    runs: &mut Vec<Run>,
) {
    let href = resolve_hyperlink_url(hyperlink, hyperlinks);
    let linked_runs = hyperlink.children.iter().filter_map(|hchild| {
        let docx_rs::ParagraphChild::Run(run) = hchild else {
            return None;
        };
        let small_caps = ctx.small_caps.next_is_small_caps();
        build_text_run(
            extract_run_text(run),
            &run.run_property,
            small_caps,
            resolved_style,
            href.clone(),
        )
    });
    runs.extend(linked_runs);
}
fn convert_paragraph_blocks(
para: &docx_rs::Paragraph,
out: &mut Vec<Block>,
images: &ImageMap,
hyperlinks: &HyperlinkMap,
style_map: &StyleMap,
ctx: &DocxConversionContext,
) {
let is_rtl = ctx.bidi.next_is_bidi();
if para.property.page_break_before == Some(true) {
out.push(Block::PageBreak);
}
let resolved_style = get_paragraph_style_id(¶.property)
.and_then(|id| style_map.get(id))
.or_else(|| style_map.get(DOC_DEFAULT_STYLE_ID));
let mut runs: Vec<Run> = Vec::new();
let mut inline_images: Vec<Block> = Vec::new();
let mut emitted_text_box_blocks: bool = false;
for child in ¶.children {
match child {
docx_rs::ParagraphChild::Run(run) => {
let is_small_caps: bool = ctx.small_caps.next_is_small_caps();
if is_note_reference_run(run, &ctx.notes) {
if let Some(content) = ctx.notes.consume_next() {
runs.push(Run {
text: String::new(),
style: TextStyle::default(),
href: None,
footnote: Some(content),
});
}
continue;
}
let media = extract_run_children_media(
run,
images,
hyperlinks,
style_map,
ctx,
&mut inline_images,
);
if !media.text_box_blocks.is_empty() {
if !runs.is_empty() {
out.append(&mut inline_images);
push_paragraph_from_runs(out, para, resolved_style, is_rtl, &mut runs);
} else if !inline_images.is_empty() {
out.append(&mut inline_images);
}
emitted_text_box_blocks = true;
out.extend(media.text_box_blocks);
}
if media.has_column_break {
if !runs.is_empty() {
out.append(&mut inline_images);
push_paragraph_from_runs(out, para, resolved_style, is_rtl, &mut runs);
}
out.push(Block::ColumnBreak);
let text: String = extract_run_text_skip_column_breaks(run);
if let Some(ir_run) =
build_text_run(text, &run.run_property, is_small_caps, resolved_style, None)
{
runs.push(ir_run);
}
} else {
let text: String = extract_run_text(run);
if let Some(ir_run) =
build_text_run(text, &run.run_property, is_small_caps, resolved_style, None)
{
runs.push(ir_run);
}
}
}
docx_rs::ParagraphChild::Hyperlink(hyperlink) => {
process_hyperlink_runs(hyperlink, hyperlinks, resolved_style, ctx, &mut runs);
}
_ => {}
}
}
out.extend(inline_images);
if !runs.is_empty() || !emitted_text_box_blocks {
push_paragraph_from_runs(out, para, resolved_style, is_rtl, &mut runs);
}
}
fn push_paragraph_from_runs(
out: &mut Vec<Block>,
para: &docx_rs::Paragraph,
resolved_style: Option<&ResolvedStyle>,
is_rtl: bool,
runs: &mut Vec<Run>,
) {
let explicit_para_style = extract_paragraph_style(¶.property);
let explicit_tab_overrides = extract_tab_stop_overrides(¶.property.tabs);
let mut style = merge_paragraph_style(
&explicit_para_style,
explicit_tab_overrides.as_deref(),
resolved_style,
);
if is_rtl {
style.direction = Some(TextDirection::Rtl);
}
out.push(Block::Paragraph(Paragraph {
style,
runs: std::mem::take(runs),
}));
}
#[cfg(test)]
#[path = "docx_tests.rs"]
mod tests;