/// Declares a mutable `ConversionContext` named `$ctx` for unit tests.
///
/// The context is backed by a default `rs_docx::Docx`, an empty relationship map,
/// default `ConvertOptions`, an image extractor built with `ImageExtractor::new_skip()`
/// and no comments, footnotes or endnotes.
///
/// The macro expands to plain `let` statements, so the fixtures the context borrows
/// from live in the caller's scope alongside `$ctx`.
#[cfg(test)]
macro_rules! make_test_context {
    ($ctx:ident) => {
        // Backing fixtures; the context only holds references to these.
        let fixture_docx = rs_docx::Docx::default();
        let fixture_rels = std::collections::HashMap::<String, String>::new();
        let mut fixture_numbering = $crate::converter::NumberingResolver::new(&fixture_docx);
        let mut fixture_images = $crate::converter::ImageExtractor::new_skip();
        let fixture_options = $crate::ConvertOptions::default();
        let fixture_styles = $crate::converter::StyleResolver::new(&fixture_docx.styles);

        // The three `None`s are the comments, footnotes and endnotes sources.
        let mut $ctx = $crate::converter::ConversionContext::new(
            &fixture_rels,
            &mut fixture_numbering,
            &mut fixture_images,
            &fixture_options,
            None,
            None,
            None,
            &fixture_styles,
        );
    };
}
/// Declares a mutable `ConversionContext` named `$ctx` for unit tests, with exactly one
/// fixture supplied by the caller.
///
/// Supported forms:
///
/// * `make_test_context_ext!(ctx, docx: <expr>)` uses the given document, including its
///   comments, footnotes and endnotes.
/// * `make_test_context_ext!(ctx, options: <expr>)` uses the given options.
/// * `make_test_context_ext!(ctx, rels: <expr>)` uses the given relationship map.
///
/// Every fixture that is not supplied is built the same way in each arm: a default
/// `rs_docx::Docx`, an empty relationship map, default `ConvertOptions`, and an image
/// extractor from `ImageExtractor::new_skip()`.
#[cfg(test)]
macro_rules! make_test_context_ext {
    // Caller-supplied document: note sources are read from that document.
    ($ctx:ident, docx: $docx_expr:expr) => {
        let fixture_docx = $docx_expr;
        let fixture_rels = std::collections::HashMap::<String, String>::new();
        let mut fixture_numbering = $crate::converter::NumberingResolver::new(&fixture_docx);
        let mut fixture_images = $crate::converter::ImageExtractor::new_skip();
        let fixture_options = $crate::ConvertOptions::default();
        let fixture_styles = $crate::converter::StyleResolver::new(&fixture_docx.styles);
        let mut $ctx = $crate::converter::ConversionContext::new(
            &fixture_rels,
            &mut fixture_numbering,
            &mut fixture_images,
            &fixture_options,
            fixture_docx.comments.as_ref(),
            fixture_docx.footnotes.as_ref(),
            fixture_docx.endnotes.as_ref(),
            &fixture_styles,
        );
    };
    // Caller-supplied options on top of a default document.
    ($ctx:ident, options: $opts_expr:expr) => {
        let fixture_docx = rs_docx::Docx::default();
        let fixture_rels = std::collections::HashMap::<String, String>::new();
        let mut fixture_numbering = $crate::converter::NumberingResolver::new(&fixture_docx);
        let mut fixture_images = $crate::converter::ImageExtractor::new_skip();
        let fixture_options = $opts_expr;
        let fixture_styles = $crate::converter::StyleResolver::new(&fixture_docx.styles);
        let mut $ctx = $crate::converter::ConversionContext::new(
            &fixture_rels,
            &mut fixture_numbering,
            &mut fixture_images,
            &fixture_options,
            None,
            None,
            None,
            &fixture_styles,
        );
    };
    // Caller-supplied relationship map on top of a default document.
    ($ctx:ident, rels: $rels_expr:expr) => {
        let fixture_docx = rs_docx::Docx::default();
        let fixture_rels = $rels_expr;
        let mut fixture_numbering = $crate::converter::NumberingResolver::new(&fixture_docx);
        let mut fixture_images = $crate::converter::ImageExtractor::new_skip();
        let fixture_options = $crate::ConvertOptions::default();
        let fixture_styles = $crate::converter::StyleResolver::new(&fixture_docx.styles);
        let mut $ctx = $crate::converter::ConversionContext::new(
            &fixture_rels,
            &mut fixture_numbering,
            &mut fixture_images,
            &fixture_options,
            None,
            None,
            None,
            &fixture_styles,
        );
    };
}
mod image;
mod numbering;
mod paragraph;
mod run;
mod context;
mod styles;
mod table;
mod table_grid;
use crate::adapters::docx::{AstExtractor, DocxExtractor};
#[cfg(test)]
use crate::render::escape_html_attr;
use crate::render::{MarkdownRenderer, Renderer};
use crate::{error::Error, ConvertOptions, ImageHandling, Result};
#[cfg(test)]
use rs_docx::document::BodyContent;
use rs_docx::DocxFile;
use std::collections::HashMap;
use std::path::Path;
pub use self::context::ConversionContext;
pub use self::image::ImageExtractor;
pub use self::numbering::NumberingResolver;
pub use self::paragraph::ParagraphConverter;
pub use self::run::RunConverter;
pub use self::styles::StyleResolver;
pub use self::table::TableConverter;
/// Converter that parses a DOCX document and renders it to a `String`.
///
/// `E` is the extractor that turns the parsed document body into a document AST and
/// `R` is the renderer that turns that AST into the output string. They default to
/// `DocxExtractor` and `MarkdownRenderer`.
pub struct DocxToMarkdown<E = DocxExtractor, R = MarkdownRenderer> {
    /// Conversion settings consulted by the `convert*` methods.
    options: ConvertOptions,
    /// Produces the document AST from the parsed body.
    extractor: E,
    /// Produces the final output string from the document AST.
    renderer: R,
}
impl DocxToMarkdown<DocxExtractor, MarkdownRenderer> {
pub fn new(options: ConvertOptions) -> Self {
Self {
options,
extractor: DocxExtractor,
renderer: MarkdownRenderer,
}
}
pub fn with_defaults() -> Self {
Self::new(ConvertOptions::default())
}
pub fn builder() -> crate::Builder {
crate::Builder::new()
}
}
impl<E, R> DocxToMarkdown<E, R>
where
E: AstExtractor,
R: Renderer,
{
pub fn with_components(options: ConvertOptions, extractor: E, renderer: R) -> Self {
Self {
options,
extractor,
renderer,
}
}
pub fn convert<P: AsRef<Path>>(&self, path: P) -> Result<String> {
let path = path.as_ref();
let docx_file =
DocxFile::from_file(path).map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
let docx = docx_file
.parse()
.map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
let mut image_extractor = match &self.options.image_handling {
ImageHandling::SaveToDir(dir) => ImageExtractor::new_with_dir(path, dir.clone())?,
ImageHandling::Inline => ImageExtractor::new_inline(path)?,
ImageHandling::Skip => ImageExtractor::new_skip(),
};
self.convert_inner(&docx, &mut image_extractor)
}
pub fn convert_bytes(&self, bytes: &[u8]) -> Result<String> {
let reader = std::io::Cursor::new(bytes);
let docx_file =
DocxFile::from_reader(reader).map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
let docx = docx_file
.parse()
.map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
let mut image_extractor = match &self.options.image_handling {
ImageHandling::SaveToDir(dir) => {
ImageExtractor::new_with_dir_from_bytes(bytes, dir.clone())?
}
ImageHandling::Inline => ImageExtractor::new_inline_from_bytes(bytes)?,
ImageHandling::Skip => ImageExtractor::new_skip(),
};
self.convert_inner(&docx, &mut image_extractor)
}
/// Converts a DOCX archive supplied through a reader.
///
/// The reader is drained with `read_to_end` into a buffer, which is then handed to
/// `convert_bytes`. The `Seek` bound is part of the signature, but this method never
/// seeks.
///
/// # Errors
///
/// Propagates the I/O error from reading and any error from `convert_bytes`.
pub fn convert_reader(&self, mut reader: impl std::io::Read + std::io::Seek) -> Result<String> {
    let mut buffer = Vec::new();
    std::io::Read::read_to_end(&mut reader, &mut buffer)?;
    self.convert_bytes(buffer.as_slice())
}
#[deprecated(since = "0.5.0", note = "Use `convert_bytes` instead")]
pub fn convert_from_bytes(&self, bytes: &[u8]) -> Result<String> {
self.convert_bytes(bytes)
}
fn convert_inner<'a>(
&'a self,
docx: &'a rs_docx::Docx,
image_extractor: &'a mut ImageExtractor,
) -> Result<String> {
let rels = self.build_relationship_map(docx);
let mut numbering_resolver = NumberingResolver::new(docx);
let style_resolver = StyleResolver::new(&docx.styles);
let mut context = ConversionContext::new(
&rels,
&mut numbering_resolver,
image_extractor,
&self.options,
docx.comments.as_ref(),
docx.footnotes.as_ref(),
docx.endnotes.as_ref(),
&style_resolver,
);
let mut document = self
.extractor
.extract(&docx.document.body.content, &mut context)?;
document.references = context.reference_definitions();
if self.options.strict_reference_validation {
let missing = context.take_missing_references();
if !missing.is_empty() {
return Err(Error::MissingReference(missing.join(", ")));
}
}
self.renderer.render(&document)
}
/// Test-only helper that converts a single body element straight to text.
///
/// Paragraphs, tables and runs are each followed by a blank line (`"\n\n"`). Empty
/// paragraphs and empty runs are skipped, whereas tables are always emitted. Table
/// cells are converted item by item, structured document tags recurse into their
/// children, and a named bookmark start becomes an `<a id="..."></a>` anchor with no
/// trailing blank line. Every other element yields nothing.
#[cfg(test)]
fn convert_content<'a>(
    content: &BodyContent<'a>,
    context: &mut ConversionContext<'a>,
) -> Result<String> {
    use rs_docx::document::TableCellContent;

    // Appends `text` followed by the block separator.
    fn push_block(buffer: &mut String, text: &str) {
        buffer.push_str(text);
        buffer.push_str("\n\n");
    }

    let mut rendered = String::new();
    match content {
        BodyContent::Paragraph(paragraph) => {
            let text = ParagraphConverter::convert(paragraph, context)?;
            if !text.is_empty() {
                push_block(&mut rendered, &text);
            }
        }
        BodyContent::Table(table) => {
            // Tables are emitted even when the converted text is empty.
            let text = TableConverter::convert(table, context)?;
            push_block(&mut rendered, &text);
        }
        BodyContent::Run(run) => {
            let text = RunConverter::convert(run, context, None)?;
            if !text.is_empty() {
                push_block(&mut rendered, &text);
            }
        }
        BodyContent::TableCell(cell) => {
            for entry in &cell.content {
                match entry {
                    TableCellContent::Paragraph(paragraph) => {
                        let text = ParagraphConverter::convert(paragraph, context)?;
                        if !text.is_empty() {
                            push_block(&mut rendered, &text);
                        }
                    }
                    TableCellContent::Table(table) => {
                        let text = TableConverter::convert(table, context)?;
                        push_block(&mut rendered, &text);
                    }
                }
            }
        }
        BodyContent::Sdt(sdt) => {
            if let Some(inner) = sdt.content.as_ref() {
                for child in &inner.content {
                    let nested = Self::convert_content(child, context)?;
                    rendered.push_str(&nested);
                }
            }
        }
        BodyContent::BookmarkStart(bookmark) => {
            if let Some(name) = &bookmark.name {
                let anchor = format!("<a id=\"{}\"></a>", escape_html_attr(name));
                rendered.push_str(&anchor);
            }
        }
        _ => {}
    }
    Ok(rendered)
}
/// Collects the document's relationships into an id -> target lookup table.
///
/// Returns an empty map when the document carries no relationship part. If an id
/// occurs more than once, the last occurrence wins.
fn build_relationship_map(&self, docx: &rs_docx::Docx) -> HashMap<String, String> {
    match &docx.document_rels {
        Some(doc_rels) => doc_rels
            .relationships
            .iter()
            .map(|rel| (rel.id.to_string(), rel.target.to_string()))
            .collect(),
        None => HashMap::new(),
    }
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::core::ast::{BlockNode, DocumentAst};
use rs_docx::document::{
BodyContent, BookmarkStart, EndNote, EndNotes, FootNote, FootNotes, Paragraph, Run,
RunContent, SDTContent, SDT, TableCell, Text,
};
use std::borrow::Cow;
use std::collections::HashMap;
use std::path::PathBuf;
use std::time::{SystemTime, UNIX_EPOCH};
/// Builds a path for a scratch `.docx` file inside the system temp directory.
///
/// The file name combines `prefix`, the current process id and the current time in
/// nanoseconds since the Unix epoch.
///
/// # Panics
///
/// Panics if the system clock reports a time before the Unix epoch.
fn temp_docx_path(prefix: &str) -> PathBuf {
    let since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .expect("system time must be after UNIX_EPOCH");
    let file_name = format!(
        "undocx_converter_{}_{}_{}.docx",
        prefix,
        std::process::id(),
        since_epoch.as_nanos()
    );

    let mut path = std::env::temp_dir();
    path.push(file_name);
    path
}
/// Extractor stub that ignores the document body, registers footnote id `1` with the
/// context and returns an AST holding a single paragraph block.
#[derive(Debug, Default, Clone, Copy)]
struct FakeExtractor;

impl AstExtractor for FakeExtractor {
    fn extract<'a>(
        &self,
        _body: &[BodyContent<'a>],
        context: &mut ConversionContext<'a>,
    ) -> Result<DocumentAst> {
        // Only the registration matters here; the returned marker is not needed.
        let _ = context.register_footnote_reference(1);

        let blocks = vec![BlockNode::Paragraph(String::from("custom block"))];
        Ok(DocumentAst {
            blocks,
            references: Default::default(),
        })
    }
}
/// Extractor stub that registers footnote id `999` with the context and returns a
/// default (empty) AST.
#[derive(Debug, Default, Clone, Copy)]
struct MissingRefExtractor;

impl AstExtractor for MissingRefExtractor {
    fn extract<'a>(
        &self,
        _body: &[BodyContent<'a>],
        context: &mut ConversionContext<'a>,
    ) -> Result<DocumentAst> {
        // Only the registration matters here; the returned marker is not needed.
        let _ = context.register_footnote_reference(999);

        let empty = DocumentAst::default();
        Ok(empty)
    }
}
/// Extractor stub that registers comment id `"404"` with the context and returns a
/// default (empty) AST.
#[derive(Debug, Default, Clone, Copy)]
struct MissingCommentExtractor;

impl AstExtractor for MissingCommentExtractor {
    fn extract<'a>(
        &self,
        _body: &[BodyContent<'a>],
        context: &mut ConversionContext<'a>,
    ) -> Result<DocumentAst> {
        // Only the registration matters here; the returned marker is not needed.
        let _ = context.register_comment_reference("404");

        let empty = DocumentAst::default();
        Ok(empty)
    }
}
/// Extractor stub that registers endnote id `404` with the context and returns a
/// default (empty) AST.
#[derive(Debug, Default, Clone, Copy)]
struct MissingEndnoteExtractor;

impl AstExtractor for MissingEndnoteExtractor {
    fn extract<'a>(
        &self,
        _body: &[BodyContent<'a>],
        context: &mut ConversionContext<'a>,
    ) -> Result<DocumentAst> {
        // Only the registration matters here; the returned marker is not needed.
        let _ = context.register_endnote_reference(404);

        let empty = DocumentAst::default();
        Ok(empty)
    }
}
/// Renderer stub that summarises the AST instead of rendering it.
///
/// The output has the form `blocks=<n>;footnotes=<m>;first=<text>`, where `<text>` is
/// the first footnote definition, or empty when there is none.
#[derive(Debug, Default, Clone, Copy)]
struct FakeRenderer;

impl Renderer for FakeRenderer {
    fn render(&self, document: &DocumentAst) -> Result<String> {
        let footnotes = &document.references.footnotes;
        let first = match footnotes.first() {
            Some(text) => text.as_str(),
            None => "",
        };

        let summary = format!(
            "blocks={};footnotes={};first={}",
            document.blocks.len(),
            footnotes.len(),
            first
        );
        Ok(summary)
    }
}
/// A structured document tag containing a bookmark and a paragraph must yield both the
/// anchor and the paragraph text.
#[test]
fn test_convert_content_sdt_with_bookmark() {
    use rs_docx::document::ParagraphContent;

    make_test_context!(context);

    // Paragraph with a single text run.
    let mut text_run = Run::default();
    text_run.content.push(RunContent::Text(Text {
        text: "Content".into(),
        ..Default::default()
    }));
    let mut paragraph = Paragraph::default();
    paragraph.content.push(ParagraphContent::Run(text_run));

    let anchor = BookmarkStart {
        name: Some(Cow::Borrowed("TestAnchor")),
        ..Default::default()
    };

    // The bookmark comes first, then the paragraph.
    let mut inner = SDTContent::default();
    inner.content.push(BodyContent::BookmarkStart(anchor));
    inner.content.push(BodyContent::Paragraph(paragraph));

    let mut sdt = SDT::default();
    sdt.content = Some(inner);

    let element = BodyContent::Sdt(sdt);
    let rendered =
        DocxToMarkdown::<DocxExtractor, MarkdownRenderer>::convert_content(&element, &mut context)
            .unwrap();

    assert!(rendered.contains("<a id=\"TestAnchor\"></a>"));
    assert!(rendered.contains("Content"));
}
/// Registering the same footnote, endnote or comment id twice must return the same
/// marker and count the reference only once.
#[test]
fn test_reference_registration_deduplicates_ids() {
    let styles = rs_docx::styles::Styles::new();
    let docx = rs_docx::Docx::default();
    let rels = HashMap::new();
    let options = ConvertOptions::default();
    let mut numbering = NumberingResolver::new(&docx);
    let mut images = ImageExtractor::new_skip();
    let style_resolver = StyleResolver::new(&styles);
    let mut context = ConversionContext::new(
        &rels,
        &mut numbering,
        &mut images,
        &options,
        None,
        None,
        None,
        &style_resolver,
    );

    // Footnotes.
    for _ in 0..2 {
        assert_eq!(context.register_footnote_reference(42), "[^1]");
    }
    assert_eq!(context.footnote_count(), 1);

    // Endnotes.
    for _ in 0..2 {
        assert_eq!(context.register_endnote_reference(7), "[^en1]");
    }
    assert_eq!(context.endnote_count(), 1);

    // Comments.
    for _ in 0..2 {
        assert_eq!(context.register_comment_reference("3"), "[^c3]");
    }
    assert_eq!(context.comment_count(), 1);
}
/// A converter built with `with_components` must run the injected extractor and
/// renderer, and the footnote registered by the extractor must reach the renderer.
#[test]
fn test_with_components_uses_custom_extractor_and_renderer() {
    let note = FootNote {
        id: Some(1),
        content: vec![BodyContent::Paragraph(
            Paragraph::default().push_text("Injected note"),
        )],
        ..Default::default()
    };
    let docx = rs_docx::Docx {
        footnotes: Some(FootNotes {
            content: vec![note],
        }),
        ..Default::default()
    };

    let converter =
        DocxToMarkdown::with_components(ConvertOptions::default(), FakeExtractor, FakeRenderer);
    let mut images = ImageExtractor::new_skip();

    let output = converter
        .convert_inner(&docx, &mut images)
        .expect("conversion should succeed");

    assert_eq!(output, "blocks=1;footnotes=1;first=Injected note");
}
/// With strict reference validation enabled, a footnote reference that the document
/// does not define must fail the conversion.
#[test]
fn test_with_components_respects_strict_reference_validation() {
    let strict_options = ConvertOptions {
        strict_reference_validation: true,
        ..Default::default()
    };
    let converter =
        DocxToMarkdown::with_components(strict_options, MissingRefExtractor, FakeRenderer);

    let docx = rs_docx::Docx::default();
    let mut images = ImageExtractor::new_skip();
    let failure = converter
        .convert_inner(&docx, &mut images)
        .expect_err("strict validation should fail on missing references");

    if let Error::MissingReference(message) = &failure {
        assert!(message.contains("footnote:999"));
    } else {
        panic!("unexpected error: {:?}", failure);
    }
}
/// `convert_bytes` must drive the injected extractor and renderer when given the bytes
/// of a generated default document.
#[test]
fn test_with_components_convert_bytes_uses_custom_pipeline() {
    // Write a default document to a scratch file only to obtain its bytes.
    let scratch = temp_docx_path("bytes");
    rs_docx::Docx::default()
        .write_file(&scratch)
        .expect("failed to write generated docx");
    let payload = std::fs::read(&scratch).expect("failed to read generated docx");
    // Best-effort cleanup; a leftover scratch file does not affect the test.
    let _ = std::fs::remove_file(&scratch);

    let converter =
        DocxToMarkdown::with_components(ConvertOptions::default(), FakeExtractor, FakeRenderer);
    let output = converter
        .convert_bytes(&payload)
        .expect("conversion from bytes should succeed");

    assert_eq!(output, "blocks=1;footnotes=1;first=");
}
/// With strict reference validation enabled, a comment reference that the document
/// does not define must fail the conversion.
#[test]
fn test_with_components_strict_validation_fails_for_missing_comment() {
    let strict_options = ConvertOptions {
        strict_reference_validation: true,
        ..Default::default()
    };
    let converter =
        DocxToMarkdown::with_components(strict_options, MissingCommentExtractor, FakeRenderer);

    let docx = rs_docx::Docx::default();
    let mut images = ImageExtractor::new_skip();
    let failure = converter
        .convert_inner(&docx, &mut images)
        .expect_err("strict validation should fail on missing comment");

    if let Error::MissingReference(message) = &failure {
        assert!(message.contains("comment:404"));
    } else {
        panic!("unexpected error: {:?}", failure);
    }
}
/// With strict reference validation enabled, an endnote reference whose id is absent
/// from the document's endnotes must fail the conversion, even though another endnote
/// exists.
#[test]
fn test_with_components_strict_validation_fails_for_missing_endnote() {
    // The document defines endnote 1 only; the extractor references 404.
    let existing = EndNote {
        id: Some(1),
        content: vec![BodyContent::Paragraph(
            Paragraph::default().push_text("existing endnote"),
        )],
        ..Default::default()
    };
    let docx = rs_docx::Docx {
        endnotes: Some(EndNotes {
            content: vec![existing],
        }),
        ..Default::default()
    };

    let strict_options = ConvertOptions {
        strict_reference_validation: true,
        ..Default::default()
    };
    let converter =
        DocxToMarkdown::with_components(strict_options, MissingEndnoteExtractor, FakeRenderer);

    let mut images = ImageExtractor::new_skip();
    let failure = converter
        .convert_inner(&docx, &mut images)
        .expect_err("strict validation should fail on missing endnote");

    if let Error::MissingReference(message) = &failure {
        assert!(message.contains("endnote:404"));
    } else {
        panic!("unexpected error: {:?}", failure);
    }
}
/// A run that appears directly in the document body must be rendered as its text
/// followed by a blank line.
#[test]
fn test_convert_content_body_run_is_rendered() {
    // Use the module's shared fixture macro instead of hand-building the context, so
    // this test cannot drift from the other tests when `ConversionContext::new` changes.
    make_test_context!(context);

    let mut run = Run::default();
    run.content.push(RunContent::Text(Text {
        text: "loose run".into(),
        ..Default::default()
    }));

    let output = DocxToMarkdown::<DocxExtractor, MarkdownRenderer>::convert_content(
        &BodyContent::Run(run),
        &mut context,
    )
    .expect("conversion failed");

    assert_eq!(output, "loose run\n\n");
}
/// A table cell that appears directly in the document body must be rendered as the
/// text of its paragraph followed by a blank line.
#[test]
fn test_convert_content_body_table_cell_is_rendered() {
    // Use the module's shared fixture macro instead of hand-building the context, so
    // this test cannot drift from the other tests when `ConversionContext::new` changes.
    make_test_context!(context);

    let cell = TableCell::paragraph(Paragraph::default().push_text("cell text"));

    let output = DocxToMarkdown::<DocxExtractor, MarkdownRenderer>::convert_content(
        &BodyContent::TableCell(cell),
        &mut context,
    )
    .expect("conversion failed");

    assert_eq!(output, "cell text\n\n");
}
}