Skip to main content

dm2xcod/converter/
mod.rs

1//! Converter modules for DOCX to Markdown transformation.
2
3mod hyperlink;
4mod image;
5mod numbering;
6mod paragraph;
7mod run;
8
9mod styles;
10mod table;
11
12use crate::{error::Error, ConvertOptions, ImageHandling, Result};
13use rs_docx::document::BodyContent;
14use rs_docx::DocxFile;
15use std::collections::HashMap;
16use std::path::Path;
17
18pub use self::hyperlink::resolve_hyperlink;
19pub use self::image::ImageExtractor;
20pub use self::numbering::NumberingResolver;
21pub use self::paragraph::ParagraphConverter;
22pub use self::run::RunConverter;
23pub use self::styles::StyleResolver;
24pub use self::table::TableConverter;
25
/// Main converter struct that orchestrates DOCX to Markdown conversion.
///
/// A converter only stores its [`ConvertOptions`]; the same instance can be
/// reused for any number of conversions.
pub struct DocxToMarkdown {
    /// Options applied to every conversion, including how images are handled.
    options: ConvertOptions,
}
30
31impl DocxToMarkdown {
32    /// Creates a new converter with the given options.
33    pub fn new(options: ConvertOptions) -> Self {
34        Self { options }
35    }
36
37    /// Creates a new converter with default options.
38    pub fn with_defaults() -> Self {
39        Self::new(ConvertOptions::default())
40    }
41
42    /// Converts a DOCX file to Markdown.
43    ///
44    /// # Arguments
45    /// * `path` - Path to the DOCX file
46    ///
47    /// # Returns
48    /// The converted Markdown content as a String.
49    /// Converts a DOCX file to Markdown.
50    ///
51    /// # Arguments
52    /// * `path` - Path to the DOCX file
53    ///
54    /// # Returns
55    /// The converted Markdown content as a String.
56    pub fn convert<P: AsRef<Path>>(&self, path: P) -> Result<String> {
57        let path = path.as_ref();
58
59        // Parse DOCX file
60        let docx_file =
61            DocxFile::from_file(path).map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
62        let docx = docx_file
63            .parse()
64            .map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
65
66        // Initialize image extractor based on options
67        let mut image_extractor = match &self.options.image_handling {
68            ImageHandling::SaveToDir(dir) => ImageExtractor::new_with_dir(path, dir.clone())?,
69            ImageHandling::Inline => ImageExtractor::new_inline(path)?,
70            ImageHandling::Skip => ImageExtractor::new_skip(),
71        };
72
73        self.convert_inner(&docx, &mut image_extractor)
74    }
75
76    /// Converts a DOCX file from bytes to Markdown.
77    ///
78    /// # Arguments
79    /// * `bytes` - The DOCX file content as bytes
80    ///
81    /// # Returns
82    /// The converted Markdown content as a String.
83    pub fn convert_from_bytes(&self, bytes: &[u8]) -> Result<String> {
84        let reader = std::io::Cursor::new(bytes);
85        let docx_file =
86            DocxFile::from_reader(reader).map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
87        let docx = docx_file
88            .parse()
89            .map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
90
91        // Initialize image extractor based on options
92        let mut image_extractor = match &self.options.image_handling {
93            ImageHandling::SaveToDir(dir) => {
94                ImageExtractor::new_with_dir_from_bytes(bytes, dir.clone())?
95            }
96            ImageHandling::Inline => ImageExtractor::new_inline_from_bytes(bytes)?,
97            ImageHandling::Skip => ImageExtractor::new_skip(),
98        };
99
100        self.convert_inner(&docx, &mut image_extractor)
101    }
102
103    fn convert_inner<'a>(
104        &'a self,
105        docx: &'a rs_docx::Docx,
106        image_extractor: &'a mut ImageExtractor,
107    ) -> Result<String> {
108        // Build relationship map for hyperlinks
109        let rels = self.build_relationship_map(&docx);
110
111        // Initialize numbering resolver
112        let mut numbering_resolver = NumberingResolver::new(&docx);
113
114        // Initialize style resolver
115        let style_resolver = StyleResolver::new(&docx.styles);
116
117        // Convert body content
118        let mut output = String::new();
119        let mut context = ConversionContext {
120            rels: &rels,
121            numbering: &mut numbering_resolver,
122            image_extractor,
123            options: &self.options,
124            footnotes: Vec::new(),
125            endnotes: Vec::new(),
126            comments: Vec::new(),
127            docx_comments: docx.comments.as_ref(),
128            docx_footnotes: docx.footnotes.as_ref(),
129            docx_endnotes: docx.endnotes.as_ref(),
130            styles: &docx.styles,
131            style_resolver: &style_resolver,
132        };
133
134        for content in &docx.document.body.content {
135            output.push_str(&Self::convert_content(content, &mut context)?);
136        }
137
138        // Add footnotes/endnotes/comments if any
139        if !context.footnotes.is_empty()
140            || !context.endnotes.is_empty()
141            || !context.comments.is_empty()
142        {
143            output.push_str("\n\n---\n\n");
144            for (i, note) in context.footnotes.iter().enumerate() {
145                output.push_str(&format!("[^{}]: {}\n", i + 1, note));
146            }
147            for (i, note) in context.endnotes.iter().enumerate() {
148                output.push_str(&format!("[^en{}]: {}\n", i + 1, note));
149            }
150            for (id, text) in context.comments.iter() {
151                output.push_str(&format!("[^c{}]: {}\n", id, text));
152            }
153        }
154
155        Ok(output)
156    }
157
158    fn convert_content(content: &BodyContent, context: &mut ConversionContext) -> Result<String> {
159        let mut output = String::new();
160        match content {
161            BodyContent::Paragraph(para) => {
162                let converted = ParagraphConverter::convert(para, context)?;
163                if !converted.is_empty() {
164                    output.push_str(&converted);
165                    output.push_str("\n\n");
166                }
167            }
168            BodyContent::Table(table) => {
169                let converted = TableConverter::convert(table, context)?;
170                output.push_str(&converted);
171                output.push_str("\n\n");
172            }
173            BodyContent::Sdt(sdt) => {
174                if let Some(sdt_content) = &sdt.content {
175                    for child in &sdt_content.content {
176                        output.push_str(&Self::convert_content(child, context)?);
177                    }
178                }
179            }
180            BodyContent::BookmarkStart(bookmark) => {
181                if let Some(name) = &bookmark.name {
182                    output.push_str(&format!("<a id=\"{}\"></a>", name));
183                }
184            }
185            _ => {}
186        }
187        Ok(output)
188    }
189
190    fn build_relationship_map<'a>(&self, docx: &'a rs_docx::Docx) -> HashMap<String, String> {
191        let mut rels = HashMap::new();
192
193        if let Some(doc_rels) = &docx.document_rels {
194            for rel in &doc_rels.relationships {
195                rels.insert(rel.id.to_string(), rel.target.to_string());
196            }
197        }
198
199        rels
200    }
201}
202
/// Context passed through conversion for shared state.
///
/// One context is created per document and handed by mutable reference to the
/// individual converters, which read the document-wide lookups and append the
/// notes they encounter.
pub struct ConversionContext<'a> {
    /// Relationship map (rId -> target URL/path)
    pub rels: &'a HashMap<String, String>,
    /// Numbering resolver for lists
    // NOTE(review): the resolver's own lifetime parameter is the same `'a` as the
    // mutable borrow, so the resolver stays mutably borrowed for the whole of
    // `'a` — confirm this coupling is intended.
    pub numbering: &'a mut NumberingResolver<'a>,
    /// Image extractor
    pub image_extractor: &'a mut ImageExtractor,
    /// Conversion options
    pub options: &'a ConvertOptions,
    /// Collected footnotes, in the order they were added. After the body they
    /// are written out as `[^N]: …` lines, where `N` is the 1-based position.
    pub footnotes: Vec<String>,
    /// Collected endnotes, in the order they were added. After the body they
    /// are written out as `[^enN]: …` lines, where `N` is the 1-based position.
    pub endnotes: Vec<String>,
    /// Collected comments (id, text). After the body they are written out as
    /// `[^c<id>]: …` lines.
    pub comments: Vec<(String, String)>,
    /// Document comments reference; `None` when the parsed document has no comments.
    pub docx_comments: Option<&'a rs_docx::document::Comments<'a>>,
    /// Document footnotes reference; `None` when the parsed document has no footnotes.
    pub docx_footnotes: Option<&'a rs_docx::document::FootNotes<'a>>,
    /// Document endnotes reference; `None` when the parsed document has no endnotes.
    pub docx_endnotes: Option<&'a rs_docx::document::EndNotes<'a>>,
    /// Document styles
    pub styles: &'a rs_docx::styles::Styles<'a>,
    /// Style resolver built over the document styles
    pub style_resolver: &'a StyleResolver<'a>,
}
230
#[cfg(test)]
mod tests {
    use super::*;
    use rs_docx::document::{
        BodyContent, BookmarkStart, Paragraph, ParagraphContent, Run, RunContent, SDTContent,
        Text, SDT,
    };
    use std::borrow::Cow;
    use std::collections::HashMap;

    /// A structured document tag holding a bookmark followed by a paragraph must
    /// produce both the anchor markup and the paragraph text.
    #[test]
    fn test_convert_content_sdt_with_bookmark() {
        // Fixture: bookmark named "TestAnchor"
        let mut anchor = BookmarkStart::default();
        anchor.name = Some(Cow::Borrowed("TestAnchor"));

        // Fixture: paragraph with a single text run
        let mut text_run = Run::default();
        text_run.content.push(RunContent::Text(Text {
            text: "Content".into(),
            ..Default::default()
        }));
        let mut paragraph = Paragraph::default();
        paragraph.content.push(ParagraphContent::Run(text_run));

        // Fixture: SDT wrapping the bookmark and the paragraph, in that order
        let mut inner = SDTContent::default();
        inner.content.extend([
            BodyContent::BookmarkStart(anchor),
            BodyContent::Paragraph(paragraph),
        ]);
        let mut wrapper = SDT::default();
        wrapper.content = Some(inner);
        let body = BodyContent::Sdt(wrapper);

        // Minimal document parts backing the conversion context
        let styles = rs_docx::styles::Styles::new();
        let docx = rs_docx::Docx::default();

        let mut numbering_resolver = NumberingResolver::new(&docx);
        let mut image_extractor = ImageExtractor::new_skip();
        let options = ConvertOptions::default();
        let rels = HashMap::new();
        let style_resolver = StyleResolver::new(&styles);

        let mut ctx = ConversionContext {
            rels: &rels,
            numbering: &mut numbering_resolver,
            image_extractor: &mut image_extractor,
            options: &options,
            footnotes: Vec::new(),
            endnotes: Vec::new(),
            comments: Vec::new(),
            docx_comments: None,
            docx_footnotes: None,
            docx_endnotes: None,
            styles: &styles,
            style_resolver: &style_resolver,
        };

        // Act
        let markdown = DocxToMarkdown::convert_content(&body, &mut ctx).unwrap();

        // Assert
        assert!(markdown.contains("<a id=\"TestAnchor\"></a>"));
        assert!(markdown.contains("Content"));
    }
}