1mod hyperlink;
4mod image;
5mod numbering;
6mod paragraph;
7mod run;
8
9mod styles;
10mod table;
11
12use crate::{error::Error, ConvertOptions, ImageHandling, Result};
13use rs_docx::document::BodyContent;
14use rs_docx::DocxFile;
15use std::collections::HashMap;
16use std::path::Path;
17
18pub use self::hyperlink::resolve_hyperlink;
19pub use self::image::ImageExtractor;
20pub use self::numbering::NumberingResolver;
21pub use self::paragraph::ParagraphConverter;
22pub use self::run::RunConverter;
23pub use self::styles::StyleResolver;
24pub use self::table::TableConverter;
25
/// Converter that turns DOCX documents into Markdown text.
///
/// Holds the [`ConvertOptions`] that are applied to every conversion run
/// through this instance.
pub struct DocxToMarkdown {
    /// Options consulted during conversion (for example how images are handled).
    options: ConvertOptions,
}
30
31impl DocxToMarkdown {
32 pub fn new(options: ConvertOptions) -> Self {
34 Self { options }
35 }
36
37 pub fn with_defaults() -> Self {
39 Self::new(ConvertOptions::default())
40 }
41
42 pub fn convert<P: AsRef<Path>>(&self, path: P) -> Result<String> {
57 let path = path.as_ref();
58
59 let docx_file =
61 DocxFile::from_file(path).map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
62 let docx = docx_file
63 .parse()
64 .map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
65
66 let mut image_extractor = match &self.options.image_handling {
68 ImageHandling::SaveToDir(dir) => ImageExtractor::new_with_dir(path, dir.clone())?,
69 ImageHandling::Inline => ImageExtractor::new_inline(path)?,
70 ImageHandling::Skip => ImageExtractor::new_skip(),
71 };
72
73 self.convert_inner(&docx, &mut image_extractor)
74 }
75
76 pub fn convert_from_bytes(&self, bytes: &[u8]) -> Result<String> {
84 let reader = std::io::Cursor::new(bytes);
85 let docx_file =
86 DocxFile::from_reader(reader).map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
87 let docx = docx_file
88 .parse()
89 .map_err(|e| Error::DocxParse(format!("{:?}", e)))?;
90
91 let mut image_extractor = match &self.options.image_handling {
93 ImageHandling::SaveToDir(dir) => {
94 ImageExtractor::new_with_dir_from_bytes(bytes, dir.clone())?
95 }
96 ImageHandling::Inline => ImageExtractor::new_inline_from_bytes(bytes)?,
97 ImageHandling::Skip => ImageExtractor::new_skip(),
98 };
99
100 self.convert_inner(&docx, &mut image_extractor)
101 }
102
103 fn convert_inner<'a>(
104 &'a self,
105 docx: &'a rs_docx::Docx,
106 image_extractor: &'a mut ImageExtractor,
107 ) -> Result<String> {
108 let rels = self.build_relationship_map(&docx);
110
111 let mut numbering_resolver = NumberingResolver::new(&docx);
113
114 let style_resolver = StyleResolver::new(&docx.styles);
116
117 let mut output = String::new();
119 let mut context = ConversionContext {
120 rels: &rels,
121 numbering: &mut numbering_resolver,
122 image_extractor,
123 options: &self.options,
124 footnotes: Vec::new(),
125 endnotes: Vec::new(),
126 comments: Vec::new(),
127 docx_comments: docx.comments.as_ref(),
128 docx_footnotes: docx.footnotes.as_ref(),
129 docx_endnotes: docx.endnotes.as_ref(),
130 styles: &docx.styles,
131 style_resolver: &style_resolver,
132 };
133
134 for content in &docx.document.body.content {
135 output.push_str(&Self::convert_content(content, &mut context)?);
136 }
137
138 if !context.footnotes.is_empty()
140 || !context.endnotes.is_empty()
141 || !context.comments.is_empty()
142 {
143 output.push_str("\n\n---\n\n");
144 for (i, note) in context.footnotes.iter().enumerate() {
145 output.push_str(&format!("[^{}]: {}\n", i + 1, note));
146 }
147 for (i, note) in context.endnotes.iter().enumerate() {
148 output.push_str(&format!("[^en{}]: {}\n", i + 1, note));
149 }
150 for (id, text) in context.comments.iter() {
151 output.push_str(&format!("[^c{}]: {}\n", id, text));
152 }
153 }
154
155 Ok(output)
156 }
157
158 fn convert_content(content: &BodyContent, context: &mut ConversionContext) -> Result<String> {
159 let mut output = String::new();
160 match content {
161 BodyContent::Paragraph(para) => {
162 let converted = ParagraphConverter::convert(para, context)?;
163 if !converted.is_empty() {
164 output.push_str(&converted);
165 output.push_str("\n\n");
166 }
167 }
168 BodyContent::Table(table) => {
169 let converted = TableConverter::convert(table, context)?;
170 output.push_str(&converted);
171 output.push_str("\n\n");
172 }
173 BodyContent::Sdt(sdt) => {
174 if let Some(sdt_content) = &sdt.content {
175 for child in &sdt_content.content {
176 output.push_str(&Self::convert_content(child, context)?);
177 }
178 }
179 }
180 BodyContent::BookmarkStart(bookmark) => {
181 if let Some(name) = &bookmark.name {
182 output.push_str(&format!("<a id=\"{}\"></a>", name));
183 }
184 }
185 _ => {}
186 }
187 Ok(output)
188 }
189
190 fn build_relationship_map<'a>(&self, docx: &'a rs_docx::Docx) -> HashMap<String, String> {
191 let mut rels = HashMap::new();
192
193 if let Some(doc_rels) = &docx.document_rels {
194 for rel in &doc_rels.relationships {
195 rels.insert(rel.id.to_string(), rel.target.to_string());
196 }
197 }
198
199 rels
200 }
201}
202
/// Shared state handed to the paragraph and table converters while a document
/// body is being converted.
pub struct ConversionContext<'a> {
    /// Relationship id → target lookup built from the document's relationships.
    pub rels: &'a HashMap<String, String>,
    /// Numbering resolver created from the parsed document; mutable because
    /// it is borrowed mutably for the duration of the conversion.
    pub numbering: &'a mut NumberingResolver<'a>,
    /// Image extractor selected from `ConvertOptions::image_handling`.
    pub image_extractor: &'a mut ImageExtractor,
    /// Options of the converter that started this conversion.
    pub options: &'a ConvertOptions,
    /// Footnote texts collected so far; emitted after the body as `[^N]: text`,
    /// numbered from 1 in collection order.
    /// NOTE(review): presumably filled by the paragraph/run converters — confirm.
    pub footnotes: Vec<String>,
    /// Endnote texts collected so far; emitted after the body as `[^enN]: text`,
    /// numbered from 1 in collection order.
    pub endnotes: Vec<String>,
    /// Collected comments as `(id, text)` pairs; emitted as `[^c<id>]: text`.
    pub comments: Vec<(String, String)>,
    /// Comments part of the parsed document, if it has one.
    pub docx_comments: Option<&'a rs_docx::document::Comments<'a>>,
    /// Footnotes part of the parsed document, if it has one.
    pub docx_footnotes: Option<&'a rs_docx::document::FootNotes<'a>>,
    /// Endnotes part of the parsed document, if it has one.
    pub docx_endnotes: Option<&'a rs_docx::document::EndNotes<'a>>,
    /// Style definitions of the parsed document.
    pub styles: &'a rs_docx::styles::Styles<'a>,
    /// Style resolver built from the same style definitions as `styles`.
    pub style_resolver: &'a StyleResolver<'a>,
}
230
#[cfg(test)]
mod tests {
    use super::*;
    use rs_docx::document::{BodyContent, BookmarkStart, Paragraph, SDTContent, SDT};
    use rs_docx::document::{ParagraphContent, Run, RunContent, Text};
    use std::borrow::Cow;
    use std::collections::HashMap;

    /// A structured document tag that wraps a bookmark followed by a paragraph
    /// must produce both the anchor tag and the paragraph text.
    #[test]
    fn test_convert_content_sdt_with_bookmark() {
        // Fragment under test: SDT [ bookmark "TestAnchor", paragraph "Content" ].
        let anchor = {
            let mut start = BookmarkStart::default();
            start.name = Some(Cow::Borrowed("TestAnchor"));
            BodyContent::BookmarkStart(start)
        };
        let body_paragraph = {
            let mut text_run = Run::default();
            text_run.content.push(RunContent::Text(Text {
                text: "Content".into(),
                ..Default::default()
            }));
            let mut paragraph = Paragraph::default();
            paragraph.content.push(ParagraphContent::Run(text_run));
            BodyContent::Paragraph(paragraph)
        };

        let mut inner = SDTContent::default();
        inner.content.push(anchor);
        inner.content.push(body_paragraph);

        let mut wrapper = SDT::default();
        wrapper.content = Some(inner);
        let block = BodyContent::Sdt(wrapper);

        // Minimal conversion context backed by an empty document.
        let styles = rs_docx::styles::Styles::new();
        let docx = rs_docx::Docx::default();

        let mut numbering_resolver = NumberingResolver::new(&docx);
        let mut image_extractor = ImageExtractor::new_skip();
        let options = ConvertOptions::default();
        let rels = HashMap::new();
        let style_resolver = StyleResolver::new(&styles);

        let mut context = ConversionContext {
            rels: &rels,
            numbering: &mut numbering_resolver,
            image_extractor: &mut image_extractor,
            options: &options,
            footnotes: Vec::new(),
            endnotes: Vec::new(),
            comments: Vec::new(),
            docx_comments: None,
            docx_footnotes: None,
            docx_endnotes: None,
            styles: &styles,
            style_resolver: &style_resolver,
        };

        let markdown = DocxToMarkdown::convert_content(&block, &mut context).unwrap();

        assert!(markdown.contains("<a id=\"TestAnchor\"></a>"));
        assert!(markdown.contains("Content"));
    }
}