Skip to main content

rdocx_html/
lib.rs

1//! DOCX-to-HTML and DOCX-to-Markdown conversion.
2//!
3//! Works directly from semantic OXML types — no layout engine needed.
4
5mod css;
6mod emitter;
7mod markdown;
8
9use std::collections::HashMap;
10
11use rdocx_oxml::document::CT_Document;
12use rdocx_oxml::numbering::CT_Numbering;
13use rdocx_oxml::styles::CT_Styles;
14
/// Options controlling HTML conversion.
///
/// Obtain the defaults with [`HtmlOptions::default`] and override individual fields
/// as needed. The type is comparable so option sets can be checked for equality.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HtmlOptions {
    /// Whether to inline images as base64 data URIs (default: true).
    pub inline_images: bool,
}
21
22impl Default for HtmlOptions {
23    fn default() -> Self {
24        Self {
25            inline_images: true,
26        }
27    }
28}
29
/// Input for HTML conversion.
///
/// Bundles the parsed document parts read by [`to_html_document`],
/// [`to_html_fragment`] and [`to_markdown`]; all three borrow it immutably.
pub struct HtmlInput {
    /// Parsed main document; the converters read its `body`.
    pub document: CT_Document,
    /// Style definitions, handed to the emitters alongside the body.
    pub styles: CT_Styles,
    /// Numbering definitions, if available; passed to the emitters as an `Option`.
    pub numbering: Option<CT_Numbering>,
    /// Images keyed by embed/relationship ID.
    ///
    /// Only the HTML converters forward this map; [`to_markdown`] does not use it.
    pub images: HashMap<String, ImageData>,
    /// Hyperlink URLs keyed by relationship ID.
    pub hyperlink_urls: HashMap<String, String>,
}
40
/// Image data for HTML embedding.
///
/// Derives the same basic traits as the other public types in this module so it can
/// be logged, duplicated and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageData {
    /// Raw image bytes.
    pub data: Vec<u8>,
    /// Content type of `data`.
    // NOTE(review): presumably a MIME type such as `image/png` — confirm against the emitter.
    pub content_type: String,
}
46
47/// Convert a DOCX document to a complete HTML document string.
48pub fn to_html_document(input: &HtmlInput, options: &HtmlOptions) -> String {
49    let body = to_html_fragment(input, options);
50    let css = css::generate_base_css();
51    format!(
52        "<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n<style>\n{css}\n</style>\n</head>\n<body>\n{body}\n</body>\n</html>"
53    )
54}
55
56/// Convert a DOCX document to an HTML fragment (body content only).
57pub fn to_html_fragment(input: &HtmlInput, options: &HtmlOptions) -> String {
58    emitter::emit_body(
59        &input.document.body,
60        &input.styles,
61        input.numbering.as_ref(),
62        &input.images,
63        &input.hyperlink_urls,
64        options,
65    )
66}
67
68/// Convert a DOCX document to Markdown.
69pub fn to_markdown(input: &HtmlInput) -> String {
70    markdown::emit_markdown(
71        &input.document.body,
72        &input.styles,
73        input.numbering.as_ref(),
74        &input.hyperlink_urls,
75    )
76}
77
#[cfg(test)]
mod tests {
    use super::*;
    use rdocx_oxml::document::{BodyContent, CT_Document};
    use rdocx_oxml::styles::CT_Styles;
    use rdocx_oxml::text::CT_P;

    /// Wraps `document` in an `HtmlInput` with default styles and no numbering,
    /// images or hyperlinks, so each test only spells out the body it cares about.
    fn input_for(document: CT_Document) -> HtmlInput {
        HtmlInput {
            document,
            styles: CT_Styles::new_default(),
            numbering: None,
            images: HashMap::new(),
            hyperlink_urls: HashMap::new(),
        }
    }

    /// Builds an input whose body is a single unstyled paragraph containing `text`.
    fn simple_input(text: &str) -> HtmlInput {
        let mut doc = CT_Document::new();
        let mut p = CT_P::new();
        p.add_run(text);
        doc.body.add_paragraph(p);
        input_for(doc)
    }

    /// Builds an input whose body is a single paragraph containing `text` with the
    /// paragraph style `style_id` applied.
    fn styled_input(text: &str, style_id: &str) -> HtmlInput {
        let mut doc = CT_Document::new();
        let mut p = CT_P::new();
        p.add_run(text);
        p.properties = Some(rdocx_oxml::properties::CT_PPr {
            style_id: Some(style_id.to_string()),
            ..Default::default()
        });
        doc.body.add_paragraph(p);
        input_for(doc)
    }

    #[test]
    fn html_document_basic() {
        let input = simple_input("Hello, World!");
        let html = to_html_document(&input, &HtmlOptions::default());
        assert!(html.contains("<!DOCTYPE html>"));
        assert!(html.contains("Hello, World!"));
        assert!(html.contains("<p"));
    }

    #[test]
    fn html_fragment_basic() {
        let input = simple_input("Test paragraph");
        let html = to_html_fragment(&input, &HtmlOptions::default());
        assert!(html.contains("Test paragraph"));
        assert!(html.contains("<p"));
        assert!(!html.contains("<!DOCTYPE"));
    }

    #[test]
    fn markdown_basic() {
        let input = simple_input("Test paragraph");
        let md = to_markdown(&input);
        assert!(md.contains("Test paragraph"));
    }

    #[test]
    fn html_heading() {
        let input = styled_input("Chapter 1", "Heading1");

        let html = to_html_fragment(&input, &HtmlOptions::default());
        assert!(html.contains("<h1"));
        assert!(html.contains("Chapter 1"));
    }

    #[test]
    fn html_table() {
        // One table with a single row and a single cell holding one paragraph.
        let mut doc = CT_Document::new();
        let mut tbl = rdocx_oxml::table::CT_Tbl::new();
        let mut row = rdocx_oxml::table::CT_Row::new();
        let mut cell = rdocx_oxml::table::CT_Tc::new();
        let mut p = CT_P::new();
        p.add_run("Cell text");
        cell.content = vec![rdocx_oxml::table::CellContent::Paragraph(p)];
        row.cells.push(cell);
        tbl.rows.push(row);
        doc.body.content.push(BodyContent::Table(tbl));

        let input = input_for(doc);

        let html = to_html_fragment(&input, &HtmlOptions::default());
        assert!(html.contains("<table"));
        assert!(html.contains("<td"));
        assert!(html.contains("Cell text"));
    }

    #[test]
    fn markdown_heading() {
        let input = styled_input("Title", "Heading1");

        let md = to_markdown(&input);
        assert!(md.contains("# Title"));
    }

    #[test]
    fn html_bold_italic() {
        let mut doc = CT_Document::new();
        let mut p = CT_P::new();
        let mut r = rdocx_oxml::text::CT_R::new("bold text");
        r.properties = Some(rdocx_oxml::properties::CT_RPr {
            bold: Some(true),
            italic: Some(true),
            ..Default::default()
        });
        p.runs.push(r);
        doc.body.add_paragraph(p);

        let input = input_for(doc);

        let html = to_html_fragment(&input, &HtmlOptions::default());
        assert!(html.contains("<strong"));
        assert!(html.contains("<em"));
        assert!(html.contains("bold text"));
    }
}