Skip to main content

edgeparse_core/output/
docx.rs

1//! DOCX output generator (minimal stub).
2//!
3//! Produces a minimal Open XML (OOXML) .docx file as raw bytes.
4//! The DOCX format is a Zip archive containing XML parts.
5
6use crate::models::content::ContentElement;
7use crate::models::document::PdfDocument;
8use crate::models::table::TableTokenRow;
9use crate::EdgePdfError;
10use std::io::{Cursor, Write};
11use zip::write::SimpleFileOptions;
12use zip::ZipWriter;
13
14/// Generate a DOCX file as bytes from a PdfDocument.
15///
16/// # Errors
17/// Returns `EdgePdfError::OutputError` on write failures.
18pub fn to_docx(doc: &PdfDocument) -> Result<Vec<u8>, EdgePdfError> {
19    let buffer = Cursor::new(Vec::new());
20    let mut zip = ZipWriter::new(buffer);
21    let options = SimpleFileOptions::default();
22
23    // Content Types
24    zip.start_file("[Content_Types].xml", options)
25        .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
26    zip.write_all(CONTENT_TYPES_XML.as_bytes())
27        .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
28
29    // Relationships
30    zip.start_file("_rels/.rels", options)
31        .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
32    zip.write_all(RELS_XML.as_bytes())
33        .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
34
35    // Document relationships
36    zip.start_file("word/_rels/document.xml.rels", options)
37        .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
38    zip.write_all(DOC_RELS_XML.as_bytes())
39        .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
40
41    // Main document body
42    let body_xml = build_document_xml(doc);
43    zip.start_file("word/document.xml", options)
44        .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
45    zip.write_all(body_xml.as_bytes())
46        .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
47
48    let cursor = zip
49        .finish()
50        .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
51    Ok(cursor.into_inner())
52}
53
54/// Build the word/document.xml content.
55fn build_document_xml(doc: &PdfDocument) -> String {
56    let mut body = String::new();
57
58    if doc.kids.is_empty() {
59        body.push_str(&make_paragraph("No content extracted."));
60    } else {
61        for element in &doc.kids {
62            render_element(&mut body, element);
63        }
64    }
65
66    format!(
67        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
68<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
69            xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
70<w:body>
71{body}
72</w:body>
73</w:document>"#
74    )
75}
76
77fn render_element(out: &mut String, element: &ContentElement) {
78    match element {
79        ContentElement::Heading(h) => {
80            let level = h.heading_level.unwrap_or(1).clamp(1, 6);
81            let text = xml_escape(&h.base.base.value());
82            out.push_str(&make_heading(&text, level));
83        }
84        ContentElement::Paragraph(p) => {
85            let text = xml_escape(&p.base.value());
86            let trimmed = text.trim();
87            if !trimmed.is_empty() {
88                out.push_str(&make_paragraph(trimmed));
89            }
90        }
91        ContentElement::List(list) => {
92            for item in &list.list_items {
93                let label = token_rows_text(&item.label.content);
94                let body = token_rows_text(&item.body.content);
95                let text = format!("{} {}", label.trim(), body.trim());
96                out.push_str(&make_paragraph(&xml_escape(&text)));
97            }
98        }
99        ContentElement::TextBlock(tb) => {
100            let text = xml_escape(&tb.value());
101            let trimmed = text.trim();
102            if !trimmed.is_empty() {
103                out.push_str(&make_paragraph(trimmed));
104            }
105        }
106        _ => {}
107    }
108}
109
110fn token_rows_text(rows: &[TableTokenRow]) -> String {
111    rows.iter()
112        .flat_map(|row| row.iter())
113        .map(|token| token.base.value.as_str())
114        .collect::<Vec<_>>()
115        .join(" ")
116}
117
/// Wrap `text` in a single-run `<w:p>` paragraph, terminated by a newline.
///
/// `text` is inserted verbatim, so the caller must have XML-escaped it.
fn make_paragraph(text: &str) -> String {
    const OPEN: &str = r#"<w:p><w:r><w:t xml:space="preserve">"#;
    const CLOSE: &str = "</w:t></w:r></w:p>\n";

    let mut xml = String::with_capacity(OPEN.len() + text.len() + CLOSE.len());
    xml.push_str(OPEN);
    xml.push_str(text);
    xml.push_str(CLOSE);
    xml
}
121
/// Build a `<w:p>` paragraph styled as `Heading{level}`, terminated by a
/// newline.
///
/// `text` is inserted verbatim, so the caller must have XML-escaped it.
fn make_heading(text: &str, level: u32) -> String {
    let properties = format!(r#"<w:pPr><w:pStyle w:val="Heading{level}"/></w:pPr>"#);
    let run = format!(r#"<w:r><w:t xml:space="preserve">{text}</w:t></w:r>"#);
    ["<w:p>", properties.as_str(), run.as_str(), "</w:p>\n"].concat()
}
127
/// Make `text` safe to embed in XML character data or attribute values.
///
/// `&`, `<`, `>` and `"` are replaced by their predefined entities.
/// Characters that XML 1.0 does not permit anywhere in a document — C0
/// control characters other than tab, line feed and carriage return, plus
/// U+FFFE and U+FFFF — are dropped. They cannot be represented even as
/// character references, and a single one makes the whole part ill-formed.
fn xml_escape(text: &str) -> String {
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            // The only C0 controls the XML 1.0 `Char` production allows.
            '\t' | '\n' | '\r' => escaped.push(ch),
            // Forbidden in XML 1.0: strip rather than emit a broken document.
            '\u{0}'..='\u{1F}' | '\u{FFFE}' | '\u{FFFF}' => {}
            _ => escaped.push(ch),
        }
    }
    escaped
}
134
/// Contents of the `[Content_Types].xml` part: default content types for the
/// `rels` and `xml` extensions, plus an override marking
/// `/word/document.xml` as the main WordprocessingML document.
const CONTENT_TYPES_XML: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
  <Default Extension="xml" ContentType="application/xml"/>
  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#;
141
/// Contents of the package-level `_rels/.rels` part: a single `officeDocument`
/// relationship (`rId1`) targeting `word/document.xml`.
const RELS_XML: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#;
146
/// Contents of the `word/_rels/document.xml.rels` part: an empty
/// relationships list.
const DOC_RELS_XML: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#;
150
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document must still serialize to a non-trivial ZIP container.
    #[test]
    fn test_empty_doc_produces_valid_docx() {
        let empty = PdfDocument::new("test.pdf".to_string());
        let archive = to_docx(&empty).unwrap();
        assert!(archive.len() > 100);
        // A DOCX file is a ZIP archive, which opens with the "PK" signature.
        assert!(archive.starts_with(b"PK"));
    }

    /// Ampersands and angle brackets are replaced by entities.
    #[test]
    fn test_xml_escape() {
        let cases = [("a & b", "a &amp; b"), ("<tag>", "&lt;tag&gt;")];
        for (raw, expected) in cases {
            assert_eq!(xml_escape(raw), expected);
        }
    }

    /// The paragraph text lands inside a whitespace-preserving `<w:t>` run.
    #[test]
    fn test_make_paragraph() {
        let xml = make_paragraph("Hello");
        assert!(xml.contains(r#"<w:t xml:space="preserve">Hello</w:t>"#));
    }

    /// The heading carries both its style name and its text.
    #[test]
    fn test_make_heading() {
        let xml = make_heading("Title", 1);
        for needle in ["Heading1", "Title"] {
            assert!(xml.contains(needle));
        }
    }
}