edgeparse_core/output/
docx.rs1use crate::models::content::ContentElement;
7use crate::models::document::PdfDocument;
8use crate::models::table::TableTokenRow;
9use crate::EdgePdfError;
10use std::io::{Cursor, Write};
11use zip::write::SimpleFileOptions;
12use zip::ZipWriter;
13
14pub fn to_docx(doc: &PdfDocument) -> Result<Vec<u8>, EdgePdfError> {
19 let buffer = Cursor::new(Vec::new());
20 let mut zip = ZipWriter::new(buffer);
21 let options = SimpleFileOptions::default();
22
23 zip.start_file("[Content_Types].xml", options)
25 .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
26 zip.write_all(CONTENT_TYPES_XML.as_bytes())
27 .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
28
29 zip.start_file("_rels/.rels", options)
31 .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
32 zip.write_all(RELS_XML.as_bytes())
33 .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
34
35 zip.start_file("word/_rels/document.xml.rels", options)
37 .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
38 zip.write_all(DOC_RELS_XML.as_bytes())
39 .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
40
41 let body_xml = build_document_xml(doc);
43 zip.start_file("word/document.xml", options)
44 .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
45 zip.write_all(body_xml.as_bytes())
46 .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
47
48 let cursor = zip
49 .finish()
50 .map_err(|e| EdgePdfError::OutputError(e.to_string()))?;
51 Ok(cursor.into_inner())
52}
53
54fn build_document_xml(doc: &PdfDocument) -> String {
56 let mut body = String::new();
57
58 if doc.kids.is_empty() {
59 body.push_str(&make_paragraph("No content extracted."));
60 } else {
61 for element in &doc.kids {
62 render_element(&mut body, element);
63 }
64 }
65
66 format!(
67 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
68<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
69 xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
70<w:body>
71{body}
72</w:body>
73</w:document>"#
74 )
75}
76
77fn render_element(out: &mut String, element: &ContentElement) {
78 match element {
79 ContentElement::Heading(h) => {
80 let level = h.heading_level.unwrap_or(1).clamp(1, 6);
81 let text = xml_escape(&h.base.base.value());
82 out.push_str(&make_heading(&text, level));
83 }
84 ContentElement::Paragraph(p) => {
85 let text = xml_escape(&p.base.value());
86 let trimmed = text.trim();
87 if !trimmed.is_empty() {
88 out.push_str(&make_paragraph(trimmed));
89 }
90 }
91 ContentElement::List(list) => {
92 for item in &list.list_items {
93 let label = token_rows_text(&item.label.content);
94 let body = token_rows_text(&item.body.content);
95 let text = format!("{} {}", label.trim(), body.trim());
96 out.push_str(&make_paragraph(&xml_escape(&text)));
97 }
98 }
99 ContentElement::TextBlock(tb) => {
100 let text = xml_escape(&tb.value());
101 let trimmed = text.trim();
102 if !trimmed.is_empty() {
103 out.push_str(&make_paragraph(trimmed));
104 }
105 }
106 _ => {}
107 }
108}
109
110fn token_rows_text(rows: &[TableTokenRow]) -> String {
111 rows.iter()
112 .flat_map(|row| row.iter())
113 .map(|token| token.base.value.as_str())
114 .collect::<Vec<_>>()
115 .join(" ")
116}
117
/// Wraps `text` in a single-run WordprocessingML paragraph, terminated by a
/// newline.
///
/// `text` is inserted verbatim, so it must already be XML-escaped. The run
/// uses `xml:space="preserve"`, so leading and trailing whitespace in `text`
/// is kept.
fn make_paragraph(text: &str) -> String {
    const OPEN: &str = "<w:p><w:r><w:t xml:space=\"preserve\">";
    const CLOSE: &str = "</w:t></w:r></w:p>\n";

    [OPEN, text, CLOSE].concat()
}
121
/// Wraps `text` in a paragraph whose style id is `Heading{level}`, terminated
/// by a newline.
///
/// `text` is inserted verbatim, so it must already be XML-escaped. `level` is
/// used as given; callers are responsible for keeping it in a sensible range.
///
/// NOTE(review): the parts written by `to_docx` do not include a styles part,
/// so how `HeadingN` is resolved is left to the consuming application —
/// confirm this renders as intended.
fn make_heading(text: &str, level: u32) -> String {
    let mut xml = String::from("<w:p><w:pPr><w:pStyle w:val=\"Heading");
    xml.push_str(&level.to_string());
    xml.push_str("\"/></w:pPr><w:r><w:t xml:space=\"preserve\">");
    xml.push_str(text);
    xml.push_str("</w:t></w:r></w:p>\n");
    xml
}
127
/// Escapes `text` for embedding in XML character data or a double-quoted
/// attribute value.
///
/// Replaces `&`, `<`, `>` and `"` with `&amp;`, `&lt;`, `&gt;` and `&quot;`
/// respectively; every other character is copied through unchanged. The
/// input is always treated as raw text, so an existing entity such as `&lt;`
/// comes out as `&amp;lt;`.
fn xml_escape(text: &str) -> String {
    // Single pass: each input character is inspected exactly once, so the
    // `&` introduced by one replacement can never be escaped again.
    let mut escaped = String::with_capacity(text.len());
    for ch in text.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '"' => escaped.push_str("&quot;"),
            other => escaped.push(other),
        }
    }
    escaped
}
134
/// Contents of the package's `[Content_Types].xml` part, written verbatim by
/// `to_docx`.
///
/// Declares default content types for the `rels` and `xml` extensions and an
/// override identifying `/word/document.xml` as the WordprocessingML main
/// document part.
const CONTENT_TYPES_XML: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
 <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
 <Default Extension="xml" ContentType="application/xml"/>
 <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>"#;
141
/// Contents of the package-level `_rels/.rels` part, written verbatim by
/// `to_docx`.
///
/// Holds a single relationship (`rId1`) of the `officeDocument` type whose
/// target is `word/document.xml`.
const RELS_XML: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
 <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>"#;
146
/// Contents of `word/_rels/document.xml.rels`, written verbatim by `to_docx`.
///
/// The relationship list is empty: the generated document part is not linked
/// to any other part.
const DOC_RELS_XML: &str = r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
</Relationships>"#;
150
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty document must still produce a non-trivial zip archive.
    #[test]
    fn test_empty_doc_produces_valid_docx() {
        let doc = PdfDocument::new("test.pdf".to_string());
        let bytes = to_docx(&doc).unwrap();
        assert!(bytes.len() > 100);
        // Zip archives start with the `PK` signature.
        assert_eq!(&bytes[0..2], b"PK");
    }

    /// Markup-significant characters must come out as entities, not unchanged.
    #[test]
    fn test_xml_escape() {
        assert_eq!(xml_escape("a & b"), "a &amp; b");
        assert_eq!(xml_escape("<tag>"), "&lt;tag&gt;");
        assert_eq!(xml_escape("say \"hi\""), "say &quot;hi&quot;");
        // Input is raw text: an existing entity is escaped again.
        assert_eq!(xml_escape("&lt;"), "&amp;lt;");
        assert_eq!(xml_escape("plain"), "plain");
    }

    /// The paragraph wrapper embeds the text in a whitespace-preserving run.
    #[test]
    fn test_make_paragraph() {
        let p = make_paragraph("Hello");
        assert!(p.contains("<w:t xml:space=\"preserve\">Hello</w:t>"));
    }

    /// The heading wrapper references the `HeadingN` style and embeds the text.
    #[test]
    fn test_make_heading() {
        let h = make_heading("Title", 1);
        assert!(h.contains("Heading1"));
        assert!(h.contains("Title"));
    }
}