1pub mod error;
7pub mod layout;
8pub mod writer;
9
10pub use error::{DocxError, Result};
11pub use layout::{DocxImage, PageElement, Paragraph, Run, Table};
12
13use layout::analyze_page;
14use lopdf::Document;
15use pdf_extract::{extract_page_images, extract_text, ImageFilter};
16use writer::write_docx;
17
/// Upper bound on the number of pages a single conversion will process.
///
/// Both conversion paths clamp the document's page count to this value, so
/// pages past the limit are left out of the generated DOCX.
const MAX_DOCX_PAGES: u32 = 1_000;
21
22pub fn pdf_to_docx(doc: &Document) -> Result<Vec<u8>> {
26 pdf_to_docx_inner(doc, false)
27}
28
29pub fn pdf_to_docx_text_only(doc: &Document) -> Result<Vec<u8>> {
34 pdf_to_docx_sequential(doc)
35}
36
37fn pdf_to_docx_sequential(doc: &Document) -> Result<Vec<u8>> {
44 let pages = doc.get_pages();
45 let total_pages = pages.len() as u32;
46 let total_pages = total_pages.min(MAX_DOCX_PAGES);
47 let text_blocks = extract_text(doc);
48
49 let mut all_elements: Vec<Vec<PageElement>> = Vec::new();
50
51 for page_num in 1..=total_pages {
52 let page_blocks: Vec<_> = text_blocks
53 .iter()
54 .filter(|b| b.page == page_num)
55 .cloned()
56 .collect();
57
58 let elements: Vec<PageElement> = page_blocks
61 .iter()
62 .map(|b| {
63 PageElement::Para(layout::Paragraph {
64 runs: vec![layout::Run {
65 text: b.text.clone(),
66 font_name: String::new(),
67 font_size: b.font_size,
68 bold: false,
69 italic: false,
70 }],
71 })
72 })
73 .collect();
74
75 all_elements.push(elements);
76 }
77
78 let mut output = Vec::new();
79 write_docx(&all_elements, &[], &mut output)?;
80 Ok(output)
81}
82
83fn pdf_to_docx_inner(doc: &Document, skip_images: bool) -> Result<Vec<u8>> {
84 let pages = doc.get_pages();
85 let total_pages = pages.len() as u32;
86 let total_pages = total_pages.min(MAX_DOCX_PAGES);
87
88 let mut all_elements: Vec<Vec<PageElement>> = Vec::new();
89 let mut all_images: Vec<DocxImage> = Vec::new();
90
91 let text_blocks = extract_text(doc);
93
94 for page_num in 1..=total_pages {
95 let page_blocks: Vec<_> = text_blocks
97 .iter()
98 .filter(|b| b.page == page_num)
99 .cloned()
100 .collect();
101
102 let mut elements = analyze_page(&page_blocks);
104
105 if !skip_images {
107 if let Ok(images) = extract_page_images(doc, page_num) {
108 for img in images {
109 let (content_type, ext) = match img.filter {
110 ImageFilter::Jpeg => ("image/jpeg", "jpeg"),
111 _ => ("image/png", "png"),
112 };
113
114 let id = format!("image{}_{}.{}", page_num, all_images.len(), ext);
115
116 all_images.push(DocxImage {
117 data: img.data,
118 width: img.width,
119 height: img.height,
120 content_type: content_type.to_string(),
121 id: id.clone(),
122 });
123
124 elements.push(PageElement::Img(layout::DocxImage {
125 data: Vec::new(), width: img.width,
127 height: img.height,
128 content_type: content_type.to_string(),
129 id,
130 }));
131 }
132 }
133 }
134
135 all_elements.push(elements);
136 }
137
138 let mut output = Vec::new();
139 write_docx(&all_elements, &all_images, &mut output)?;
140 Ok(output)
141}
142
143pub fn convert_pdf_bytes_to_docx(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
145 let doc = Document::load_mem(pdf_bytes)?;
146 pdf_to_docx(&doc)
147}
148
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Document, Object, Stream};
    use std::io::Read;

    /// Builds a minimal one-page PDF whose only content stream is `content`.
    fn make_test_pdf(content: &[u8]) -> Document {
        let mut pdf = Document::with_version("1.7");

        let stream_id = pdf.add_object(Object::Stream(Stream::new(
            dictionary! {},
            content.to_vec(),
        )));

        let page_id = pdf.add_object(Object::Dictionary(dictionary! {
            "Type" => "Page",
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Contents" => Object::Reference(stream_id),
        }));

        let pages_id = pdf.add_object(Object::Dictionary(dictionary! {
            "Type" => "Pages",
            "Kids" => vec![Object::Reference(page_id)],
            "Count" => 1_i64,
        }));

        // The page can only point at its parent once the parent has an id.
        if let Ok(Object::Dictionary(page)) = pdf.get_object_mut(page_id) {
            page.set("Parent", Object::Reference(pages_id));
        }

        let catalog_id = pdf.add_object(Object::Dictionary(dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        }));
        pdf.trailer.set("Root", Object::Reference(catalog_id));

        pdf
    }

    /// Returns the entry `name` of the ZIP archive `data` as a string, or
    /// `None` if the archive, the entry, or the read fails.
    fn read_zip_entry(data: &[u8], name: &str) -> Option<String> {
        let mut archive = zip::ZipArchive::new(std::io::Cursor::new(data)).ok()?;
        let mut entry = archive.by_name(name).ok()?;
        let mut text = String::new();
        entry.read_to_string(&mut text).ok()?;
        Some(text)
    }

    /// Lists the entry names of the ZIP archive `data` in index order.
    fn zip_file_names(data: &[u8]) -> Vec<String> {
        let archive = zip::ZipArchive::new(std::io::Cursor::new(data)).unwrap();
        let mut names = Vec::with_capacity(archive.len());
        for index in 0..archive.len() {
            names.push(archive.name_for_index(index).unwrap().to_string());
        }
        names
    }

    /// Similarity in `[0, 1]`: one minus the Levenshtein distance divided by
    /// the longer input's length in chars. Two empty strings score `1.0`.
    fn levenshtein_similarity(a: &str, b: &str) -> f64 {
        let left: Vec<char> = a.chars().collect();
        let right: Vec<char> = b.chars().collect();
        let longest = left.len().max(right.len());
        if longest == 0 {
            return 1.0;
        }

        // Two-row dynamic programme: `above` is the previous row, `row` the current one.
        let mut above: Vec<usize> = (0..=right.len()).collect();
        let mut row = vec![0usize; right.len() + 1];
        for (i, &lc) in left.iter().enumerate() {
            row[0] = i + 1;
            for (j, &rc) in right.iter().enumerate() {
                let deletion = above[j + 1] + 1;
                let insertion = row[j] + 1;
                let substitution = above[j] + usize::from(lc != rc);
                row[j + 1] = deletion.min(insertion).min(substitution);
            }
            std::mem::swap(&mut above, &mut row);
        }

        1.0 - (above[right.len()] as f64 / longest as f64)
    }

    /// Converts a one-page test PDF with the given content stream to DOCX bytes.
    fn convert(content: &[u8]) -> Vec<u8> {
        pdf_to_docx(&make_test_pdf(content)).unwrap()
    }

    /// Converts a one-page test PDF and returns its `word/document.xml`.
    fn document_xml(content: &[u8]) -> String {
        read_zip_entry(&convert(content), "word/document.xml").unwrap()
    }

    /// Reads `xml` through to EOF and returns the message of the first parse
    /// error, or `None` when the whole input parses.
    fn first_xml_error(xml: &str) -> Option<String> {
        let mut reader = quick_xml::Reader::from_str(xml);
        let mut scratch = Vec::new();
        loop {
            match reader.read_event_into(&mut scratch) {
                Ok(quick_xml::events::Event::Eof) => return None,
                Err(e) => return Some(e.to_string()),
                _ => {}
            }
            scratch.clear();
        }
    }

    /// Collects the text found directly inside `w:t` elements of `xml`.
    fn collect_wt_text(xml: &str) -> Vec<String> {
        use quick_xml::events::Event;

        let mut reader = quick_xml::Reader::from_str(xml);
        let mut scratch = Vec::new();
        let mut texts = Vec::new();
        let mut inside_wt = false;
        loop {
            match reader.read_event_into(&mut scratch) {
                Ok(Event::Start(e)) => {
                    inside_wt = e.name().as_ref() == b"w:t";
                }
                Ok(Event::Text(e)) if inside_wt => {
                    texts.push(e.unescape().unwrap().to_string());
                }
                Ok(Event::End(_)) => {
                    inside_wt = false;
                }
                Ok(Event::Eof) => break,
                Err(e) => panic!("XML parse error: {e}"),
                _ => {}
            }
            scratch.clear();
        }
        texts
    }

    #[test]
    fn convert_simple_text_pdf() {
        let docx = convert(b"BT /F1 12 Tf (Hello World) Tj ET");
        assert!(docx.len() > 100);
        // The output is read back as a ZIP archive elsewhere in these tests;
        // here only its leading "PK" bytes are checked.
        assert_eq!(&docx[..2], b"PK");
    }

    #[test]
    fn convert_multiline_pdf() {
        let docx = convert(b"BT /F1 12 Tf 12 TL (Line 1) Tj T* (Line 2) Tj ET");
        assert!(docx.len() > 100);
    }

    #[test]
    fn convert_empty_pdf() {
        let docx = convert(b"");
        assert!(docx.len() > 100);
    }

    #[test]
    fn convert_from_bytes() {
        let mut doc = make_test_pdf(b"BT /F1 12 Tf (Test) Tj ET");
        let mut serialized = Vec::new();
        doc.save_to(&mut serialized).unwrap();

        let docx = convert_pdf_bytes_to_docx(&serialized).unwrap();
        assert!(docx.len() > 100);
    }

    #[test]
    fn docx_structure_has_required_files() {
        let docx = convert(b"BT /F1 12 Tf (Structure test) Tj ET");
        let names = zip_file_names(&docx);

        let required_parts = [
            "[Content_Types].xml",
            "_rels/.rels",
            "word/document.xml",
            "word/styles.xml",
            "word/_rels/document.xml.rels",
        ];
        for part in required_parts {
            assert!(names.contains(&part.to_string()));
        }
    }

    #[test]
    fn docx_document_xml_parseable() {
        let xml = document_xml(b"BT /F1 12 Tf (XML parse test) Tj ET");

        if let Some(e) = first_xml_error(&xml) {
            panic!("Invalid XML in document.xml: {e}");
        }
    }

    #[test]
    fn docx_styles_xml_parseable() {
        let docx = convert(b"BT /F1 12 Tf (Styles test) Tj ET");
        let xml = read_zip_entry(&docx, "word/styles.xml").unwrap();

        if let Some(e) = first_xml_error(&xml) {
            panic!("Invalid XML in styles.xml: {e}");
        }
    }

    #[test]
    fn docx_text_preserved() {
        let xml = document_xml(b"BT /F1 12 Tf (Hello World) Tj ET");

        assert!(
            xml.contains("Hello World"),
            "Expected 'Hello World' in document.xml, got: {xml}"
        );
    }

    #[test]
    fn docx_multiline_text_preserved() {
        let xml = document_xml(b"BT /F1 12 Tf 12 TL (First line) Tj T* (Second line) Tj ET");

        for expected in ["First line", "Second line"] {
            assert!(xml.contains(expected));
        }
    }

    #[test]
    fn docx_table_content_in_xml() {
        // Four text placements forming a 2x2 grid: a header row and one data row.
        let xml = document_xml(
            b"BT /F1 12 Tf 1 0 0 1 72 700 Tm (Name) Tj 1 0 0 1 200 700 Tm (Age) Tj 1 0 0 1 72 684 Tm (Alice) Tj 1 0 0 1 200 684 Tm (30) Tj ET",
        );

        for expected in ["Name", "Alice"] {
            assert!(xml.contains(expected));
        }
    }

    #[test]
    fn docx_text_similarity_above_threshold() {
        let input_text = "Hello World";
        let doc = make_test_pdf(b"BT /F1 12 Tf (Hello World) Tj ET");

        // Reference text: what the extractor itself sees in the PDF.
        let extracted = pdf_extract::extract_text(&doc);
        let pdf_text: String = extracted
            .iter()
            .map(|block| block.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");

        // Candidate text: everything inside `w:t` elements of the generated document.
        let docx = pdf_to_docx(&doc).unwrap();
        let xml = read_zip_entry(&docx, "word/document.xml").unwrap();
        let docx_text = collect_wt_text(&xml).join(" ");

        // Only compare when the reference is at least five bytes long.
        if pdf_text.len() >= 5 {
            let similarity = levenshtein_similarity(&pdf_text, &docx_text);
            assert!(
                similarity >= 0.80,
                "Text similarity {similarity:.2} below 0.80 threshold.\n PDF: '{pdf_text}'\n DOCX: '{docx_text}'"
            );
        }

        assert!(
            docx_text.contains(input_text),
            "Expected '{input_text}' in DOCX text: '{docx_text}'"
        );
    }

    #[test]
    fn docx_content_types_valid() {
        let docx = convert(b"BT /F1 12 Tf (Content types test) Tj ET");
        let xml = read_zip_entry(&docx, "[Content_Types].xml").unwrap();

        for marker in ["ContentType", "wordprocessingml"] {
            assert!(xml.contains(marker));
        }
    }
}