pdf_docx/
lib.rs

1//! PDF to DOCX conversion with text, tables, and images.
2//!
3//! Extracts text blocks, images, and spatial layout from PDF documents
4//! and produces valid OOXML (.docx) files.
5
6pub mod error;
7pub mod layout;
8pub mod writer;
9
10pub use error::{DocxError, Result};
11pub use layout::{DocxImage, PageElement, Paragraph, Run, Table};
12
13use layout::analyze_page;
14use lopdf::Document;
15use pdf_extract::{extract_page_images, extract_text, ImageFilter};
16use writer::write_docx;
17
/// Maximum number of pages to convert to DOCX. Pages beyond this limit are
/// left out of the output: massive documents (e.g. 1000+ pages) are rarely
/// useful as DOCX files and cause timeouts.
20const MAX_DOCX_PAGES: u32 = 1000;
21
22/// Convert a PDF document to DOCX format.
23///
24/// Returns the DOCX file contents as bytes.
25pub fn pdf_to_docx(doc: &Document) -> Result<Vec<u8>> {
26    pdf_to_docx_inner(doc, false)
27}
28
29/// Convert a PDF document to DOCX format, text only (no images).
30///
31/// Skips image extraction for faster conversion when only text content
32/// is needed (e.g. text-similarity tests).
33pub fn pdf_to_docx_text_only(doc: &Document) -> Result<Vec<u8>> {
34    pdf_to_docx_sequential(doc)
35}
36
37/// Convert PDF to DOCX preserving text in extraction (content-stream) order.
38///
39/// Unlike `pdf_to_docx` which sorts text spatially for visual layout,
40/// this version writes text blocks in the order they appear in the content
41/// stream. This produces a DOCX whose text content matches `extract_text`
42/// ordering, improving roundtrip similarity scores.
43fn pdf_to_docx_sequential(doc: &Document) -> Result<Vec<u8>> {
44    let pages = doc.get_pages();
45    let total_pages = pages.len() as u32;
46    let total_pages = total_pages.min(MAX_DOCX_PAGES);
47    let text_blocks = extract_text(doc);
48
49    let mut all_elements: Vec<Vec<PageElement>> = Vec::new();
50
51    for page_num in 1..=total_pages {
52        let page_blocks: Vec<_> = text_blocks
53            .iter()
54            .filter(|b| b.page == page_num)
55            .cloned()
56            .collect();
57
58        // Write blocks in extraction order as individual paragraphs
59        // (no spatial sorting, no table detection).
60        let elements: Vec<PageElement> = page_blocks
61            .iter()
62            .map(|b| {
63                PageElement::Para(layout::Paragraph {
64                    runs: vec![layout::Run {
65                        text: b.text.clone(),
66                        font_name: String::new(),
67                        font_size: b.font_size,
68                        bold: false,
69                        italic: false,
70                    }],
71                })
72            })
73            .collect();
74
75        all_elements.push(elements);
76    }
77
78    let mut output = Vec::new();
79    write_docx(&all_elements, &[], &mut output)?;
80    Ok(output)
81}
82
83fn pdf_to_docx_inner(doc: &Document, skip_images: bool) -> Result<Vec<u8>> {
84    let pages = doc.get_pages();
85    let total_pages = pages.len() as u32;
86    let total_pages = total_pages.min(MAX_DOCX_PAGES);
87
88    let mut all_elements: Vec<Vec<PageElement>> = Vec::new();
89    let mut all_images: Vec<DocxImage> = Vec::new();
90
91    // Extract text blocks for all pages at once.
92    let text_blocks = extract_text(doc);
93
94    for page_num in 1..=total_pages {
95        // Get text blocks for this page.
96        let page_blocks: Vec<_> = text_blocks
97            .iter()
98            .filter(|b| b.page == page_num)
99            .cloned()
100            .collect();
101
102        // Layout analysis.
103        let mut elements = analyze_page(&page_blocks);
104
105        // Extract images for this page (unless skip_images is set).
106        if !skip_images {
107            if let Ok(images) = extract_page_images(doc, page_num) {
108                for img in images {
109                    let (content_type, ext) = match img.filter {
110                        ImageFilter::Jpeg => ("image/jpeg", "jpeg"),
111                        _ => ("image/png", "png"),
112                    };
113
114                    let id = format!("image{}_{}.{}", page_num, all_images.len(), ext);
115
116                    all_images.push(DocxImage {
117                        data: img.data,
118                        width: img.width,
119                        height: img.height,
120                        content_type: content_type.to_string(),
121                        id: id.clone(),
122                    });
123
124                    elements.push(PageElement::Img(layout::DocxImage {
125                        data: Vec::new(), // data stored in all_images
126                        width: img.width,
127                        height: img.height,
128                        content_type: content_type.to_string(),
129                        id,
130                    }));
131                }
132            }
133        }
134
135        all_elements.push(elements);
136    }
137
138    let mut output = Vec::new();
139    write_docx(&all_elements, &all_images, &mut output)?;
140    Ok(output)
141}
142
143/// Convert a PDF file (bytes) to DOCX format.
144pub fn convert_pdf_bytes_to_docx(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
145    let doc = Document::load_mem(pdf_bytes)?;
146    pdf_to_docx(&doc)
147}
148
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{dictionary, Document, Object, Stream};
    use std::io::Read;

    /// Builds a one-page PDF (612x792 MediaBox) whose page content stream is
    /// `content`.
    fn make_test_pdf(content: &[u8]) -> Document {
        let mut pdf = Document::with_version("1.7");

        let stream_id = pdf.add_object(Object::Stream(Stream::new(
            dictionary! {},
            content.to_vec(),
        )));

        let page_id = pdf.add_object(Object::Dictionary(dictionary! {
            "Type" => "Page",
            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
            "Contents" => Object::Reference(stream_id),
        }));

        let pages_id = pdf.add_object(Object::Dictionary(dictionary! {
            "Type" => "Pages",
            "Kids" => vec![Object::Reference(page_id)],
            "Count" => 1_i64,
        }));

        // The page is created before its parent node, so the back-reference
        // is patched in afterwards.
        if let Ok(Object::Dictionary(page)) = pdf.get_object_mut(page_id) {
            page.set("Parent", Object::Reference(pages_id));
        }

        let catalog_id = pdf.add_object(Object::Dictionary(dictionary! {
            "Type" => "Catalog",
            "Pages" => Object::Reference(pages_id),
        }));
        pdf.trailer.set("Root", Object::Reference(catalog_id));

        pdf
    }

    /// Reads the ZIP entry `name` from `data` as a UTF-8 string, or `None`
    /// if the archive, the entry, or the read fails.
    fn read_zip_entry(data: &[u8], name: &str) -> Option<String> {
        let mut archive = zip::ZipArchive::new(std::io::Cursor::new(data)).ok()?;
        let mut entry = archive.by_name(name).ok()?;
        let mut text = String::new();
        entry.read_to_string(&mut text).ok()?;
        Some(text)
    }

    /// Lists the names of all entries in the ZIP archive `data`.
    fn zip_file_names(data: &[u8]) -> Vec<String> {
        let archive = zip::ZipArchive::new(std::io::Cursor::new(data)).unwrap();
        let mut names = Vec::with_capacity(archive.len());
        for index in 0..archive.len() {
            names.push(archive.name_for_index(index).unwrap().to_string());
        }
        names
    }

    /// Similarity in `[0, 1]`: one minus the character-level edit distance
    /// divided by the length of the longer string. Two empty strings are
    /// treated as identical.
    fn levenshtein_similarity(a: &str, b: &str) -> f64 {
        let left: Vec<char> = a.chars().collect();
        let right: Vec<char> = b.chars().collect();
        let longest = left.len().max(right.len());
        if longest == 0 {
            return 1.0;
        }

        // Two-row dynamic programme: `above` is the previous row, `row` the
        // one being filled.
        let mut above: Vec<usize> = (0..=right.len()).collect();
        let mut row = vec![0usize; right.len() + 1];
        for (i, left_ch) in left.iter().enumerate() {
            row[0] = i + 1;
            for (j, right_ch) in right.iter().enumerate() {
                let deletion = above[j + 1] + 1;
                let insertion = row[j] + 1;
                let substitution = above[j] + usize::from(left_ch != right_ch);
                row[j + 1] = deletion.min(insertion).min(substitution);
            }
            std::mem::swap(&mut above, &mut row);
        }

        1.0 - (above[right.len()] as f64 / longest as f64)
    }

    /// Converts a one-page PDF with the given content stream to DOCX bytes.
    fn docx_for(content: &[u8]) -> Vec<u8> {
        pdf_to_docx(&make_test_pdf(content)).unwrap()
    }

    /// Converts a one-page PDF and returns the generated `word/document.xml`.
    fn document_xml_for(content: &[u8]) -> String {
        read_zip_entry(&docx_for(content), "word/document.xml").unwrap()
    }

    /// Panics unless `xml` can be read to EOF without a parse error; `part`
    /// names the archive member in the panic message.
    fn assert_well_formed_xml(xml: &str, part: &str) {
        use quick_xml::events::Event;

        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Eof) => break,
                Err(e) => panic!("Invalid XML in {part}: {e}"),
                _ => {}
            }
            buf.clear();
        }
    }

    /// Collects the text found directly inside `w:t` elements, in document
    /// order.
    fn collect_wt_text(xml: &str) -> Vec<String> {
        use quick_xml::events::Event;

        let mut texts = Vec::new();
        let mut reader = quick_xml::Reader::from_str(xml);
        let mut buf = Vec::new();
        let mut inside_wt = false;
        loop {
            match reader.read_event_into(&mut buf) {
                Ok(Event::Start(tag)) => inside_wt = tag.name().as_ref() == b"w:t",
                Ok(Event::Text(text)) if inside_wt => {
                    texts.push(text.unescape().unwrap().to_string());
                }
                Ok(Event::End(_)) => inside_wt = false,
                Ok(Event::Eof) => break,
                Err(e) => panic!("XML parse error: {e}"),
                _ => {}
            }
            buf.clear();
        }
        texts
    }

    #[test]
    fn convert_simple_text_pdf() {
        let docx = docx_for(b"BT /F1 12 Tf (Hello World) Tj ET");
        assert!(docx.len() > 100);
        assert!(docx.starts_with(b"PK")); // ZIP magic bytes
    }

    #[test]
    fn convert_multiline_pdf() {
        let docx = docx_for(b"BT /F1 12 Tf 12 TL (Line 1) Tj T* (Line 2) Tj ET");
        assert!(docx.len() > 100);
    }

    #[test]
    fn convert_empty_pdf() {
        let docx = docx_for(b"");
        assert!(docx.len() > 100);
    }

    #[test]
    fn convert_from_bytes() {
        let mut source = make_test_pdf(b"BT /F1 12 Tf (Test) Tj ET");
        let mut pdf_bytes = Vec::new();
        source.save_to(&mut pdf_bytes).unwrap();

        let docx = convert_pdf_bytes_to_docx(&pdf_bytes).unwrap();
        assert!(docx.len() > 100);
    }

    #[test]
    fn docx_structure_has_required_files() {
        let names = zip_file_names(&docx_for(b"BT /F1 12 Tf (Structure test) Tj ET"));

        for required in [
            "[Content_Types].xml",
            "_rels/.rels",
            "word/document.xml",
            "word/styles.xml",
            "word/_rels/document.xml.rels",
        ] {
            assert!(names.contains(&required.to_string()));
        }
    }

    #[test]
    fn docx_document_xml_parseable() {
        let xml = document_xml_for(b"BT /F1 12 Tf (XML parse test) Tj ET");

        // Verify it parses as valid XML.
        assert_well_formed_xml(&xml, "document.xml");
    }

    #[test]
    fn docx_styles_xml_parseable() {
        let docx = docx_for(b"BT /F1 12 Tf (Styles test) Tj ET");
        let xml = read_zip_entry(&docx, "word/styles.xml").unwrap();

        assert_well_formed_xml(&xml, "styles.xml");
    }

    #[test]
    fn docx_text_preserved() {
        let xml = document_xml_for(b"BT /F1 12 Tf (Hello World) Tj ET");

        assert!(
            xml.contains("Hello World"),
            "Expected 'Hello World' in document.xml, got: {xml}"
        );
    }

    #[test]
    fn docx_multiline_text_preserved() {
        let xml = document_xml_for(b"BT /F1 12 Tf 12 TL (First line) Tj T* (Second line) Tj ET");

        for expected in ["First line", "Second line"] {
            assert!(xml.contains(expected));
        }
    }

    #[test]
    fn docx_table_content_in_xml() {
        let xml = document_xml_for(
            b"BT /F1 12 Tf 1 0 0 1 72 700 Tm (Name) Tj 1 0 0 1 200 700 Tm (Age) Tj 1 0 0 1 72 684 Tm (Alice) Tj 1 0 0 1 200 684 Tm (30) Tj ET",
        );

        // Table or paragraph content should contain the text.
        for expected in ["Name", "Alice"] {
            assert!(xml.contains(expected));
        }
    }

    #[test]
    fn docx_text_similarity_above_threshold() {
        let input_text = "Hello World";
        let doc = make_test_pdf(b"BT /F1 12 Tf (Hello World) Tj ET");

        // Extract text from the source PDF via pdf-extract.
        let pdf_text = pdf_extract::extract_text(&doc)
            .iter()
            .map(|block| block.text.as_str())
            .collect::<Vec<_>>()
            .join(" ");

        // Convert, then pull the text back out of the w:t elements.
        let docx = pdf_to_docx(&doc).unwrap();
        let xml = read_zip_entry(&docx, "word/document.xml").unwrap();
        let docx_text = collect_wt_text(&xml).join(" ");

        // The similarity check only applies when the extracted PDF text is at
        // least five bytes long.
        if pdf_text.len() >= 5 {
            let similarity = levenshtein_similarity(&pdf_text, &docx_text);
            assert!(
                similarity >= 0.80,
                "Text similarity {similarity:.2} below 0.80 threshold.\n  PDF:  '{pdf_text}'\n  DOCX: '{docx_text}'"
            );
        }

        // Also check the known input text appears.
        assert!(
            docx_text.contains(input_text),
            "Expected '{input_text}' in DOCX text: '{docx_text}'"
        );
    }

    #[test]
    fn docx_content_types_valid() {
        let docx = docx_for(b"BT /F1 12 Tf (Content types test) Tj ET");
        let xml = read_zip_entry(&docx, "[Content_Types].xml").unwrap();

        for marker in ["ContentType", "wordprocessingml"] {
            assert!(xml.contains(marker));
        }
    }
}
pdf_docx/lib.rs

pdf_docx/
lib.rs