pdf_docx/
lib.rs

1#![warn(missing_docs)]
2//! PDF to DOCX conversion with text, tables, and images.
3//!
4//! Extracts text blocks, images, and spatial layout from PDF documents
5//! and produces valid OOXML (.docx) files.
6
7pub mod error;
8pub mod layout;
9pub mod writer;
10
11pub use error::{DocxError, Result};
12pub use layout::{DocxImage, PageElement, Paragraph, Run, Table};
13
14use layout::analyze_page;
15use lopdf::Document;
16use pdf_extract::{extract_page_images, extract_text, ImageFilter};
17use writer::write_docx;
18
19/// Maximum number of pages to convert to DOCX. Massive documents (e.g. 1000+
20/// pages) are rarely useful as documents and cause timeouts.
21const MAX_DOCX_PAGES: u32 = 1000;
22
23/// Convert a PDF document to DOCX format.
24///
25/// Returns the DOCX file contents as bytes.
26pub fn pdf_to_docx(doc: &Document) -> Result<Vec<u8>> {
27    pdf_to_docx_inner(doc, false)
28}
29
30/// Convert a PDF document to DOCX format, text only (no images).
31///
32/// Skips image extraction for faster conversion when only text content
33/// is needed (e.g. text-similarity tests).
34pub fn pdf_to_docx_text_only(doc: &Document) -> Result<Vec<u8>> {
35    pdf_to_docx_sequential(doc)
36}
37
38/// Convert PDF to DOCX preserving text in extraction (content-stream) order.
39///
40/// Unlike `pdf_to_docx` which sorts text spatially for visual layout,
41/// this version writes text blocks in the order they appear in the content
42/// stream. This produces a DOCX whose text content matches `extract_text`
43/// ordering, improving roundtrip similarity scores.
44fn pdf_to_docx_sequential(doc: &Document) -> Result<Vec<u8>> {
45    let pages = doc.get_pages();
46    let total_pages = pages.len() as u32;
47    let total_pages = total_pages.min(MAX_DOCX_PAGES);
48    let text_blocks = extract_text(doc);
49
50    let mut all_elements: Vec<Vec<PageElement>> = Vec::new();
51
52    for page_num in 1..=total_pages {
53        let page_blocks: Vec<_> = text_blocks
54            .iter()
55            .filter(|b| b.page == page_num)
56            .cloned()
57            .collect();
58
59        // Write blocks in extraction order as individual paragraphs
60        // (no spatial sorting, no table detection).
61        let elements: Vec<PageElement> = page_blocks
62            .iter()
63            .map(|b| {
64                PageElement::Para(layout::Paragraph {
65                    runs: vec![layout::Run {
66                        text: b.text.clone(),
67                        font_name: String::new(),
68                        font_size: b.font_size,
69                        bold: false,
70                        italic: false,
71                    }],
72                })
73            })
74            .collect();
75
76        all_elements.push(elements);
77    }
78
79    let mut output = Vec::new();
80    write_docx(&all_elements, &[], &mut output)?;
81    Ok(output)
82}
83
84fn pdf_to_docx_inner(doc: &Document, skip_images: bool) -> Result<Vec<u8>> {
85    let pages = doc.get_pages();
86    let total_pages = pages.len() as u32;
87    let total_pages = total_pages.min(MAX_DOCX_PAGES);
88
89    let mut all_elements: Vec<Vec<PageElement>> = Vec::new();
90    let mut all_images: Vec<DocxImage> = Vec::new();
91
92    // Extract text blocks for all pages at once.
93    let text_blocks = extract_text(doc);
94
95    for page_num in 1..=total_pages {
96        // Get text blocks for this page.
97        let page_blocks: Vec<_> = text_blocks
98            .iter()
99            .filter(|b| b.page == page_num)
100            .cloned()
101            .collect();
102
103        // Layout analysis.
104        let mut elements = analyze_page(&page_blocks);
105
106        // Extract images for this page (unless skip_images is set).
107        if !skip_images {
108            if let Ok(images) = extract_page_images(doc, page_num) {
109                for img in images {
110                    let (content_type, ext) = match img.filter {
111                        ImageFilter::Jpeg => ("image/jpeg", "jpeg"),
112                        _ => ("image/png", "png"),
113                    };
114
115                    let id = format!("image{}_{}.{}", page_num, all_images.len(), ext);
116
117                    all_images.push(DocxImage {
118                        data: img.data,
119                        width: img.width,
120                        height: img.height,
121                        content_type: content_type.to_string(),
122                        id: id.clone(),
123                    });
124
125                    elements.push(PageElement::Img(layout::DocxImage {
126                        data: Vec::new(), // data stored in all_images
127                        width: img.width,
128                        height: img.height,
129                        content_type: content_type.to_string(),
130                        id,
131                    }));
132                }
133            }
134        }
135
136        all_elements.push(elements);
137    }
138
139    let mut output = Vec::new();
140    write_docx(&all_elements, &all_images, &mut output)?;
141    Ok(output)
142}
143
144/// Convert a PDF file (bytes) to DOCX format.
145pub fn convert_pdf_bytes_to_docx(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
146    let doc = Document::load_mem(pdf_bytes)?;
147    pdf_to_docx(&doc)
148}
149
150#[cfg(test)]
151mod tests {
152    use super::*;
153    use lopdf::{dictionary, Document, Object, Stream};
154    use std::io::Read;
155
156    fn make_test_pdf(content: &[u8]) -> Document {
157        let mut doc = Document::with_version("1.7");
158
159        let content_stream = Stream::new(dictionary! {}, content.to_vec());
160        let content_id = doc.add_object(Object::Stream(content_stream));
161
162        let page_dict = dictionary! {
163            "Type" => "Page",
164            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
165            "Contents" => Object::Reference(content_id),
166        };
167        let page_id = doc.add_object(Object::Dictionary(page_dict));
168
169        let pages_dict = dictionary! {
170            "Type" => "Pages",
171            "Kids" => vec![Object::Reference(page_id)],
172            "Count" => 1_i64,
173        };
174        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
175
176        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
177            d.set("Parent", Object::Reference(pages_id));
178        }
179
180        let catalog = dictionary! {
181            "Type" => "Catalog",
182            "Pages" => Object::Reference(pages_id),
183        };
184        let catalog_id = doc.add_object(Object::Dictionary(catalog));
185        doc.trailer.set("Root", Object::Reference(catalog_id));
186
187        doc
188    }
189
190    fn read_zip_entry(data: &[u8], name: &str) -> Option<String> {
191        let cursor = std::io::Cursor::new(data);
192        let mut archive = zip::ZipArchive::new(cursor).ok()?;
193        let mut file = archive.by_name(name).ok()?;
194        let mut content = String::new();
195        file.read_to_string(&mut content).ok()?;
196        Some(content)
197    }
198
199    fn zip_file_names(data: &[u8]) -> Vec<String> {
200        let cursor = std::io::Cursor::new(data);
201        let archive = zip::ZipArchive::new(cursor).unwrap();
202        (0..archive.len())
203            .map(|i| archive.name_for_index(i).unwrap().to_string())
204            .collect()
205    }
206
207    fn levenshtein_similarity(a: &str, b: &str) -> f64 {
208        let a: Vec<char> = a.chars().collect();
209        let b: Vec<char> = b.chars().collect();
210        let (m, n) = (a.len(), b.len());
211        if m == 0 && n == 0 {
212            return 1.0;
213        }
214        let mut prev: Vec<usize> = (0..=n).collect();
215        let mut curr = vec![0; n + 1];
216        for i in 1..=m {
217            curr[0] = i;
218            for j in 1..=n {
219                let cost = if a[i - 1] == b[j - 1] { 0 } else { 1 };
220                curr[j] = (prev[j] + 1).min(curr[j - 1] + 1).min(prev[j - 1] + cost);
221            }
222            std::mem::swap(&mut prev, &mut curr);
223        }
224        1.0 - (prev[n] as f64 / m.max(n) as f64)
225    }
226
227    #[test]
228    fn convert_simple_text_pdf() {
229        let doc = make_test_pdf(b"BT /F1 12 Tf (Hello World) Tj ET");
230        let docx = pdf_to_docx(&doc).unwrap();
231        assert!(docx.len() > 100);
232        assert_eq!(&docx[0..2], b"PK"); // ZIP magic bytes
233    }
234
235    #[test]
236    fn convert_multiline_pdf() {
237        let doc = make_test_pdf(b"BT /F1 12 Tf 12 TL (Line 1) Tj T* (Line 2) Tj ET");
238        let docx = pdf_to_docx(&doc).unwrap();
239        assert!(docx.len() > 100);
240    }
241
242    #[test]
243    fn convert_empty_pdf() {
244        let doc = make_test_pdf(b"");
245        let docx = pdf_to_docx(&doc).unwrap();
246        assert!(docx.len() > 100);
247    }
248
249    #[test]
250    fn convert_from_bytes() {
251        let mut doc = make_test_pdf(b"BT /F1 12 Tf (Test) Tj ET");
252        let mut pdf_bytes = Vec::new();
253        doc.save_to(&mut pdf_bytes).unwrap();
254
255        let docx = convert_pdf_bytes_to_docx(&pdf_bytes).unwrap();
256        assert!(docx.len() > 100);
257    }
258
259    #[test]
260    fn docx_structure_has_required_files() {
261        let doc = make_test_pdf(b"BT /F1 12 Tf (Structure test) Tj ET");
262        let docx = pdf_to_docx(&doc).unwrap();
263        let names = zip_file_names(&docx);
264
265        assert!(names.contains(&"[Content_Types].xml".to_string()));
266        assert!(names.contains(&"_rels/.rels".to_string()));
267        assert!(names.contains(&"word/document.xml".to_string()));
268        assert!(names.contains(&"word/styles.xml".to_string()));
269        assert!(names.contains(&"word/_rels/document.xml.rels".to_string()));
270    }
271
272    #[test]
273    fn docx_document_xml_parseable() {
274        let doc = make_test_pdf(b"BT /F1 12 Tf (XML parse test) Tj ET");
275        let docx = pdf_to_docx(&doc).unwrap();
276        let xml = read_zip_entry(&docx, "word/document.xml").unwrap();
277
278        // Verify it parses as valid XML.
279        let parsed = quick_xml::Reader::from_str(&xml);
280        let mut buf = Vec::new();
281        let mut reader = parsed;
282        loop {
283            match reader.read_event_into(&mut buf) {
284                Ok(quick_xml::events::Event::Eof) => break,
285                Err(e) => panic!("Invalid XML in document.xml: {e}"),
286                _ => {}
287            }
288            buf.clear();
289        }
290    }
291
292    #[test]
293    fn docx_styles_xml_parseable() {
294        let doc = make_test_pdf(b"BT /F1 12 Tf (Styles test) Tj ET");
295        let docx = pdf_to_docx(&doc).unwrap();
296        let xml = read_zip_entry(&docx, "word/styles.xml").unwrap();
297
298        let mut reader = quick_xml::Reader::from_str(&xml);
299        let mut buf = Vec::new();
300        loop {
301            match reader.read_event_into(&mut buf) {
302                Ok(quick_xml::events::Event::Eof) => break,
303                Err(e) => panic!("Invalid XML in styles.xml: {e}"),
304                _ => {}
305            }
306            buf.clear();
307        }
308    }
309
310    #[test]
311    fn docx_text_preserved() {
312        let doc = make_test_pdf(b"BT /F1 12 Tf (Hello World) Tj ET");
313        let docx = pdf_to_docx(&doc).unwrap();
314        let xml = read_zip_entry(&docx, "word/document.xml").unwrap();
315
316        assert!(
317            xml.contains("Hello World"),
318            "Expected 'Hello World' in document.xml, got: {xml}"
319        );
320    }
321
322    #[test]
323    fn docx_multiline_text_preserved() {
324        let doc = make_test_pdf(b"BT /F1 12 Tf 12 TL (First line) Tj T* (Second line) Tj ET");
325        let docx = pdf_to_docx(&doc).unwrap();
326        let xml = read_zip_entry(&docx, "word/document.xml").unwrap();
327
328        assert!(xml.contains("First line"));
329        assert!(xml.contains("Second line"));
330    }
331
332    #[test]
333    fn docx_table_content_in_xml() {
334        let content = b"BT /F1 12 Tf 1 0 0 1 72 700 Tm (Name) Tj 1 0 0 1 200 700 Tm (Age) Tj 1 0 0 1 72 684 Tm (Alice) Tj 1 0 0 1 200 684 Tm (30) Tj ET";
335        let doc = make_test_pdf(content);
336        let docx = pdf_to_docx(&doc).unwrap();
337        let xml = read_zip_entry(&docx, "word/document.xml").unwrap();
338
339        // Table or paragraph content should contain the text.
340        assert!(xml.contains("Name"));
341        assert!(xml.contains("Alice"));
342    }
343
344    #[test]
345    fn docx_text_similarity_above_threshold() {
346        let input_text = "Hello World";
347        let doc = make_test_pdf(b"BT /F1 12 Tf (Hello World) Tj ET");
348
349        // Extract text from the source PDF via pdf-extract.
350        let blocks = pdf_extract::extract_text(&doc);
351        let pdf_text: String = blocks
352            .iter()
353            .map(|b| b.text.as_str())
354            .collect::<Vec<_>>()
355            .join(" ");
356
357        // Convert and extract text from DOCX XML.
358        let docx = pdf_to_docx(&doc).unwrap();
359        let xml = read_zip_entry(&docx, "word/document.xml").unwrap();
360
361        // Extract text content from w:t elements.
362        let mut docx_texts = Vec::new();
363        let mut reader = quick_xml::Reader::from_str(&xml);
364        let mut buf = Vec::new();
365        let mut in_wt = false;
366        loop {
367            match reader.read_event_into(&mut buf) {
368                Ok(quick_xml::events::Event::Start(e)) => {
369                    in_wt = e.name().as_ref() == b"w:t";
370                }
371                Ok(quick_xml::events::Event::Text(e)) if in_wt => {
372                    docx_texts.push(e.unescape().unwrap().to_string());
373                }
374                Ok(quick_xml::events::Event::End(_)) => {
375                    in_wt = false;
376                }
377                Ok(quick_xml::events::Event::Eof) => break,
378                Err(e) => panic!("XML parse error: {e}"),
379                _ => {}
380            }
381            buf.clear();
382        }
383        let docx_text = docx_texts.join(" ");
384
385        if pdf_text.len() >= 5 {
386            let similarity = levenshtein_similarity(&pdf_text, &docx_text);
387            assert!(
388                similarity >= 0.80,
389                "Text similarity {similarity:.2} below 0.80 threshold.\n  PDF:  '{pdf_text}'\n  DOCX: '{docx_text}'"
390            );
391        }
392
393        // Also check the known input text appears.
394        assert!(
395            docx_text.contains(input_text),
396            "Expected '{input_text}' in DOCX text: '{docx_text}'"
397        );
398    }
399
400    #[test]
401    fn docx_content_types_valid() {
402        let doc = make_test_pdf(b"BT /F1 12 Tf (Content types test) Tj ET");
403        let docx = pdf_to_docx(&doc).unwrap();
404        let xml = read_zip_entry(&docx, "[Content_Types].xml").unwrap();
405
406        assert!(xml.contains("ContentType"));
407        assert!(xml.contains("wordprocessingml"));
408    }
409}
pdf_docx/lib.rs

pdf_docx/
lib.rs