1#![warn(missing_docs)]
2pub mod error;
8pub mod layout;
9pub mod writer;
10
11pub use error::{DocxError, Result};
12pub use layout::{DocxImage, PageElement, Paragraph, Run, Table};
13
14use layout::analyze_page;
15use lopdf::Document;
16use pdf_extract::{extract_page_images, extract_text, ImageFilter};
17use writer::write_docx;
18
/// Upper bound on the number of pages converted per document.
///
/// The converters in this file clamp the page count to this value, so pages
/// past the limit are left out of the generated DOCX.
const MAX_DOCX_PAGES: u32 = 1_000;
22
23pub fn pdf_to_docx(doc: &Document) -> Result<Vec<u8>> {
27 pdf_to_docx_inner(doc, false)
28}
29
30pub fn pdf_to_docx_text_only(doc: &Document) -> Result<Vec<u8>> {
35 pdf_to_docx_sequential(doc)
36}
37
38fn pdf_to_docx_sequential(doc: &Document) -> Result<Vec<u8>> {
45 let pages = doc.get_pages();
46 let total_pages = pages.len() as u32;
47 let total_pages = total_pages.min(MAX_DOCX_PAGES);
48 let text_blocks = extract_text(doc);
49
50 let mut all_elements: Vec<Vec<PageElement>> = Vec::new();
51
52 for page_num in 1..=total_pages {
53 let page_blocks: Vec<_> = text_blocks
54 .iter()
55 .filter(|b| b.page == page_num)
56 .cloned()
57 .collect();
58
59 let elements: Vec<PageElement> = page_blocks
62 .iter()
63 .map(|b| {
64 PageElement::Para(layout::Paragraph {
65 runs: vec![layout::Run {
66 text: b.text.clone(),
67 font_name: String::new(),
68 font_size: b.font_size,
69 bold: false,
70 italic: false,
71 }],
72 })
73 })
74 .collect();
75
76 all_elements.push(elements);
77 }
78
79 let mut output = Vec::new();
80 write_docx(&all_elements, &[], &mut output)?;
81 Ok(output)
82}
83
84fn pdf_to_docx_inner(doc: &Document, skip_images: bool) -> Result<Vec<u8>> {
85 let pages = doc.get_pages();
86 let total_pages = pages.len() as u32;
87 let total_pages = total_pages.min(MAX_DOCX_PAGES);
88
89 let mut all_elements: Vec<Vec<PageElement>> = Vec::new();
90 let mut all_images: Vec<DocxImage> = Vec::new();
91
92 let text_blocks = extract_text(doc);
94
95 for page_num in 1..=total_pages {
96 let page_blocks: Vec<_> = text_blocks
98 .iter()
99 .filter(|b| b.page == page_num)
100 .cloned()
101 .collect();
102
103 let mut elements = analyze_page(&page_blocks);
105
106 if !skip_images {
108 if let Ok(images) = extract_page_images(doc, page_num) {
109 for img in images {
110 let (content_type, ext) = match img.filter {
111 ImageFilter::Jpeg => ("image/jpeg", "jpeg"),
112 _ => ("image/png", "png"),
113 };
114
115 let id = format!("image{}_{}.{}", page_num, all_images.len(), ext);
116
117 all_images.push(DocxImage {
118 data: img.data,
119 width: img.width,
120 height: img.height,
121 content_type: content_type.to_string(),
122 id: id.clone(),
123 });
124
125 elements.push(PageElement::Img(layout::DocxImage {
126 data: Vec::new(), width: img.width,
128 height: img.height,
129 content_type: content_type.to_string(),
130 id,
131 }));
132 }
133 }
134 }
135
136 all_elements.push(elements);
137 }
138
139 let mut output = Vec::new();
140 write_docx(&all_elements, &all_images, &mut output)?;
141 Ok(output)
142}
143
144pub fn convert_pdf_bytes_to_docx(pdf_bytes: &[u8]) -> Result<Vec<u8>> {
146 let doc = Document::load_mem(pdf_bytes)?;
147 pdf_to_docx(&doc)
148}
149
150#[cfg(test)]
151mod tests {
152 use super::*;
153 use lopdf::{dictionary, Document, Object, Stream};
154 use std::io::Read;
155
156 fn make_test_pdf(content: &[u8]) -> Document {
157 let mut doc = Document::with_version("1.7");
158
159 let content_stream = Stream::new(dictionary! {}, content.to_vec());
160 let content_id = doc.add_object(Object::Stream(content_stream));
161
162 let page_dict = dictionary! {
163 "Type" => "Page",
164 "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
165 "Contents" => Object::Reference(content_id),
166 };
167 let page_id = doc.add_object(Object::Dictionary(page_dict));
168
169 let pages_dict = dictionary! {
170 "Type" => "Pages",
171 "Kids" => vec![Object::Reference(page_id)],
172 "Count" => 1_i64,
173 };
174 let pages_id = doc.add_object(Object::Dictionary(pages_dict));
175
176 if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
177 d.set("Parent", Object::Reference(pages_id));
178 }
179
180 let catalog = dictionary! {
181 "Type" => "Catalog",
182 "Pages" => Object::Reference(pages_id),
183 };
184 let catalog_id = doc.add_object(Object::Dictionary(catalog));
185 doc.trailer.set("Root", Object::Reference(catalog_id));
186
187 doc
188 }
189
190 fn read_zip_entry(data: &[u8], name: &str) -> Option<String> {
191 let cursor = std::io::Cursor::new(data);
192 let mut archive = zip::ZipArchive::new(cursor).ok()?;
193 let mut file = archive.by_name(name).ok()?;
194 let mut content = String::new();
195 file.read_to_string(&mut content).ok()?;
196 Some(content)
197 }
198
199 fn zip_file_names(data: &[u8]) -> Vec<String> {
200 let cursor = std::io::Cursor::new(data);
201 let archive = zip::ZipArchive::new(cursor).unwrap();
202 (0..archive.len())
203 .map(|i| archive.name_for_index(i).unwrap().to_string())
204 .collect()
205 }
206
/// Returns a similarity score in `0.0..=1.0` based on the Levenshtein distance
/// between `a` and `b`, measured in `char`s.
///
/// The score is `1 - distance / max(len(a), len(b))`; two empty strings score
/// `1.0`.
fn levenshtein_similarity(a: &str, b: &str) -> f64 {
    let left: Vec<char> = a.chars().collect();
    let right: Vec<char> = b.chars().collect();

    let longest = left.len().max(right.len());
    if longest == 0 {
        return 1.0;
    }

    // One rolling row: before an inner step `row[j + 1]` still holds the value
    // from the previous row, while `row[j]` already belongs to the current one.
    let mut row: Vec<usize> = (0..=right.len()).collect();
    for (i, &lc) in left.iter().enumerate() {
        let mut diagonal = row[0];
        row[0] = i + 1;
        for (j, &rc) in right.iter().enumerate() {
            let above = row[j + 1];
            let substitution = diagonal + usize::from(lc != rc);
            row[j + 1] = (above + 1).min(row[j] + 1).min(substitution);
            diagonal = above;
        }
    }

    1.0 - (row[right.len()] as f64 / longest as f64)
}
226
/// A one-line PDF converts to a non-trivial, ZIP-framed byte stream.
#[test]
fn convert_simple_text_pdf() {
    let pdf = make_test_pdf(b"BT /F1 12 Tf (Hello World) Tj ET");
    let bytes = pdf_to_docx(&pdf).unwrap();

    assert!(bytes.len() > 100);
    // The output must open with the "PK" signature of a ZIP archive.
    assert_eq!(&bytes[..2], b"PK");
}
234
/// A PDF with two text lines (separated by `T*`) converts without error.
#[test]
fn convert_multiline_pdf() {
    let pdf = make_test_pdf(b"BT /F1 12 Tf 12 TL (Line 1) Tj T* (Line 2) Tj ET");
    let bytes = pdf_to_docx(&pdf).unwrap();

    assert!(bytes.len() > 100);
}
241
/// A page with an empty content stream still yields a DOCX container.
#[test]
fn convert_empty_pdf() {
    let pdf = make_test_pdf(b"");
    let bytes = pdf_to_docx(&pdf).unwrap();

    assert!(bytes.len() > 100);
}
248
/// Round trip through serialized PDF bytes: save the test document, then
/// convert it via the byte-slice entry point.
#[test]
fn convert_from_bytes() {
    let mut pdf = make_test_pdf(b"BT /F1 12 Tf (Test) Tj ET");

    let mut serialized = Vec::new();
    pdf.save_to(&mut serialized).unwrap();

    let bytes = convert_pdf_bytes_to_docx(&serialized).unwrap();
    assert!(bytes.len() > 100);
}
258
/// The generated archive contains every part this test expects a DOCX to have.
#[test]
fn docx_structure_has_required_files() {
    let pdf = make_test_pdf(b"BT /F1 12 Tf (Structure test) Tj ET");
    let names = zip_file_names(&pdf_to_docx(&pdf).unwrap());

    for required in [
        "[Content_Types].xml",
        "_rels/.rels",
        "word/document.xml",
        "word/styles.xml",
        "word/_rels/document.xml.rels",
    ] {
        assert!(names.contains(&required.to_string()));
    }
}
271
/// `word/document.xml` can be read to the end by the XML reader without error.
#[test]
fn docx_document_xml_parseable() {
    use quick_xml::events::Event;

    let pdf = make_test_pdf(b"BT /F1 12 Tf (XML parse test) Tj ET");
    let bytes = pdf_to_docx(&pdf).unwrap();
    let xml = read_zip_entry(&bytes, "word/document.xml").unwrap();

    let mut reader = quick_xml::Reader::from_str(&xml);
    let mut scratch = Vec::new();
    loop {
        match reader.read_event_into(&mut scratch) {
            Err(e) => panic!("Invalid XML in document.xml: {e}"),
            Ok(Event::Eof) => break,
            Ok(_) => {}
        }
        // Cleared after the match so the event's borrow of the buffer has ended.
        scratch.clear();
    }
}
291
/// `word/styles.xml` can be read to the end by the XML reader without error.
#[test]
fn docx_styles_xml_parseable() {
    use quick_xml::events::Event;

    let pdf = make_test_pdf(b"BT /F1 12 Tf (Styles test) Tj ET");
    let bytes = pdf_to_docx(&pdf).unwrap();
    let xml = read_zip_entry(&bytes, "word/styles.xml").unwrap();

    let mut reader = quick_xml::Reader::from_str(&xml);
    let mut scratch = Vec::new();
    loop {
        match reader.read_event_into(&mut scratch) {
            Err(e) => panic!("Invalid XML in styles.xml: {e}"),
            Ok(Event::Eof) => break,
            Ok(_) => {}
        }
        // Cleared after the match so the event's borrow of the buffer has ended.
        scratch.clear();
    }
}
309
/// The text shown by the PDF appears verbatim in `word/document.xml`.
#[test]
fn docx_text_preserved() {
    let pdf = make_test_pdf(b"BT /F1 12 Tf (Hello World) Tj ET");
    let xml = read_zip_entry(&pdf_to_docx(&pdf).unwrap(), "word/document.xml").unwrap();

    let found = xml.contains("Hello World");
    assert!(
        found,
        "Expected 'Hello World' in document.xml, got: {xml}"
    );
}
321
/// Both lines of a two-line PDF appear in `word/document.xml`.
#[test]
fn docx_multiline_text_preserved() {
    let pdf = make_test_pdf(b"BT /F1 12 Tf 12 TL (First line) Tj T* (Second line) Tj ET");
    let xml = read_zip_entry(&pdf_to_docx(&pdf).unwrap(), "word/document.xml").unwrap();

    for expected in ["First line", "Second line"] {
        assert!(xml.contains(expected));
    }
}
331
/// Text positioned as a 2x2 grid (two columns at x=72/200, two rows at
/// y=700/684) still shows up in `word/document.xml`.
#[test]
fn docx_table_content_in_xml() {
    let grid = b"BT /F1 12 Tf 1 0 0 1 72 700 Tm (Name) Tj 1 0 0 1 200 700 Tm (Age) Tj 1 0 0 1 72 684 Tm (Alice) Tj 1 0 0 1 200 684 Tm (30) Tj ET";
    let pdf = make_test_pdf(grid);
    let xml = read_zip_entry(&pdf_to_docx(&pdf).unwrap(), "word/document.xml").unwrap();

    // Only the first cell of each row is checked.
    for expected in ["Name", "Alice"] {
        assert!(xml.contains(expected));
    }
}
343
/// The text carried by the DOCX (`w:t` elements) stays close to what the PDF
/// text extractor reports for the same document, and contains the input text.
#[test]
fn docx_text_similarity_above_threshold() {
    use quick_xml::events::Event;

    let input_text = "Hello World";
    let pdf = make_test_pdf(b"BT /F1 12 Tf (Hello World) Tj ET");

    // Reference side: block texts from the extractor, joined by single spaces.
    let blocks = pdf_extract::extract_text(&pdf);
    let mut block_texts: Vec<&str> = Vec::new();
    for block in blocks.iter() {
        block_texts.push(block.text.as_str());
    }
    let pdf_text: String = block_texts.join(" ");

    // DOCX side: gather the text of every `w:t` element in document order.
    let bytes = pdf_to_docx(&pdf).unwrap();
    let xml = read_zip_entry(&bytes, "word/document.xml").unwrap();

    let mut reader = quick_xml::Reader::from_str(&xml);
    let mut scratch = Vec::new();
    let mut runs: Vec<String> = Vec::new();
    let mut inside_text_run = false;
    loop {
        match reader.read_event_into(&mut scratch) {
            Err(e) => panic!("XML parse error: {e}"),
            Ok(Event::Eof) => break,
            Ok(Event::Start(tag)) => {
                inside_text_run = tag.name().as_ref() == b"w:t";
            }
            Ok(Event::End(_)) => {
                inside_text_run = false;
            }
            Ok(Event::Text(text)) if inside_text_run => {
                runs.push(text.unescape().unwrap().to_string());
            }
            _ => {}
        }
        // Cleared after the match so the event's borrow of the buffer has ended.
        scratch.clear();
    }
    let docx_text = runs.join(" ");

    // The similarity check only runs when the reference text is at least 5 bytes.
    if pdf_text.len() >= 5 {
        let similarity = levenshtein_similarity(&pdf_text, &docx_text);
        assert!(
            similarity >= 0.80,
            "Text similarity {similarity:.2} below 0.80 threshold.\n PDF: '{pdf_text}'\n DOCX: '{docx_text}'"
        );
    }

    assert!(
        docx_text.contains(input_text),
        "Expected '{input_text}' in DOCX text: '{docx_text}'"
    );
}
399
/// `[Content_Types].xml` declares content types and mentions WordprocessingML.
#[test]
fn docx_content_types_valid() {
    let pdf = make_test_pdf(b"BT /F1 12 Tf (Content types test) Tj ET");
    let xml = read_zip_entry(&pdf_to_docx(&pdf).unwrap(), "[Content_Types].xml").unwrap();

    for marker in ["ContentType", "wordprocessingml"] {
        assert!(xml.contains(marker));
    }
}
409}