1pub mod backend;
8mod docx;
9pub mod error;
10pub mod ir;
11mod layout;
12mod output;
13mod pdf;
14
15pub use error::{PdfmuseError, Result};
16pub use output::{chunk, to_json, to_markdown, to_text, Chunk};
17
/// Input document formats that [`parse`] and [`parse_with_password`] can
/// dispatch on.
///
/// Pass a variant explicitly to skip auto-detection, or pass `None` to have
/// the format sniffed from the leading bytes of the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// A PDF document, handled by the `pdf` module.
    Pdf,
    /// A DOCX document, handled by the `docx` module.
    Docx,
}
24
25pub fn parse(data: &[u8], fmt: Option<Format>) -> Result<ir::Document> {
35 parse_with_password(data, fmt, None)
36}
37
38pub fn parse_with_password(
43 data: &[u8],
44 fmt: Option<Format>,
45 password: Option<&str>,
46) -> Result<ir::Document> {
47 match fmt.or_else(|| detect_format(data)) {
48 Some(Format::Pdf) => {
49 let mut doc = pdf::parse_pdf(data, password)?;
50 layout_pages(&mut doc);
52 Ok(doc)
53 }
54 Some(Format::Docx) => docx::parse(data),
55 None => Err(PdfmuseError::InvalidFormat),
56 }
57}
58
59fn layout_pages(doc: &mut ir::Document) {
62 #[cfg(feature = "rayon")]
63 {
64 use rayon::prelude::*;
65 doc.pages.par_iter_mut().for_each(layout::layout_page);
66 }
67 #[cfg(not(feature = "rayon"))]
68 doc.pages.iter_mut().for_each(layout::layout_page);
69}
70
71fn detect_format(data: &[u8]) -> Option<Format> {
73 if data.starts_with(b"PK\x03\x04") {
74 return Some(Format::Docx); }
76 let head = &data[..data.len().min(1024)];
78 if head.windows(5).any(|w| w == b"%PDF-") {
79 return Some(Format::Pdf);
80 }
81 None
82}
83
#[cfg(test)]
mod tests {
    use super::*;

    /// Format sniffing recognises the PDF and DOCX signatures and yields
    /// `None` for bytes that carry neither.
    #[test]
    fn detects_pdf_and_docx_magic() {
        let cases = [
            (&b"%PDF-1.7\ntrailer"[..], Some(Format::Pdf)),
            (&b"PK\x03\x04rest"[..], Some(Format::Docx)),
            (&b"not a document"[..], None),
        ];

        for &(bytes, expected) in &cases {
            assert_eq!(detect_format(bytes), expected);
        }
    }

    /// Input consisting only of the DOCX signature is auto-detected as DOCX
    /// and is expected to be rejected with a `Malformed` error.
    #[test]
    fn docx_magic_routes_to_docx_parser() {
        let err = parse(b"PK\x03\x04", None).unwrap_err();
        assert!(matches!(err, PdfmuseError::Malformed(_)));
    }

    /// Bytes with no recognisable signature and no explicit format fail with
    /// `InvalidFormat`.
    #[test]
    fn unknown_bytes_are_invalid_format() {
        let err = parse(b"garbage", None).unwrap_err();
        assert!(matches!(err, PdfmuseError::InvalidFormat));
    }
}