pdfmuse_core/lib.rs
1//! pdfmuse-core — deterministic PDF/DOCX parser core.
2//!
3//! The naive `parse()` lands in PER-33 and the self-written content-stream
4//! interpreter (the real value) in PER-36. The unified IR — the data foundation
5//! that every binding serializes byte-identically — lives in [`ir`].
6
7pub mod backend;
8mod docx;
9pub mod error;
10pub mod ir;
11mod layout;
12mod output;
13mod pdf;
14
15pub use error::{PdfmuseError, Result};
16pub use output::{chunk, to_json, to_markdown, to_text, Chunk};
17
/// Container format of the input bytes; passed to [`parse`] as an explicit hint.
///
/// Supplying a variant skips magic-byte detection and forces that parser.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// A PDF document.
    Pdf,
    /// A DOCX document (OOXML inside a ZIP container).
    Docx,
}
24
25/// Parse `data` into the unified [`ir::Document`].
26///
27/// `fmt` forces a format; `None` auto-detects from magic bytes. The core makes no
28/// I/O assumptions — it only borrows `&[u8]`, so each binding feeds it bytes
29/// however it likes (Python `bytes`, Node `Buffer`, WASM `Uint8Array`).
30///
31/// M0 uses lopdf's naive text extraction (one paragraph per page, no per-char
32/// coordinates). PER-36 replaces the PDF path with the self-written content-stream
33/// interpreter that fills [`ir::Page::chars`] with precise bboxes.
34pub fn parse(data: &[u8], fmt: Option<Format>) -> Result<ir::Document> {
35 parse_with_password(data, fmt, None)
36}
37
38/// Like [`parse`], but supplies a `password` for encrypted PDFs.
39///
40/// An encrypted document with no/incorrect password fails with
41/// [`PdfmuseError::EncryptedNoPassword`]. The password is never logged or echoed.
42pub fn parse_with_password(
43 data: &[u8],
44 fmt: Option<Format>,
45 password: Option<&str>,
46) -> Result<ir::Document> {
47 match fmt.or_else(|| detect_format(data)) {
48 Some(Format::Pdf) => {
49 let mut doc = pdf::parse_pdf(data, password)?;
50 // Geometric layout: chars → lines → paragraphs (reading order).
51 layout_pages(&mut doc);
52 // Heading detection (font-size clustering + numbering) — DOCX gets its
53 // heading levels from Word styles instead.
54 layout::assign_headings(&mut doc);
55 // Mark (never drop) running headers/footers; callers opt in via
56 // `remove_boilerplate`.
57 layout::mark_boilerplate(&mut doc);
58 Ok(doc)
59 }
60 Some(Format::Docx) => docx::parse(data),
61 None => Err(PdfmuseError::InvalidFormat),
62 }
63}
64
65/// Strip running headers/footers (paragraphs marked
66/// [`ir::BlockRole::HeaderFooter`]) from `doc`, in place.
67///
68/// Opt-in: [`parse`] only *marks* boilerplate, so default output is unchanged.
69/// Call this before [`to_text`] / [`to_markdown`] / [`chunk`] to drop repeated page
70/// furniture (page numbers, running titles) from your RAG text. No-op on documents
71/// where nothing was marked (single/short docs, or no repetition).
72pub fn remove_boilerplate(doc: &mut ir::Document) {
73 for page in &mut doc.pages {
74 page.blocks
75 .retain(|b| !matches!(b, ir::Block::Paragraph(p) if p.role == Some(ir::BlockRole::HeaderFooter)));
76 }
77}
78
79/// Run geometric layout on every page. Parallel across cores with the `rayon`
80/// feature enabled, sequential otherwise — identical output either way.
81fn layout_pages(doc: &mut ir::Document) {
82 #[cfg(feature = "rayon")]
83 {
84 use rayon::prelude::*;
85 doc.pages.par_iter_mut().for_each(layout::layout_page);
86 }
87 #[cfg(not(feature = "rayon"))]
88 doc.pages.iter_mut().for_each(layout::layout_page);
89}
90
91/// Detect the container format from leading magic bytes.
92fn detect_format(data: &[u8]) -> Option<Format> {
93 if data.starts_with(b"PK\x03\x04") {
94 return Some(Format::Docx); // ZIP container → OOXML (DOCX)
95 }
96 // Some PDFs carry leading junk before `%PDF-`; scan the first 1 KiB.
97 let head = &data[..data.len().min(1024)];
98 if head.windows(5).any(|w| w == b"%PDF-") {
99 return Some(Format::Pdf);
100 }
101 None
102}
103
#[cfg(test)]
mod tests {
    use super::*;

    /// Magic-byte detection recognises PDF and ZIP/DOCX and rejects other input.
    #[test]
    fn detects_pdf_and_docx_magic() {
        let cases: [(&[u8], Option<Format>); 3] = [
            (b"%PDF-1.7\ntrailer", Some(Format::Pdf)),
            (b"PK\x03\x04rest", Some(Format::Docx)),
            (b"not a document", None),
        ];
        for &(bytes, expected) in cases.iter() {
            assert_eq!(detect_format(bytes), expected);
        }
    }

    /// A bare ZIP magic is a truncated DOCX: it is recognized and routed to the
    /// DOCX parser, which reports `Malformed` (no longer `Unsupported`, now that
    /// DOCX parsing is implemented).
    #[test]
    fn docx_magic_routes_to_docx_parser() {
        let truncated_docx: &[u8] = b"PK\x03\x04";
        let err = parse(truncated_docx, None).unwrap_err();
        assert!(matches!(err, PdfmuseError::Malformed(_)));
    }

    /// Bytes with no recognizable magic fail auto-detection with `InvalidFormat`.
    #[test]
    fn unknown_bytes_are_invalid_format() {
        let unknown: &[u8] = b"garbage";
        let err = parse(unknown, None).unwrap_err();
        assert!(matches!(err, PdfmuseError::InvalidFormat));
    }
}