Skip to main content

pdfmuse_core/
lib.rs

//! pdfmuse-core — deterministic PDF/DOCX parser core.
//!
//! The naive `parse()` lands in PER-33, and the self-written content-stream
//! interpreter (the real value of this crate) lands in PER-36. The unified IR —
//! the data foundation that every binding serializes byte-identically — lives
//! in [`ir`].
6
7pub mod backend;
8mod docx;
9pub mod error;
10pub mod ir;
11mod layout;
12mod output;
13mod pdf;
14mod profile;
15
16pub use error::{PdfmuseError, Result};
17pub use output::{chunk, to_json, to_markdown, to_text, Chunk};
18
/// Input container format accepted by [`parse`] and [`parse_with_password`].
///
/// Passing `Some(_)` forces that format; passing `None` lets the parser sniff
/// the format from the leading bytes of the input.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Format {
    /// A PDF document.
    Pdf,
    /// A DOCX document (OOXML inside a ZIP container).
    Docx,
}
25
26/// Parse `data` into the unified [`ir::Document`].
27///
28/// `fmt` forces a format; `None` auto-detects from magic bytes. The core makes no
29/// I/O assumptions — it only borrows `&[u8]`, so each binding feeds it bytes
30/// however it likes (Python `bytes`, Node `Buffer`, WASM `Uint8Array`).
31///
32/// M0 uses lopdf's naive text extraction (one paragraph per page, no per-char
33/// coordinates). PER-36 replaces the PDF path with the self-written content-stream
34/// interpreter that fills [`ir::Page::chars`] with precise bboxes.
35pub fn parse(data: &[u8], fmt: Option<Format>) -> Result<ir::Document> {
36    parse_with_password(data, fmt, None)
37}
38
39/// Like [`parse`], but supplies a `password` for encrypted PDFs.
40///
41/// An encrypted document with no/incorrect password fails with
42/// [`PdfmuseError::EncryptedNoPassword`]. The password is never logged or echoed.
43pub fn parse_with_password(
44    data: &[u8],
45    fmt: Option<Format>,
46    password: Option<&str>,
47) -> Result<ir::Document> {
48    match fmt.or_else(|| detect_format(data)) {
49        Some(Format::Pdf) => {
50            let mut doc = pdf::parse_pdf(data, password)?;
51            let prof = profile::enabled();
52            // Geometric layout: chars → lines → paragraphs (reading order).
53            let tl = profile::start(prof);
54            layout_pages(&mut doc);
55            profile::log(&tl, "layout(lines+paragraphs+columns+tables)");
56            // Heading detection (font-size clustering + numbering) — DOCX gets its
57            // heading levels from Word styles instead.
58            let th = profile::start(prof);
59            layout::assign_headings(&mut doc);
60            // Mark (never drop) running headers/footers; callers opt in via
61            // `remove_boilerplate`.
62            layout::mark_boilerplate(&mut doc);
63            profile::log(&th, "headings+boilerplate");
64            profile::dump(prof); // layout substage breakdown
65            Ok(doc)
66        }
67        Some(Format::Docx) => docx::parse(data),
68        None => Err(PdfmuseError::InvalidFormat),
69    }
70}
71
72/// Strip running headers/footers (paragraphs marked
73/// [`ir::BlockRole::HeaderFooter`]) from `doc`, in place.
74///
75/// Opt-in: [`parse`] only *marks* boilerplate, so default output is unchanged.
76/// Call this before [`to_text`] / [`to_markdown`] / [`chunk`] to drop repeated page
77/// furniture (page numbers, running titles) from your RAG text. No-op on documents
78/// where nothing was marked (single/short docs, or no repetition).
79pub fn remove_boilerplate(doc: &mut ir::Document) {
80    for page in &mut doc.pages {
81        page.blocks
82            .retain(|b| !matches!(b, ir::Block::Paragraph(p) if p.role == Some(ir::BlockRole::HeaderFooter)));
83    }
84}
85
86/// Run geometric layout on every page. Parallel across cores with the `rayon`
87/// feature enabled, sequential otherwise — identical output either way.
88fn layout_pages(doc: &mut ir::Document) {
89    #[cfg(feature = "rayon")]
90    {
91        use rayon::prelude::*;
92        doc.pages.par_iter_mut().for_each(layout::layout_page);
93    }
94    #[cfg(not(feature = "rayon"))]
95    doc.pages.iter_mut().for_each(layout::layout_page);
96}
97
98/// Detect the container format from leading magic bytes.
99fn detect_format(data: &[u8]) -> Option<Format> {
100    if data.starts_with(b"PK\x03\x04") {
101        return Some(Format::Docx); // ZIP container → OOXML (DOCX)
102    }
103    // Some PDFs carry leading junk before `%PDF-`; scan the first 1 KiB.
104    let head = &data[..data.len().min(1024)];
105    if head.windows(5).any(|w| w == b"%PDF-") {
106        return Some(Format::Pdf);
107    }
108    None
109}
110
#[cfg(test)]
mod tests {
    use super::*;

    /// Magic-byte sniffing recognises PDF and ZIP/DOCX and rejects other input.
    #[test]
    fn detects_pdf_and_docx_magic() {
        let cases: [(&[u8], Option<Format>); 3] = [
            (b"%PDF-1.7\ntrailer", Some(Format::Pdf)),
            (b"PK\x03\x04rest", Some(Format::Docx)),
            (b"not a document", None),
        ];
        for &(bytes, expected) in cases.iter() {
            assert_eq!(detect_format(bytes), expected);
        }
    }

    /// A bare ZIP magic is a truncated DOCX: it is recognised and routed to
    /// the DOCX parser, which then reports `Malformed` (no longer
    /// `Unsupported`, now that DOCX parsing is implemented).
    #[test]
    fn docx_magic_routes_to_docx_parser() {
        let err = parse(b"PK\x03\x04", None).unwrap_err();
        assert!(matches!(err, PdfmuseError::Malformed(_)));
    }

    /// Bytes with no recognised magic are rejected as `InvalidFormat`.
    #[test]
    fn unknown_bytes_are_invalid_format() {
        let err = parse(b"garbage", None).unwrap_err();
        assert!(matches!(err, PdfmuseError::InvalidFormat));
    }
}