Skip to main content

rusty_pdfgrep/
pdf.rs

//! PDF reading via `lopdf` (FR-024..FR-026, AD-004, AD-012, HINT-002/003).
//!
//! Per-page text extraction wraps each call in `std::panic::catch_unwind`,
//! because `lopdf` has known panic paths on malformed CID font dictionaries.
//! A panic on page N is surfaced to the caller as an `Err`, so the caller can
//! emit a stderr warning and advance to page N+1.
6
7use std::panic::AssertUnwindSafe;
8use std::path::Path;
9
10use crate::error::PdfGrepError;
11
12/// Loaded PDF handle ready for page-by-page extraction.
13#[derive(Debug)]
14pub struct PdfDocument {
15    doc: lopdf::Document,
16    page_numbers: Vec<u32>,
17}
18
19impl PdfDocument {
20    /// Load a PDF from a path, attempting each `passwords` entry in order if
21    /// the file is encrypted. Returns `PdfGrepError::Encrypted` when no
22    /// password succeeds.
23    ///
24    /// # Errors
25    ///
26    /// - `PdfGrepError::Io` on file-open failure.
27    /// - `PdfGrepError::Pdf` on `lopdf` parser failure.
28    /// - `PdfGrepError::Encrypted` on encrypted PDF with no successful password.
29    pub fn open(path: &Path, passwords: &[String]) -> Result<Self, PdfGrepError> {
30        let mut doc = lopdf::Document::load(path).map_err(|e| match e {
31            lopdf::Error::IO(io) => PdfGrepError::Io {
32                path: path.to_path_buf(),
33                source: io,
34            },
35            other => PdfGrepError::Pdf {
36                path: path.to_path_buf(),
37                message: other.to_string(),
38            },
39        })?;
40
41        if doc.is_encrypted() {
42            let mut authenticated = false;
43            // Always try empty password first (matches upstream's silent
44            // auto-decrypt of empty-string-encrypted PDFs).
45            if doc.decrypt("").is_ok() {
46                authenticated = true;
47            } else {
48                for pwd in passwords {
49                    if doc.decrypt(pwd).is_ok() {
50                        authenticated = true;
51                        break;
52                    }
53                }
54            }
55            if !authenticated {
56                return Err(PdfGrepError::Encrypted {
57                    path: path.to_path_buf(),
58                });
59            }
60        }
61
62        let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
63        let mut page_numbers = page_numbers;
64        page_numbers.sort_unstable();
65
66        Ok(PdfDocument { doc, page_numbers })
67    }
68
69    /// Load a PDF from bytes (for stdin handling).
70    ///
71    /// # Errors
72    ///
73    /// - `PdfGrepError::Pdf` on parser failure.
74    /// - `PdfGrepError::Encrypted` if encrypted and no password works.
75    pub fn from_bytes(bytes: &[u8], passwords: &[String]) -> Result<Self, PdfGrepError> {
76        let virtual_path = Path::new("<stdin>");
77        let mut doc = lopdf::Document::load_mem(bytes).map_err(|e| PdfGrepError::Pdf {
78            path: virtual_path.to_path_buf(),
79            message: e.to_string(),
80        })?;
81
82        if doc.is_encrypted() {
83            let mut authenticated = false;
84            if doc.decrypt("").is_ok() {
85                authenticated = true;
86            } else {
87                for pwd in passwords {
88                    if doc.decrypt(pwd).is_ok() {
89                        authenticated = true;
90                        break;
91                    }
92                }
93            }
94            if !authenticated {
95                return Err(PdfGrepError::Encrypted {
96                    path: virtual_path.to_path_buf(),
97                });
98            }
99        }
100
101        let mut page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
102        page_numbers.sort_unstable();
103        Ok(PdfDocument { doc, page_numbers })
104    }
105
106    /// Sorted list of 1-indexed page numbers in this PDF.
107    #[must_use]
108    pub fn page_numbers(&self) -> &[u32] {
109        &self.page_numbers
110    }
111
112    /// Extract text from a single page, catching `lopdf` parser panics.
113    /// Returns `Ok(text)` on success, `Err(message)` on parser error or panic
114    /// — the caller decides whether to warn or propagate.
115    pub fn extract_page(&self, page_num: u32) -> Result<String, String> {
116        let doc = &self.doc;
117        let result = std::panic::catch_unwind(AssertUnwindSafe(|| doc.extract_text(&[page_num])));
118        match result {
119            Ok(Ok(text)) => Ok(text),
120            Ok(Err(e)) => Err(format!("page {page_num}: {e}")),
121            Err(_) => Err(format!("page {page_num}: parser panic")),
122        }
123    }
124}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Opening a path that does not exist must fail with a load error
    /// (either the I/O variant or the parser variant is accepted).
    #[test]
    fn non_existent_path_returns_io_error() {
        let missing = Path::new("/definitely/not/a/real/pdf.pdf");
        let outcome = PdfDocument::open(missing, &[]);
        assert!(matches!(
            outcome,
            Err(PdfGrepError::Io { .. } | PdfGrepError::Pdf { .. })
        ));
    }

    /// Bytes that are not a PDF must be rejected with the parser variant.
    #[test]
    fn invalid_bytes_return_pdf_error() {
        let garbage: &[u8] = b"NOT A PDF";
        let outcome = PdfDocument::from_bytes(garbage, &[]);
        assert!(matches!(outcome, Err(PdfGrepError::Pdf { .. })));
    }
}