1use std::panic::AssertUnwindSafe;
8use std::path::Path;
9
10use crate::error::PdfGrepError;
11
12#[derive(Debug)]
14pub struct PdfDocument {
15 doc: lopdf::Document,
16 page_numbers: Vec<u32>,
17}
18
19impl PdfDocument {
20 pub fn open(path: &Path, passwords: &[String]) -> Result<Self, PdfGrepError> {
30 let mut doc = lopdf::Document::load(path).map_err(|e| match e {
31 lopdf::Error::IO(io) => PdfGrepError::Io {
32 path: path.to_path_buf(),
33 source: io,
34 },
35 other => PdfGrepError::Pdf {
36 path: path.to_path_buf(),
37 message: other.to_string(),
38 },
39 })?;
40
41 if doc.is_encrypted() {
42 let mut authenticated = false;
43 if doc.decrypt("").is_ok() {
46 authenticated = true;
47 } else {
48 for pwd in passwords {
49 if doc.decrypt(pwd).is_ok() {
50 authenticated = true;
51 break;
52 }
53 }
54 }
55 if !authenticated {
56 return Err(PdfGrepError::Encrypted {
57 path: path.to_path_buf(),
58 });
59 }
60 }
61
62 let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
63 let mut page_numbers = page_numbers;
64 page_numbers.sort_unstable();
65
66 Ok(PdfDocument { doc, page_numbers })
67 }
68
69 pub fn from_bytes(bytes: &[u8], passwords: &[String]) -> Result<Self, PdfGrepError> {
76 let virtual_path = Path::new("<stdin>");
77 let mut doc = lopdf::Document::load_mem(bytes).map_err(|e| PdfGrepError::Pdf {
78 path: virtual_path.to_path_buf(),
79 message: e.to_string(),
80 })?;
81
82 if doc.is_encrypted() {
83 let mut authenticated = false;
84 if doc.decrypt("").is_ok() {
85 authenticated = true;
86 } else {
87 for pwd in passwords {
88 if doc.decrypt(pwd).is_ok() {
89 authenticated = true;
90 break;
91 }
92 }
93 }
94 if !authenticated {
95 return Err(PdfGrepError::Encrypted {
96 path: virtual_path.to_path_buf(),
97 });
98 }
99 }
100
101 let mut page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
102 page_numbers.sort_unstable();
103 Ok(PdfDocument { doc, page_numbers })
104 }
105
106 #[must_use]
108 pub fn page_numbers(&self) -> &[u32] {
109 &self.page_numbers
110 }
111
112 pub fn extract_page(&self, page_num: u32) -> Result<String, String> {
116 let doc = &self.doc;
117 let result = std::panic::catch_unwind(AssertUnwindSafe(|| doc.extract_text(&[page_num])));
118 match result {
119 Ok(Ok(text)) => Ok(text),
120 Ok(Err(e)) => Err(format!("page {page_num}: {e}")),
121 Err(_) => Err(format!("page {page_num}: parser panic")),
122 }
123 }
124}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Opening a path that does not exist must fail with a load error.
    #[test]
    fn non_existent_path_returns_io_error() {
        let missing = Path::new("/definitely/not/a/real/pdf.pdf");
        let no_passwords: &[String] = &[];

        let failure = PdfDocument::open(missing, no_passwords).unwrap_err();

        // Either variant is accepted: which one is produced depends on how
        // the loader classifies the failure.
        assert!(matches!(
            failure,
            PdfGrepError::Io { .. } | PdfGrepError::Pdf { .. }
        ));
    }

    /// Bytes that are not a PDF must be rejected as a parse error.
    #[test]
    fn invalid_bytes_return_pdf_error() {
        let garbage: &[u8] = b"NOT A PDF";

        let failure = PdfDocument::from_bytes(garbage, &[]).unwrap_err();

        assert!(matches!(failure, PdfGrepError::Pdf { .. }));
    }
}