Skip to main content

pdfplumber_core/
error.rs

1//! Error and warning types for pdfplumber-rs.
2//!
3//! Provides [`PdfError`] for fatal errors that stop processing,
4//! [`ExtractWarning`] for non-fatal issues that allow best-effort continuation,
5//! [`ExtractResult`] for pairing a value with collected warnings, and
6//! [`ExtractOptions`] for configuring resource limits and warning behavior.
7
8use std::fmt;
9
10use crate::unicode_norm::UnicodeNorm;
11
/// Fatal error types for PDF processing.
///
/// These errors indicate conditions that prevent further processing
/// of the PDF or current operation.
///
/// Every payload is an owned `String`, so values can be cloned and compared
/// freely; equality is total (`Eq`), which lets the type be used wherever an
/// `Eq` bound is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PdfError {
    /// Error parsing PDF structure or syntax.
    ParseError(String),
    /// I/O error reading PDF data.
    IoError(String),
    /// Error resolving font or encoding information.
    FontError(String),
    /// Error during content stream interpretation.
    InterpreterError(String),
    /// A configured resource limit was exceeded.
    ResourceLimitExceeded(String),
    /// The PDF is encrypted and requires a password to open.
    PasswordRequired,
    /// The supplied password is incorrect for this encrypted PDF.
    InvalidPassword,
    /// Any other error not covered by specific variants.
    Other(String),
}
35
36impl fmt::Display for PdfError {
37    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
38        match self {
39            PdfError::ParseError(msg) => write!(f, "parse error: {msg}"),
40            PdfError::IoError(msg) => write!(f, "I/O error: {msg}"),
41            PdfError::FontError(msg) => write!(f, "font error: {msg}"),
42            PdfError::InterpreterError(msg) => write!(f, "interpreter error: {msg}"),
43            PdfError::ResourceLimitExceeded(msg) => write!(f, "resource limit exceeded: {msg}"),
44            PdfError::PasswordRequired => write!(f, "PDF is encrypted and requires a password"),
45            PdfError::InvalidPassword => write!(f, "the supplied password is incorrect"),
46            PdfError::Other(msg) => write!(f, "{msg}"),
47        }
48    }
49}
50
// Marker impl: the trait's default methods are kept as-is. No `source()`
// override is provided because every variant stores only an owned message
// string rather than a wrapped underlying error value.
impl std::error::Error for PdfError {}
52
53impl From<std::io::Error> for PdfError {
54    fn from(err: std::io::Error) -> Self {
55        PdfError::IoError(err.to_string())
56    }
57}
58
/// A non-fatal warning encountered during extraction.
///
/// Warnings allow best-effort continuation when issues are encountered
/// (e.g., missing font metrics, unknown operators). They include a
/// description and optional source location context such as page number,
/// operator index, and font name.
///
/// All fields are plain owned data (`String`, `Option<usize>`,
/// `Option<String>`), so equality is total and the type implements `Eq`
/// in addition to `PartialEq`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractWarning {
    /// Human-readable description of the warning.
    pub description: String,
    /// Page number where the warning occurred (0-indexed), if applicable.
    pub page: Option<usize>,
    /// Element context (e.g., "char at offset 42").
    pub element: Option<String>,
    /// Index of the operator in the content stream where the warning occurred.
    pub operator_index: Option<usize>,
    /// Font name associated with the warning, if applicable.
    pub font_name: Option<String>,
}
78
79impl ExtractWarning {
80    /// Create a warning with just a description.
81    pub fn new(description: impl Into<String>) -> Self {
82        Self {
83            description: description.into(),
84            page: None,
85            element: None,
86            operator_index: None,
87            font_name: None,
88        }
89    }
90
91    /// Create a warning with page context.
92    pub fn on_page(description: impl Into<String>, page: usize) -> Self {
93        Self {
94            description: description.into(),
95            page: Some(page),
96            element: None,
97            operator_index: None,
98            font_name: None,
99        }
100    }
101
102    /// Create a warning with full source context.
103    pub fn with_context(
104        description: impl Into<String>,
105        page: usize,
106        element: impl Into<String>,
107    ) -> Self {
108        Self {
109            description: description.into(),
110            page: Some(page),
111            element: Some(element.into()),
112            operator_index: None,
113            font_name: None,
114        }
115    }
116
117    /// Create a warning with operator and font context.
118    ///
119    /// Includes the operator index in the content stream and the font name,
120    /// useful for diagnosing font-related issues during text extraction.
121    pub fn with_operator_context(
122        description: impl Into<String>,
123        operator_index: usize,
124        font_name: impl Into<String>,
125    ) -> Self {
126        Self {
127            description: description.into(),
128            page: None,
129            element: None,
130            operator_index: Some(operator_index),
131            font_name: Some(font_name.into()),
132        }
133    }
134}
135
136impl fmt::Display for ExtractWarning {
137    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
138        write!(f, "{}", self.description)?;
139        if let Some(page) = self.page {
140            write!(f, " (page {page})")?;
141        }
142        if let Some(ref font_name) = self.font_name {
143            write!(f, " [font {font_name}]")?;
144        }
145        if let Some(index) = self.operator_index {
146            write!(f, " [operator #{index}]")?;
147        }
148        if let Some(ref element) = self.element {
149            write!(f, " [{element}]")?;
150        }
151        Ok(())
152    }
153}
154
/// Result wrapper that pairs a value with collected warnings.
///
/// Used when extraction can partially succeed with non-fatal issues.
/// Both fields are public, so callers may read the value and inspect or
/// append to the warning list directly.
#[derive(Debug, Clone)]
pub struct ExtractResult<T> {
    /// The extracted value.
    pub value: T,
    /// Warnings collected during extraction (empty when nothing was reported).
    pub warnings: Vec<ExtractWarning>,
}
165
166impl<T> ExtractResult<T> {
167    /// Create a result with no warnings.
168    pub fn ok(value: T) -> Self {
169        Self {
170            value,
171            warnings: Vec::new(),
172        }
173    }
174
175    /// Create a result with warnings.
176    pub fn with_warnings(value: T, warnings: Vec<ExtractWarning>) -> Self {
177        Self { value, warnings }
178    }
179
180    /// Returns true if there are no warnings.
181    pub fn is_clean(&self) -> bool {
182        self.warnings.is_empty()
183    }
184
185    /// Transform the value while preserving warnings.
186    pub fn map<U>(self, f: impl FnOnce(T) -> U) -> ExtractResult<U> {
187        ExtractResult {
188            value: f(self.value),
189            warnings: self.warnings,
190        }
191    }
192}
193
/// Options controlling extraction behavior and resource limits.
///
/// Provides sensible defaults for all settings. Resource limits prevent
/// pathological PDFs from consuming excessive memory or causing infinite loops.
#[derive(Debug, Clone)]
pub struct ExtractOptions {
    /// Maximum recursion depth for nested Form XObjects (default: 10).
    pub max_recursion_depth: usize,
    /// Maximum number of objects extracted per page (default: 100,000).
    pub max_objects_per_page: usize,
    /// Maximum content stream bytes to process
    /// (default: 100 MiB, i.e. `100 * 1024 * 1024` bytes).
    pub max_stream_bytes: usize,
    /// Whether to collect warnings during extraction (default: true).
    pub collect_warnings: bool,
    /// Unicode normalization form to apply to extracted character text
    /// (default: `UnicodeNorm::None`).
    pub unicode_norm: UnicodeNorm,
}
211
212impl Default for ExtractOptions {
213    fn default() -> Self {
214        Self {
215            max_recursion_depth: 10,
216            max_objects_per_page: 100_000,
217            max_stream_bytes: 100 * 1024 * 1024,
218            collect_warnings: true,
219            unicode_norm: UnicodeNorm::None,
220        }
221    }
222}
223
#[cfg(test)]
mod tests {
    use super::*;
    use crate::unicode_norm::UnicodeNorm;

    // ---- PdfError: Display output per variant ----

    #[test]
    fn pdf_error_parse_error_creation() {
        let rendered = PdfError::ParseError(String::from("invalid xref")).to_string();
        assert_eq!(rendered, "parse error: invalid xref");
    }

    #[test]
    fn pdf_error_io_error_creation() {
        let rendered = PdfError::IoError(String::from("file not found")).to_string();
        assert_eq!(rendered, "I/O error: file not found");
    }

    #[test]
    fn pdf_error_font_error_creation() {
        let rendered = PdfError::FontError(String::from("missing glyph widths")).to_string();
        assert_eq!(rendered, "font error: missing glyph widths");
    }

    #[test]
    fn pdf_error_interpreter_error_creation() {
        let rendered = PdfError::InterpreterError(String::from("unknown operator")).to_string();
        assert_eq!(rendered, "interpreter error: unknown operator");
    }

    #[test]
    fn pdf_error_resource_limit_exceeded() {
        let rendered = PdfError::ResourceLimitExceeded(String::from("too many objects")).to_string();
        assert_eq!(rendered, "resource limit exceeded: too many objects");
    }

    #[test]
    fn pdf_error_password_required() {
        let rendered = PdfError::PasswordRequired.to_string();
        assert_eq!(rendered, "PDF is encrypted and requires a password");
    }

    #[test]
    fn pdf_error_invalid_password() {
        let rendered = PdfError::InvalidPassword.to_string();
        assert_eq!(rendered, "the supplied password is incorrect");
    }

    // ---- PdfError: Clone / PartialEq / Error / From ----

    #[test]
    fn pdf_error_password_required_clone_and_eq() {
        let original = PdfError::PasswordRequired;
        let copy = original.clone();
        assert_eq!(original, copy);
    }

    #[test]
    fn pdf_error_invalid_password_clone_and_eq() {
        let original = PdfError::InvalidPassword;
        let copy = original.clone();
        assert_eq!(original, copy);
    }

    #[test]
    fn pdf_error_other() {
        let rendered = PdfError::Other(String::from("something went wrong")).to_string();
        assert_eq!(rendered, "something went wrong");
    }

    #[test]
    fn pdf_error_implements_std_error() {
        // Coercing into a trait object proves the `std::error::Error` impl exists.
        let boxed: Box<dyn std::error::Error> =
            Box::new(PdfError::ParseError(String::from("test")));
        assert_eq!(boxed.to_string(), "parse error: test");
    }

    #[test]
    fn pdf_error_clone_and_eq() {
        let original = PdfError::ParseError(String::from("test"));
        let copy = original.clone();
        assert_eq!(original, copy);
    }

    #[test]
    fn pdf_error_from_io_error() {
        let source = std::io::Error::new(std::io::ErrorKind::NotFound, "missing file");
        let converted = PdfError::from(source);
        assert!(matches!(converted, PdfError::IoError(_)));
        assert!(converted.to_string().contains("missing file"));
    }

    // ---- ExtractWarning ----

    #[test]
    fn warning_new_with_description_only() {
        let warning = ExtractWarning::new("missing font metrics");
        assert_eq!(warning.description, "missing font metrics");
        assert!(warning.page.is_none());
        assert!(warning.element.is_none());
        assert!(warning.operator_index.is_none());
        assert!(warning.font_name.is_none());
        assert_eq!(warning.to_string(), "missing font metrics");
    }

    #[test]
    fn warning_on_page() {
        let warning = ExtractWarning::on_page("unknown operator", 3);
        assert_eq!(warning.description, "unknown operator");
        assert_eq!(warning.page, Some(3));
        assert!(warning.element.is_none());
        assert!(warning.operator_index.is_none());
        assert!(warning.font_name.is_none());
        assert_eq!(warning.to_string(), "unknown operator (page 3)");
    }

    #[test]
    fn warning_with_full_context() {
        let warning = ExtractWarning::with_context("missing width", 1, "char at offset 42");
        assert_eq!(warning.description, "missing width");
        assert_eq!(warning.page, Some(1));
        assert_eq!(warning.element.as_deref(), Some("char at offset 42"));
        assert!(warning.operator_index.is_none());
        assert!(warning.font_name.is_none());
        assert_eq!(
            warning.to_string(),
            "missing width (page 1) [char at offset 42]"
        );
    }

    #[test]
    fn warning_with_operator_context() {
        let warning =
            ExtractWarning::with_operator_context("font not found in resources", 5, "Helvetica");
        assert_eq!(warning.description, "font not found in resources");
        assert!(warning.page.is_none());
        assert!(warning.element.is_none());
        assert_eq!(warning.operator_index, Some(5));
        assert_eq!(warning.font_name.as_deref(), Some("Helvetica"));
        assert_eq!(
            warning.to_string(),
            "font not found in resources [font Helvetica] [operator #5]"
        );
    }

    #[test]
    fn warning_display_with_all_fields() {
        let warning = ExtractWarning {
            description: String::from("test warning"),
            page: Some(2),
            element: Some(String::from("extra context")),
            operator_index: Some(10),
            font_name: Some(String::from("Arial")),
        };
        let rendered = warning.to_string();
        assert_eq!(
            rendered,
            "test warning (page 2) [font Arial] [operator #10] [extra context]"
        );
    }

    #[test]
    fn warning_clone_and_eq() {
        let original = ExtractWarning::on_page("test warning", 0);
        let copy = original.clone();
        assert_eq!(original, copy);
    }

    #[test]
    fn warning_with_operator_context_clone_and_eq() {
        let original = ExtractWarning::with_operator_context("test", 3, "Times");
        let copy = original.clone();
        assert_eq!(original, copy);
    }

    // ---- ExtractResult ----

    #[test]
    fn extract_result_ok_no_warnings() {
        let outcome = ExtractResult::ok(42);
        assert_eq!(outcome.value, 42);
        assert!(outcome.warnings.is_empty());
        assert!(outcome.is_clean());
    }

    #[test]
    fn extract_result_with_warnings() {
        let collected = vec![
            ExtractWarning::new("warn 1"),
            ExtractWarning::on_page("warn 2", 0),
        ];
        let outcome = ExtractResult::with_warnings("hello", collected);
        assert_eq!(outcome.value, "hello");
        assert_eq!(outcome.warnings.len(), 2);
        assert!(!outcome.is_clean());
    }

    #[test]
    fn extract_result_map_preserves_warnings() {
        let outcome = ExtractResult::with_warnings(10, vec![ExtractWarning::new("test")]);
        let doubled = outcome.map(|n| n * 2);
        assert_eq!(doubled.value, 20);
        assert_eq!(doubled.warnings.len(), 1);
        assert_eq!(doubled.warnings[0].description, "test");
    }

    #[test]
    fn extract_result_collect_multiple_warnings() {
        let mut outcome = ExtractResult::ok(Vec::<String>::new());
        outcome.warnings.push(ExtractWarning::new("first"));
        outcome.warnings.push(ExtractWarning::on_page("second", 1));
        outcome
            .warnings
            .push(ExtractWarning::with_context("third", 2, "char 'A'"));
        assert_eq!(outcome.warnings.len(), 3);
    }

    // ---- ExtractOptions ----

    #[test]
    fn extract_options_default_values() {
        let defaults = ExtractOptions::default();
        assert_eq!(defaults.max_recursion_depth, 10);
        assert_eq!(defaults.max_objects_per_page, 100_000);
        assert_eq!(defaults.max_stream_bytes, 100 * 1024 * 1024);
        assert!(defaults.collect_warnings);
        assert_eq!(defaults.unicode_norm, UnicodeNorm::None);
    }

    #[test]
    fn extract_options_custom_values() {
        let custom = ExtractOptions {
            max_recursion_depth: 5,
            max_objects_per_page: 50_000,
            max_stream_bytes: 10 * 1024 * 1024,
            collect_warnings: false,
            unicode_norm: UnicodeNorm::None,
        };
        assert_eq!(custom.max_recursion_depth, 5);
        assert_eq!(custom.max_objects_per_page, 50_000);
        assert_eq!(custom.max_stream_bytes, 10 * 1024 * 1024);
        assert!(!custom.collect_warnings);
    }

    #[test]
    fn extract_options_clone() {
        let source = ExtractOptions::default();
        let copy = source.clone();
        assert_eq!(copy.max_recursion_depth, source.max_recursion_depth);
        assert_eq!(copy.collect_warnings, source.collect_warnings);
    }
}