// pdfplumber_core/error.rs

//! Error and warning types for pdfplumber-rs.
//!
//! Provides [`PdfError`] for fatal errors that stop processing,
//! [`ExtractWarning`] for non-fatal issues that allow best-effort continuation,
//! [`ExtractResult`] for pairing a value with collected warnings, and
//! [`ExtractOptions`] for configuring resource limits and warning behavior.

use std::fmt;

/// Fatal error types for PDF processing.
///
/// These errors indicate conditions that prevent further processing
/// of the PDF or current operation. Every variant carries a
/// human-readable message describing what went wrong.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PdfError {
    /// Error parsing PDF structure or syntax.
    ParseError(String),
    /// I/O error reading PDF data.
    IoError(String),
    /// Error resolving font or encoding information.
    FontError(String),
    /// Error during content stream interpretation.
    InterpreterError(String),
    /// A configured resource limit was exceeded.
    ResourceLimitExceeded(String),
    /// Any other error not covered by specific variants.
    Other(String),
}

impl fmt::Display for PdfError {
    /// Writes `"<category>: <message>"` for every variant except
    /// [`PdfError::Other`], whose message is written without a prefix.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::ParseError(msg) => write!(f, "parse error: {msg}"),
            Self::IoError(msg) => write!(f, "I/O error: {msg}"),
            Self::FontError(msg) => write!(f, "font error: {msg}"),
            Self::InterpreterError(msg) => write!(f, "interpreter error: {msg}"),
            Self::ResourceLimitExceeded(msg) => write!(f, "resource limit exceeded: {msg}"),
            Self::Other(msg) => write!(f, "{msg}"),
        }
    }
}

impl std::error::Error for PdfError {}

impl From<std::io::Error> for PdfError {
    /// Converts an I/O error into [`PdfError::IoError`].
    ///
    /// Only the error's display text is kept; the original
    /// `std::io::Error` value is not retained.
    fn from(err: std::io::Error) -> Self {
        Self::IoError(err.to_string())
    }
}

/// A non-fatal warning encountered during extraction.
///
/// Warnings allow best-effort continuation when issues are encountered
/// (e.g., missing font metrics, unknown operators). They include a
/// description and optional source location context such as page number,
/// operator index, and font name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractWarning {
    /// Human-readable description of the warning.
    pub description: String,
    /// Page number where the warning occurred (0-indexed), if applicable.
    pub page: Option<usize>,
    /// Element context (e.g., "char at offset 42").
    pub element: Option<String>,
    /// Index of the operator in the content stream where the warning occurred.
    pub operator_index: Option<usize>,
    /// Font name associated with the warning, if applicable.
    pub font_name: Option<String>,
}

impl ExtractWarning {
    /// Create a warning with just a description.
    ///
    /// All context fields (`page`, `element`, `operator_index`,
    /// `font_name`) are left as `None`.
    pub fn new(description: impl Into<String>) -> Self {
        Self {
            description: description.into(),
            page: None,
            element: None,
            operator_index: None,
            font_name: None,
        }
    }

    /// Create a warning with page context.
    ///
    /// Only `page` is set; the remaining context fields stay `None`.
    pub fn on_page(description: impl Into<String>, page: usize) -> Self {
        Self {
            page: Some(page),
            ..Self::new(description)
        }
    }

    /// Create a warning with page and element context.
    ///
    /// `operator_index` and `font_name` are left as `None`.
    pub fn with_context(
        description: impl Into<String>,
        page: usize,
        element: impl Into<String>,
    ) -> Self {
        // Build the base first so `description` is converted before `element`.
        let base = Self::new(description);
        Self {
            page: Some(page),
            element: Some(element.into()),
            ..base
        }
    }

    /// Create a warning with operator and font context.
    ///
    /// Includes the operator index in the content stream and the font name,
    /// useful for diagnosing font-related issues during text extraction.
    /// `page` and `element` are left as `None`.
    pub fn with_operator_context(
        description: impl Into<String>,
        operator_index: usize,
        font_name: impl Into<String>,
    ) -> Self {
        // Build the base first so `description` is converted before `font_name`.
        let base = Self::new(description);
        Self {
            operator_index: Some(operator_index),
            font_name: Some(font_name.into()),
            ..base
        }
    }
}

impl fmt::Display for ExtractWarning {
    /// Writes the description followed by each context field that is set,
    /// always in this order:
    /// `<description> (page N) [font NAME] [operator #I] [ELEMENT]`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.description)?;
        if let Some(page) = self.page {
            write!(f, " (page {page})")?;
        }
        if let Some(font_name) = &self.font_name {
            write!(f, " [font {font_name}]")?;
        }
        if let Some(index) = self.operator_index {
            write!(f, " [operator #{index}]")?;
        }
        if let Some(element) = &self.element {
            write!(f, " [{element}]")?;
        }
        Ok(())
    }
}

147/// Result wrapper that pairs a value with collected warnings.
148///
149/// Used when extraction can partially succeed with non-fatal issues.
150#[derive(Debug, Clone)]
151pub struct ExtractResult<T> {
152    /// The extracted value.
153    pub value: T,
154    /// Warnings collected during extraction.
155    pub warnings: Vec<ExtractWarning>,
156}
157
158impl<T> ExtractResult<T> {
159    /// Create a result with no warnings.
160    pub fn ok(value: T) -> Self {
161        Self {
162            value,
163            warnings: Vec::new(),
164        }
165    }
166
167    /// Create a result with warnings.
168    pub fn with_warnings(value: T, warnings: Vec<ExtractWarning>) -> Self {
169        Self { value, warnings }
170    }
171
172    /// Returns true if there are no warnings.
173    pub fn is_clean(&self) -> bool {
174        self.warnings.is_empty()
175    }
176
177    /// Transform the value while preserving warnings.
178    pub fn map<U>(self, f: impl FnOnce(T) -> U) -> ExtractResult<U> {
179        ExtractResult {
180            value: f(self.value),
181            warnings: self.warnings,
182        }
183    }
184}
185
/// Options controlling extraction behavior and resource limits.
///
/// Provides sensible defaults for all settings. Resource limits prevent
/// pathological PDFs from consuming excessive memory or causing infinite loops.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ExtractOptions {
    /// Maximum recursion depth for nested Form XObjects (default: 10).
    pub max_recursion_depth: usize,
    /// Maximum number of objects extracted per page (default: 100,000).
    pub max_objects_per_page: usize,
    /// Maximum content stream bytes to process
    /// (default: 100 MiB, i.e. 104,857,600 bytes).
    pub max_stream_bytes: usize,
    /// Whether to collect warnings during extraction (default: true).
    pub collect_warnings: bool,
}

impl Default for ExtractOptions {
    /// Returns the default limits documented on each field.
    fn default() -> Self {
        Self {
            max_recursion_depth: 10,
            max_objects_per_page: 100_000,
            max_stream_bytes: 100 * 1024 * 1024,
            collect_warnings: true,
        }
    }
}

// Unit tests for the error, warning, result, and options types in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // --- PdfError tests ---

    #[test]
    fn pdf_error_parse_error_creation() {
        let err = PdfError::ParseError("invalid xref".to_string());
        assert_eq!(err.to_string(), "parse error: invalid xref");
    }

    #[test]
    fn pdf_error_io_error_creation() {
        let err = PdfError::IoError("file not found".to_string());
        assert_eq!(err.to_string(), "I/O error: file not found");
    }

    #[test]
    fn pdf_error_font_error_creation() {
        let err = PdfError::FontError("missing glyph widths".to_string());
        assert_eq!(err.to_string(), "font error: missing glyph widths");
    }

    #[test]
    fn pdf_error_interpreter_error_creation() {
        let err = PdfError::InterpreterError("unknown operator".to_string());
        assert_eq!(err.to_string(), "interpreter error: unknown operator");
    }

    #[test]
    fn pdf_error_resource_limit_exceeded() {
        let err = PdfError::ResourceLimitExceeded("too many objects".to_string());
        assert_eq!(err.to_string(), "resource limit exceeded: too many objects");
    }

    #[test]
    fn pdf_error_other() {
        let err = PdfError::Other("something went wrong".to_string());
        assert_eq!(err.to_string(), "something went wrong");
    }

    #[test]
    fn pdf_error_implements_std_error() {
        let err: Box<dyn std::error::Error> = Box::new(PdfError::ParseError("test".to_string()));
        assert_eq!(err.to_string(), "parse error: test");
    }

    #[test]
    fn pdf_error_clone_and_eq() {
        let err1 = PdfError::ParseError("test".to_string());
        let err2 = err1.clone();
        assert_eq!(err1, err2);
    }

    #[test]
    fn pdf_error_from_io_error() {
        let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "missing file");
        let pdf_err: PdfError = io_err.into();
        assert!(matches!(pdf_err, PdfError::IoError(_)));
        assert!(pdf_err.to_string().contains("missing file"));
    }

    // --- ExtractWarning tests ---

    #[test]
    fn warning_new_with_description_only() {
        let w = ExtractWarning::new("missing font metrics");
        assert_eq!(w.description, "missing font metrics");
        assert_eq!(w.page, None);
        assert_eq!(w.element, None);
        assert_eq!(w.operator_index, None);
        assert_eq!(w.font_name, None);
        assert_eq!(w.to_string(), "missing font metrics");
    }

    #[test]
    fn warning_on_page() {
        let w = ExtractWarning::on_page("unknown operator", 3);
        assert_eq!(w.description, "unknown operator");
        assert_eq!(w.page, Some(3));
        assert_eq!(w.element, None);
        assert_eq!(w.operator_index, None);
        assert_eq!(w.font_name, None);
        assert_eq!(w.to_string(), "unknown operator (page 3)");
    }

    #[test]
    fn warning_with_full_context() {
        let w = ExtractWarning::with_context("missing width", 1, "char at offset 42");
        assert_eq!(w.description, "missing width");
        assert_eq!(w.page, Some(1));
        assert_eq!(w.element, Some("char at offset 42".to_string()));
        assert_eq!(w.operator_index, None);
        assert_eq!(w.font_name, None);
        assert_eq!(w.to_string(), "missing width (page 1) [char at offset 42]");
    }

    #[test]
    fn warning_with_operator_context() {
        let w =
            ExtractWarning::with_operator_context("font not found in resources", 5, "Helvetica");
        assert_eq!(w.description, "font not found in resources");
        assert_eq!(w.page, None);
        assert_eq!(w.element, None);
        assert_eq!(w.operator_index, Some(5));
        assert_eq!(w.font_name, Some("Helvetica".to_string()));
        assert_eq!(
            w.to_string(),
            "font not found in resources [font Helvetica] [operator #5]"
        );
    }

    #[test]
    fn warning_display_with_all_fields() {
        let w = ExtractWarning {
            description: "test warning".to_string(),
            page: Some(2),
            element: Some("extra context".to_string()),
            operator_index: Some(10),
            font_name: Some("Arial".to_string()),
        };
        assert_eq!(
            w.to_string(),
            "test warning (page 2) [font Arial] [operator #10] [extra context]"
        );
    }

    #[test]
    fn warning_clone_and_eq() {
        let w1 = ExtractWarning::on_page("test warning", 0);
        let w2 = w1.clone();
        assert_eq!(w1, w2);
    }

    #[test]
    fn warning_with_operator_context_clone_and_eq() {
        let w1 = ExtractWarning::with_operator_context("test", 3, "Times");
        let w2 = w1.clone();
        assert_eq!(w1, w2);
    }

    // --- ExtractResult tests ---

    #[test]
    fn extract_result_ok_no_warnings() {
        let result = ExtractResult::ok(42);
        assert_eq!(result.value, 42);
        assert!(result.warnings.is_empty());
        assert!(result.is_clean());
    }

    #[test]
    fn extract_result_with_warnings() {
        let warnings = vec![
            ExtractWarning::new("warn 1"),
            ExtractWarning::on_page("warn 2", 0),
        ];
        let result = ExtractResult::with_warnings("hello", warnings);
        assert_eq!(result.value, "hello");
        assert_eq!(result.warnings.len(), 2);
        assert!(!result.is_clean());
    }

    #[test]
    fn extract_result_map_preserves_warnings() {
        let warnings = vec![ExtractWarning::new("test")];
        let result = ExtractResult::with_warnings(10, warnings);
        let mapped = result.map(|v| v * 2);
        assert_eq!(mapped.value, 20);
        assert_eq!(mapped.warnings.len(), 1);
        assert_eq!(mapped.warnings[0].description, "test");
    }

    #[test]
    fn extract_result_collect_multiple_warnings() {
        let mut result = ExtractResult::ok(Vec::<String>::new());
        result.warnings.push(ExtractWarning::new("first"));
        result.warnings.push(ExtractWarning::on_page("second", 1));
        result
            .warnings
            .push(ExtractWarning::with_context("third", 2, "char 'A'"));
        assert_eq!(result.warnings.len(), 3);
    }

    // --- ExtractOptions tests ---

    #[test]
    fn extract_options_default_values() {
        let opts = ExtractOptions::default();
        assert_eq!(opts.max_recursion_depth, 10);
        assert_eq!(opts.max_objects_per_page, 100_000);
        assert_eq!(opts.max_stream_bytes, 100 * 1024 * 1024);
        assert!(opts.collect_warnings);
    }

    #[test]
    fn extract_options_custom_values() {
        let opts = ExtractOptions {
            max_recursion_depth: 5,
            max_objects_per_page: 50_000,
            max_stream_bytes: 10 * 1024 * 1024,
            collect_warnings: false,
        };
        assert_eq!(opts.max_recursion_depth, 5);
        assert_eq!(opts.max_objects_per_page, 50_000);
        assert_eq!(opts.max_stream_bytes, 10 * 1024 * 1024);
        assert!(!opts.collect_warnings);
    }

    #[test]
    fn extract_options_clone() {
        let opts1 = ExtractOptions::default();
        let opts2 = opts1.clone();
        assert_eq!(opts2.max_recursion_depth, opts1.max_recursion_depth);
        assert_eq!(opts2.collect_warnings, opts1.collect_warnings);
    }
}