Skip to main content

pdfplumber_parse/
backend.rs

//! PDF parsing backend trait.
//!
//! Defines the [`PdfBackend`] trait that abstracts PDF parsing operations.
//! This enables pluggable backends (e.g., lopdf, pdf-rs) for PDF reading.
5
6use pdfplumber_core::{BBox, ExtractOptions, PdfError};
7
8use crate::handler::ContentHandler;
9
10/// Trait abstracting PDF parsing operations.
11///
12/// A backend provides methods to open PDF documents, access pages,
13/// extract page properties (MediaBox, CropBox, Rotate), and interpret
14/// page content streams via a [`ContentHandler`] callback.
15///
16/// # Associated Types
17///
18/// - `Document`: The parsed PDF document representation.
19/// - `Page`: A reference to a single page within a document.
20/// - `Error`: Backend-specific error type, convertible to [`PdfError`].
21///
22/// # Usage
23///
24/// ```ignore
25/// let doc = MyBackend::open(pdf_bytes)?;
26/// let page_count = MyBackend::page_count(&doc);
27/// let page = MyBackend::get_page(&doc, 0)?;
28/// let media_box = MyBackend::page_media_box(&doc, &page)?;
29/// MyBackend::interpret_page(&doc, &page, &mut handler, &options)?;
30/// ```
31pub trait PdfBackend {
32    /// The parsed PDF document type.
33    type Document;
34
35    /// A reference to a single page within a document.
36    type Page;
37
38    /// Backend-specific error type, convertible to [`PdfError`].
39    type Error: std::error::Error + Into<PdfError>;
40
41    /// Parse PDF bytes into a document.
42    ///
43    /// # Errors
44    ///
45    /// Returns an error if the bytes do not represent a valid PDF document.
46    fn open(bytes: &[u8]) -> Result<Self::Document, Self::Error>;
47
48    /// Return the number of pages in the document.
49    fn page_count(doc: &Self::Document) -> usize;
50
51    /// Access a page by 0-based index.
52    ///
53    /// # Errors
54    ///
55    /// Returns an error if the index is out of range or the page cannot be loaded.
56    fn get_page(doc: &Self::Document, index: usize) -> Result<Self::Page, Self::Error>;
57
58    /// Get the MediaBox for a page.
59    ///
60    /// MediaBox is required by the PDF specification and defines the boundaries
61    /// of the physical page. The returned [`BBox`] uses the library's top-left
62    /// origin coordinate system.
63    ///
64    /// # Errors
65    ///
66    /// Returns an error if the MediaBox cannot be resolved (e.g., missing
67    /// from both the page and its parent page tree).
68    fn page_media_box(doc: &Self::Document, page: &Self::Page) -> Result<BBox, Self::Error>;
69
70    /// Get the CropBox for a page, if explicitly set.
71    ///
72    /// CropBox defines the visible region of the page. Returns `None` if
73    /// not explicitly set (in which case MediaBox serves as the CropBox).
74    ///
75    /// # Errors
76    ///
77    /// Returns an error if the CropBox entry exists but is malformed.
78    fn page_crop_box(doc: &Self::Document, page: &Self::Page) -> Result<Option<BBox>, Self::Error>;
79
80    /// Get the page rotation angle in degrees.
81    ///
82    /// Returns one of: 0, 90, 180, or 270. Defaults to 0 if not specified.
83    ///
84    /// # Errors
85    ///
86    /// Returns an error if the Rotate entry exists but is malformed.
87    fn page_rotate(doc: &Self::Document, page: &Self::Page) -> Result<i32, Self::Error>;
88
89    /// Interpret the page's content stream, calling back into the handler.
90    ///
91    /// The interpreter processes PDF content stream operators (text, path,
92    /// image) and notifies the `handler` of extracted content via
93    /// [`ContentHandler`] callbacks. Resource limits from `options` are
94    /// enforced during interpretation.
95    ///
96    /// # Errors
97    ///
98    /// Returns an error if content stream parsing fails or a resource limit
99    /// is exceeded.
100    fn interpret_page(
101        doc: &Self::Document,
102        page: &Self::Page,
103        handler: &mut dyn ContentHandler,
104        options: &ExtractOptions,
105    ) -> Result<(), Self::Error>;
106}
107
#[cfg(test)]
mod tests {
    use super::*;
    use crate::handler::{CharEvent, ImageEvent, PaintOp, PathEvent};
    use pdfplumber_core::{Color, PathSegment, Point};

    // --- Test doubles: page data, document, page handle ---

    /// Properties the mock backend reports for one page.
    #[derive(Debug)]
    struct MockPageData {
        media_box: BBox,
        crop_box: Option<BBox>,
        rotate: i32,
    }

    /// Fake document holding one [`MockPageData`] per page.
    #[derive(Debug)]
    struct MockDocument {
        pages: Vec<MockPageData>,
    }

    impl MockDocument {
        /// Look up the stored properties for `page`.
        ///
        /// Indexes directly, so it panics if `page.index` is not a valid
        /// position in `pages`.
        fn data_for(&self, page: &MockPage) -> &MockPageData {
            &self.pages[page.index]
        }
    }

    /// Fake page handle: the page's position within `MockDocument::pages`.
    #[derive(Debug)]
    struct MockPage {
        index: usize,
    }

    // --- Handler that records everything it is told ---

    /// [`ContentHandler`] that stores every event it receives.
    #[derive(Default)]
    struct CollectingHandler {
        chars: Vec<CharEvent>,
        paths: Vec<PathEvent>,
        images: Vec<ImageEvent>,
    }

    impl CollectingHandler {
        /// Create a handler with nothing collected yet.
        fn new() -> Self {
            Self::default()
        }
    }

    impl ContentHandler for CollectingHandler {
        fn on_char(&mut self, glyph: CharEvent) {
            self.chars.push(glyph);
        }

        fn on_path_painted(&mut self, path: PathEvent) {
            self.paths.push(path);
        }

        fn on_image(&mut self, image: ImageEvent) {
            self.images.push(image);
        }
    }

    // --- The backend under test ---

    /// Backend whose "PDF format" is a single byte giving the page count.
    struct MockBackend;

    impl PdfBackend for MockBackend {
        type Document = MockDocument;
        type Page = MockPage;
        type Error = PdfError;

        fn open(bytes: &[u8]) -> Result<Self::Document, Self::Error> {
            match bytes.first() {
                None => Err(PdfError::ParseError("empty input".to_string())),
                // Mock: the first byte encodes how many pages to create.
                Some(&count) => {
                    let pages = (0..usize::from(count))
                        .map(|_| MockPageData {
                            media_box: BBox::new(0.0, 0.0, 612.0, 792.0), // US Letter
                            crop_box: None,
                            rotate: 0,
                        })
                        .collect();
                    Ok(MockDocument { pages })
                }
            }
        }

        fn page_count(doc: &Self::Document) -> usize {
            doc.pages.len()
        }

        fn get_page(doc: &Self::Document, index: usize) -> Result<Self::Page, Self::Error> {
            let total = doc.pages.len();
            if index < total {
                Ok(MockPage { index })
            } else {
                Err(PdfError::ParseError(format!(
                    "page index {index} out of range (0..{})",
                    total
                )))
            }
        }

        fn page_media_box(doc: &Self::Document, page: &Self::Page) -> Result<BBox, Self::Error> {
            Ok(doc.data_for(page).media_box)
        }

        fn page_crop_box(
            doc: &Self::Document,
            page: &Self::Page,
        ) -> Result<Option<BBox>, Self::Error> {
            Ok(doc.data_for(page).crop_box)
        }

        fn page_rotate(doc: &Self::Document, page: &Self::Page) -> Result<i32, Self::Error> {
            Ok(doc.data_for(page).rotate)
        }

        fn interpret_page(
            _doc: &Self::Document,
            _page: &Self::Page,
            handler: &mut dyn ContentHandler,
            _options: &ExtractOptions,
        ) -> Result<(), Self::Error> {
            // The mock ignores its inputs and always reports the same three
            // events, in this order: one char, one path, one image.

            // Sample char: 'H' (char code 72).
            let glyph = CharEvent {
                char_code: 72,
                unicode: Some("H".to_string()),
                font_name: "Times-Roman".to_string(),
                font_size: 14.0,
                text_matrix: [1.0, 0.0, 0.0, 1.0, 72.0, 720.0],
                ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
                displacement: 722.0,
                char_spacing: 0.0,
                word_spacing: 0.0,
                h_scaling: 1.0,
                rise: 0.0,
            };
            handler.on_char(glyph);

            // Sample path: a stroked horizontal line.
            let rule = PathEvent {
                segments: vec![
                    PathSegment::MoveTo(Point::new(72.0, 700.0)),
                    PathSegment::LineTo(Point::new(540.0, 700.0)),
                ],
                paint_op: PaintOp::Stroke,
                line_width: 0.5,
                stroking_color: Some(Color::black()),
                non_stroking_color: None,
                ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
                dash_pattern: None,
                fill_rule: None,
            };
            handler.on_path_painted(rule);

            // Sample image.
            let picture = ImageEvent {
                name: "Im1".to_string(),
                ctm: [100.0, 0.0, 0.0, 75.0, 72.0, 600.0],
                width: 400,
                height: 300,
                colorspace: Some("DeviceRGB".to_string()),
                bits_per_component: Some(8),
            };
            handler.on_image(picture);

            Ok(())
        }
    }

    // --- Shared fixtures ---

    /// Open a mock document whose single input byte requests `page_count` pages.
    fn open_doc(page_count: u8) -> MockDocument {
        MockBackend::open(&[page_count]).unwrap()
    }

    /// Interpret page 0 of a one-page mock document and return the handler
    /// holding whatever events were emitted.
    fn collect_first_page() -> CollectingHandler {
        let doc = open_doc(1);
        let page = MockBackend::get_page(&doc, 0).unwrap();
        let options = ExtractOptions::default();
        let mut handler = CollectingHandler::new();
        MockBackend::interpret_page(&doc, &page, &mut handler, &options).unwrap();
        handler
    }

    // --- PdfBackend::open tests ---

    #[test]
    fn mock_backend_open_valid_document() {
        let doc = open_doc(3);
        assert_eq!(MockBackend::page_count(&doc), 3);
    }

    #[test]
    fn mock_backend_open_single_page() {
        let doc = open_doc(1);
        assert_eq!(MockBackend::page_count(&doc), 1);
    }

    #[test]
    fn mock_backend_open_empty_bytes_fails() {
        assert!(MockBackend::open(&[]).is_err());
    }

    // --- PdfBackend::get_page tests ---

    #[test]
    fn mock_backend_get_page_valid_index() {
        let doc = open_doc(3);
        // First and last valid indices of a three-page document.
        for wanted in [0, 2] {
            let page = MockBackend::get_page(&doc, wanted).unwrap();
            assert_eq!(page.index, wanted);
        }
    }

    #[test]
    fn mock_backend_get_page_out_of_bounds() {
        let doc = open_doc(2);
        assert!(MockBackend::get_page(&doc, 5).is_err());
    }

    // --- PdfBackend::page_media_box tests ---

    #[test]
    fn mock_backend_page_media_box() {
        let doc = open_doc(1);
        let page = MockBackend::get_page(&doc, 0).unwrap();
        let us_letter = BBox::new(0.0, 0.0, 612.0, 792.0);
        assert_eq!(MockBackend::page_media_box(&doc, &page).unwrap(), us_letter);
    }

    // --- PdfBackend::page_crop_box tests ---

    #[test]
    fn mock_backend_page_crop_box_none() {
        let doc = open_doc(1);
        let page = MockBackend::get_page(&doc, 0).unwrap();
        assert_eq!(MockBackend::page_crop_box(&doc, &page).unwrap(), None);
    }

    // --- PdfBackend::page_rotate tests ---

    #[test]
    fn mock_backend_page_rotate_default() {
        let doc = open_doc(1);
        let page = MockBackend::get_page(&doc, 0).unwrap();
        assert_eq!(MockBackend::page_rotate(&doc, &page).unwrap(), 0);
    }

    // --- PdfBackend::interpret_page tests ---

    #[test]
    fn mock_backend_interpret_page_emits_char() {
        let collected = collect_first_page();

        assert_eq!(collected.chars.len(), 1);
        let glyph = &collected.chars[0];
        assert_eq!(glyph.char_code, 72);
        assert_eq!(glyph.unicode.as_deref(), Some("H"));
        assert_eq!(glyph.font_name, "Times-Roman");
        assert_eq!(glyph.font_size, 14.0);
    }

    #[test]
    fn mock_backend_interpret_page_emits_path() {
        let collected = collect_first_page();

        assert_eq!(collected.paths.len(), 1);
        let path = &collected.paths[0];
        assert_eq!(path.paint_op, PaintOp::Stroke);
        assert_eq!(path.segments.len(), 2);
        assert_eq!(path.line_width, 0.5);
    }

    #[test]
    fn mock_backend_interpret_page_emits_image() {
        let collected = collect_first_page();

        assert_eq!(collected.images.len(), 1);
        let image = &collected.images[0];
        assert_eq!(image.name, "Im1");
        assert_eq!(image.width, 400);
        assert_eq!(image.height, 300);
    }

    #[test]
    fn mock_backend_interpret_page_uses_trait_object() {
        let doc = open_doc(1);
        let page = MockBackend::get_page(&doc, 0).unwrap();
        let options = ExtractOptions::default();
        let mut handler = CollectingHandler::new();

        // Coerce to `&mut dyn ContentHandler` up front so the call is made
        // through an explicit trait object.
        let as_dyn: &mut dyn ContentHandler = &mut handler;
        MockBackend::interpret_page(&doc, &page, as_dyn, &options).unwrap();

        assert_eq!(handler.chars.len(), 1);
        assert_eq!(handler.paths.len(), 1);
        assert_eq!(handler.images.len(), 1);
    }

    // --- Error conversion tests ---

    #[test]
    fn mock_backend_error_converts_to_pdf_error() {
        let backend_err = MockBackend::open(&[]).unwrap_err();
        // The mock's error type already is PdfError, so `into()` is the
        // identity conversion here.
        let converted: PdfError = backend_err.into();
        assert!(matches!(converted, PdfError::ParseError(_)));
    }

    #[test]
    fn mock_backend_error_is_std_error() {
        let backend_err = MockBackend::open(&[]).unwrap_err();
        let boxed: Box<dyn std::error::Error> = Box::new(backend_err);
        assert!(boxed.to_string().contains("empty input"));
    }

    // --- Custom mock with CropBox and Rotate ---

    #[test]
    fn mock_backend_custom_page_properties() {
        let a4_portrait = BBox::new(0.0, 0.0, 595.0, 842.0);
        let a4_landscape = BBox::new(0.0, 0.0, 842.0, 595.0);
        let inset_crop = BBox::new(10.0, 10.0, 585.0, 832.0);

        let doc = MockDocument {
            pages: vec![
                MockPageData {
                    media_box: a4_portrait,
                    crop_box: Some(inset_crop),
                    rotate: 90,
                },
                MockPageData {
                    media_box: a4_landscape,
                    crop_box: None,
                    rotate: 0,
                },
            ],
        };

        // Page 0: A4 portrait with CropBox and rotation.
        let first = MockBackend::get_page(&doc, 0).unwrap();
        assert_eq!(
            MockBackend::page_media_box(&doc, &first).unwrap(),
            BBox::new(0.0, 0.0, 595.0, 842.0)
        );
        assert_eq!(
            MockBackend::page_crop_box(&doc, &first).unwrap(),
            Some(BBox::new(10.0, 10.0, 585.0, 832.0))
        );
        assert_eq!(MockBackend::page_rotate(&doc, &first).unwrap(), 90);

        // Page 1: A4 landscape, no CropBox, no rotation.
        let second = MockBackend::get_page(&doc, 1).unwrap();
        assert_eq!(MockBackend::page_crop_box(&doc, &second).unwrap(), None);
        assert_eq!(MockBackend::page_rotate(&doc, &second).unwrap(), 0);
    }
}