// pdfplumber_parse/handler.rs

//! Content handler callback trait for content stream interpretation.
//!
//! Defines the [`ContentHandler`] trait that bridges Layer 2 (content stream
//! interpreter) and Layer 3 (object extraction). The interpreter calls handler
//! methods as it processes PDF content stream operators.

use pdfplumber_core::{Color, DashPattern, ExtractWarning, FillRule, PathSegment};

/// The type of paint operation applied to a path.
///
/// Carried in [`PathEvent::paint_op`] so a handler can tell whether a path
/// was stroked, filled, or both. The type is a plain value: it is `Copy`,
/// comparable, and hashable, so it can be used directly as a map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum PaintOp {
    /// Path is stroked (outlined).
    Stroke,
    /// Path is filled.
    Fill,
    /// Path is both filled and stroked.
    FillAndStroke,
}

/// Information about a rendered character glyph.
///
/// Produced by the interpreter when processing text rendering operators
/// (Tj, TJ, ', "). Contains all positioning and font context needed
/// to compute the final character bounding box.
///
/// The event is passed to [`ContentHandler::on_char`] by value, so a handler
/// may store it without cloning.
#[derive(Debug, Clone)]
pub struct CharEvent {
    /// The character code from the PDF content stream.
    pub char_code: u32,
    /// Unicode text if a ToUnicode mapping is available; `None` otherwise.
    pub unicode: Option<String>,
    /// Font name (e.g., "Helvetica", "BCDFEE+ArialMT").
    pub font_name: String,
    /// Font size in text space units.
    pub font_size: f64,
    /// The text rendering matrix at the time of rendering (6-element affine).
    pub text_matrix: [f64; 6],
    /// The current transformation matrix at the time of rendering.
    pub ctm: [f64; 6],
    /// Glyph displacement (advance width) in glyph space units (1/1000 of text space).
    pub displacement: f64,
    /// Character spacing value (Tc operator).
    pub char_spacing: f64,
    /// Word spacing value (Tw operator), applied for space characters.
    pub word_spacing: f64,
    /// Horizontal scaling factor (Tz operator, as a fraction: 100% = 1.0).
    pub h_scaling: f64,
    /// Text rise value (Ts operator) for superscript/subscript.
    pub rise: f64,
}

51/// Information about a painted path.
52///
53/// Produced by the interpreter when a path is stroked, filled, or both.
54/// Contains the path geometry, paint operation, and graphics state at
55/// the time of painting.
56#[derive(Debug, Clone)]
57pub struct PathEvent {
58    /// The path segments making up this path.
59    pub segments: Vec<PathSegment>,
60    /// The paint operation applied.
61    pub paint_op: PaintOp,
62    /// Stroke line width.
63    pub line_width: f64,
64    /// Stroking (outline) color.
65    pub stroking_color: Option<Color>,
66    /// Non-stroking (fill) color.
67    pub non_stroking_color: Option<Color>,
68    /// Current transformation matrix at the time of painting.
69    pub ctm: [f64; 6],
70    /// Dash pattern for stroked paths.
71    pub dash_pattern: Option<DashPattern>,
72    /// Fill rule for filled paths.
73    pub fill_rule: Option<FillRule>,
74}
75
/// Information about a placed image.
///
/// Produced by the interpreter when a Do operator references an Image
/// XObject. The CTM determines the image's position and size on the page.
///
/// The event is passed to [`ContentHandler::on_image`] by value.
#[derive(Debug, Clone)]
pub struct ImageEvent {
    /// Image XObject name reference (e.g., "Im0").
    pub name: String,
    /// CTM at the time of image placement (determines position and size).
    pub ctm: [f64; 6],
    /// Image width in pixels.
    pub width: u32,
    /// Image height in pixels.
    pub height: u32,
    /// Color space name (e.g., "DeviceRGB", "DeviceGray"), if known.
    pub colorspace: Option<String>,
    /// Bits per component, if known.
    pub bits_per_component: Option<u32>,
}

96/// Callback handler for content stream interpretation.
97///
98/// The content stream interpreter calls these methods as it processes
99/// PDF page content. Implementors collect the events to build extraction
100/// results (characters, paths, images).
101///
102/// All methods have default no-op implementations, allowing handlers to
103/// subscribe only to the event types they care about.
104///
105/// # Text Operations
106///
107/// [`on_char`](ContentHandler::on_char) is called for each rendered
108/// character glyph with full positioning and font context.
109///
110/// # Path Operations
111///
112/// [`on_path_painted`](ContentHandler::on_path_painted) is called when
113/// a path is stroked, filled, or both.
114///
115/// # Image Operations
116///
117/// [`on_image`](ContentHandler::on_image) is called when an image
118/// XObject is placed on the page.
119pub trait ContentHandler {
120    /// Called when a character glyph is rendered.
121    fn on_char(&mut self, _event: CharEvent) {}
122
123    /// Called when a path is painted (stroked, filled, or both).
124    fn on_path_painted(&mut self, _event: PathEvent) {}
125
126    /// Called when an image XObject is placed on the page.
127    fn on_image(&mut self, _event: ImageEvent) {}
128
129    /// Called when a non-fatal warning is encountered during interpretation.
130    ///
131    /// Warnings indicate best-effort degradation (e.g., missing font metrics,
132    /// unresolvable references). They do not affect extraction correctness —
133    /// the interpreter continues with sensible defaults.
134    fn on_warning(&mut self, _warning: ExtractWarning) {}
135}
136
#[cfg(test)]
mod tests {
    use super::*;
    use pdfplumber_core::Point;

    // --- CollectingHandler: captures all events for assertion ---

    /// Test handler that records every event it receives, in arrival order.
    #[derive(Default)]
    struct CollectingHandler {
        chars: Vec<CharEvent>,
        paths: Vec<PathEvent>,
        images: Vec<ImageEvent>,
        warnings: Vec<ExtractWarning>,
    }

    impl CollectingHandler {
        /// Creates a handler with no recorded events.
        fn new() -> Self {
            Self::default()
        }
    }

    impl ContentHandler for CollectingHandler {
        fn on_char(&mut self, event: CharEvent) {
            self.chars.push(event);
        }

        fn on_path_painted(&mut self, event: PathEvent) {
            self.paths.push(event);
        }

        fn on_image(&mut self, event: ImageEvent) {
            self.images.push(event);
        }

        fn on_warning(&mut self, warning: ExtractWarning) {
            self.warnings.push(warning);
        }
    }

    // --- NoopHandler: verifies default no-op implementations compile ---

    struct NoopHandler;
    impl ContentHandler for NoopHandler {}

    // --- Helpers to create sample events ---

    /// An "A" glyph in 12pt Helvetica positioned at (72, 720).
    fn sample_char_event() -> CharEvent {
        CharEvent {
            char_code: 65,
            unicode: Some("A".to_string()),
            font_name: "Helvetica".to_string(),
            font_size: 12.0,
            text_matrix: [1.0, 0.0, 0.0, 1.0, 72.0, 720.0],
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            displacement: 667.0,
            char_spacing: 0.0,
            word_spacing: 0.0,
            h_scaling: 1.0,
            rise: 0.0,
        }
    }

    /// A closed 100x50 rectangle, stroked in black with an identity CTM.
    fn sample_path_event() -> PathEvent {
        PathEvent {
            segments: vec![
                PathSegment::MoveTo(Point::new(0.0, 0.0)),
                PathSegment::LineTo(Point::new(100.0, 0.0)),
                PathSegment::LineTo(Point::new(100.0, 50.0)),
                PathSegment::LineTo(Point::new(0.0, 50.0)),
                PathSegment::ClosePath,
            ],
            paint_op: PaintOp::Stroke,
            line_width: 1.0,
            stroking_color: Some(Color::black()),
            non_stroking_color: None,
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            dash_pattern: None,
            fill_rule: None,
        }
    }

    /// An 800x600 8-bit DeviceRGB image named "Im0".
    fn sample_image_event() -> ImageEvent {
        ImageEvent {
            name: "Im0".to_string(),
            ctm: [200.0, 0.0, 0.0, 150.0, 100.0, 300.0],
            width: 800,
            height: 600,
            colorspace: Some("DeviceRGB".to_string()),
            bits_per_component: Some(8),
        }
    }

    // --- PaintOp tests ---

    #[test]
    fn paint_op_variants() {
        assert_ne!(PaintOp::Stroke, PaintOp::Fill);
        assert_ne!(PaintOp::Fill, PaintOp::FillAndStroke);
        assert_ne!(PaintOp::Stroke, PaintOp::FillAndStroke);
    }

    #[test]
    fn paint_op_copy() {
        let op = PaintOp::Stroke;
        let op2 = op; // Copy
        assert_eq!(op, op2);
    }

    // --- CharEvent tests ---

    #[test]
    fn char_event_construction() {
        let event = sample_char_event();
        assert_eq!(event.char_code, 65);
        assert_eq!(event.unicode.as_deref(), Some("A"));
        assert_eq!(event.font_name, "Helvetica");
        assert_eq!(event.font_size, 12.0);
        assert_eq!(event.displacement, 667.0);
        assert_eq!(event.h_scaling, 1.0);
        assert_eq!(event.rise, 0.0);
    }

    #[test]
    fn char_event_without_unicode() {
        let event = CharEvent {
            unicode: None,
            ..sample_char_event()
        };
        assert_eq!(event.unicode, None);
    }

    #[test]
    fn char_event_clone() {
        let event = sample_char_event();
        let cloned = event.clone();
        assert_eq!(cloned.char_code, 65);
        assert_eq!(cloned.font_name, "Helvetica");
    }

    // --- PathEvent tests ---

    #[test]
    fn path_event_construction() {
        let event = sample_path_event();
        assert_eq!(event.segments.len(), 5);
        assert_eq!(event.paint_op, PaintOp::Stroke);
        assert_eq!(event.line_width, 1.0);
        assert!(event.stroking_color.is_some());
        assert!(event.non_stroking_color.is_none());
    }

    #[test]
    fn path_event_fill_with_rule() {
        let event = PathEvent {
            paint_op: PaintOp::Fill,
            fill_rule: Some(FillRule::EvenOdd),
            stroking_color: None,
            non_stroking_color: Some(Color::Rgb(1.0, 0.0, 0.0)),
            ..sample_path_event()
        };
        assert_eq!(event.paint_op, PaintOp::Fill);
        assert_eq!(event.fill_rule, Some(FillRule::EvenOdd));
    }

    #[test]
    fn path_event_with_dash_pattern() {
        let event = PathEvent {
            dash_pattern: Some(DashPattern {
                dash_array: vec![3.0, 2.0],
                dash_phase: 0.0,
            }),
            ..sample_path_event()
        };
        let dp = event.dash_pattern.unwrap();
        assert_eq!(dp.dash_array, vec![3.0, 2.0]);
    }

    // --- ImageEvent tests ---

    #[test]
    fn image_event_construction() {
        let event = sample_image_event();
        assert_eq!(event.name, "Im0");
        assert_eq!(event.width, 800);
        assert_eq!(event.height, 600);
        assert_eq!(event.colorspace.as_deref(), Some("DeviceRGB"));
        assert_eq!(event.bits_per_component, Some(8));
    }

    #[test]
    fn image_event_without_optional_fields() {
        let event = ImageEvent {
            colorspace: None,
            bits_per_component: None,
            ..sample_image_event()
        };
        assert_eq!(event.colorspace, None);
        assert_eq!(event.bits_per_component, None);
    }

    // --- ContentHandler with CollectingHandler ---

    #[test]
    fn collecting_handler_receives_char_events() {
        let mut handler = CollectingHandler::new();
        handler.on_char(sample_char_event());
        handler.on_char(CharEvent {
            char_code: 66,
            unicode: Some("B".to_string()),
            ..sample_char_event()
        });

        assert_eq!(handler.chars.len(), 2);
        assert_eq!(handler.chars[0].char_code, 65);
        assert_eq!(handler.chars[1].char_code, 66);
    }

    #[test]
    fn collecting_handler_receives_path_events() {
        let mut handler = CollectingHandler::new();
        handler.on_path_painted(sample_path_event());

        assert_eq!(handler.paths.len(), 1);
        assert_eq!(handler.paths[0].paint_op, PaintOp::Stroke);
    }

    #[test]
    fn collecting_handler_receives_image_events() {
        let mut handler = CollectingHandler::new();
        handler.on_image(sample_image_event());

        assert_eq!(handler.images.len(), 1);
        assert_eq!(handler.images[0].name, "Im0");
    }

    #[test]
    fn collecting_handler_receives_mixed_events() {
        let mut handler = CollectingHandler::new();
        handler.on_char(sample_char_event());
        handler.on_path_painted(sample_path_event());
        handler.on_image(sample_image_event());
        handler.on_char(CharEvent {
            char_code: 66,
            unicode: Some("B".to_string()),
            ..sample_char_event()
        });

        assert_eq!(handler.chars.len(), 2);
        assert_eq!(handler.paths.len(), 1);
        assert_eq!(handler.images.len(), 1);
    }

    // --- NoopHandler: default implementations ---

    #[test]
    fn noop_handler_accepts_all_events() {
        let mut handler = NoopHandler;
        handler.on_char(sample_char_event());
        handler.on_path_painted(sample_path_event());
        handler.on_image(sample_image_event());
        // No panics, no state change — verifies default no-op implementations work
    }

    // --- ContentHandler as trait object ---

    #[test]
    fn content_handler_is_object_safe() {
        let mut handler = CollectingHandler::new();
        let handler_ref: &mut dyn ContentHandler = &mut handler;
        handler_ref.on_char(sample_char_event());

        // Coercing to `dyn ContentHandler` proves object safety; the assertion
        // additionally proves dynamic dispatch reached the concrete impl.
        assert_eq!(handler.chars.len(), 1);
    }

    // --- on_warning tests ---

    #[test]
    fn noop_handler_on_warning_does_nothing() {
        let mut handler = NoopHandler;
        handler.on_warning(ExtractWarning::new("test warning"));
        // No panics — verifies default no-op implementation works
    }

    #[test]
    fn collecting_handler_receives_warnings() {
        let mut handler = CollectingHandler::new();
        handler.on_warning(ExtractWarning::new("warning 1"));
        handler.on_warning(ExtractWarning::on_page("warning 2", 0));
        handler.on_warning(ExtractWarning::with_operator_context(
            "font issue",
            5,
            "Helvetica",
        ));

        assert_eq!(handler.warnings.len(), 3);
        assert_eq!(handler.warnings[0].description, "warning 1");
        assert_eq!(handler.warnings[1].page, Some(0));
        assert_eq!(handler.warnings[2].font_name, Some("Helvetica".to_string()));
    }

    #[test]
    fn on_warning_via_trait_object() {
        let mut handler = CollectingHandler::new();
        let handler_ref: &mut dyn ContentHandler = &mut handler;
        handler_ref.on_warning(ExtractWarning::new("test"));

        assert_eq!(handler.warnings.len(), 1);
    }
}