Skip to main content

pdfplumber_parse/
char_extraction.rs

//! Character bounding box calculation from content stream events.
//!
//! Combines font metrics, text state, and CTM to calculate the final
//! bounding box for each character in top-left origin coordinates.
//! This bridges Layer 2 (interpreter) and Layer 3 (object extraction).
6
7use pdfplumber_core::geometry::{BBox, Ctm, Point};
8use pdfplumber_core::painting::Color;
9use pdfplumber_core::text::{Char, TextDirection};
10
11use crate::font_metrics::FontMetrics;
12use crate::handler::CharEvent;
13
14/// Convert a `CharEvent` and font metrics into a fully-populated `Char`
15/// with bounding box in top-left origin page coordinates.
16///
17/// # Arguments
18///
19/// * `event` - Character rendering event from the content stream interpreter.
20/// * `metrics` - Font metrics for width, ascent, and descent lookup.
21/// * `page_height` - Page height in PDF units (for y-flip from bottom-left to top-left origin).
22/// * `stroking_color` - Current stroking color from the graphics state.
23/// * `non_stroking_color` - Current non-stroking color from the graphics state.
24///
25/// # Coordinate System
26///
27/// PDF uses bottom-left origin. This function converts to top-left origin
28/// (pdfplumber convention) by flipping: `top = page_height - max_y`.
29pub fn char_from_event(
30    event: &CharEvent,
31    metrics: &FontMetrics,
32    page_height: f64,
33    stroking_color: Option<Color>,
34    non_stroking_color: Option<Color>,
35) -> Char {
36    let font_size = event.font_size;
37    let h_scaling = event.h_scaling;
38
39    // Build the Text Rendering Matrix (Trm) per PDF spec 9.4.4:
40    // Trm = [Tfs*Th, 0, 0, Tfs, 0, Trise] x Tm x CTM
41    let font_matrix = Ctm::new(font_size * h_scaling, 0.0, 0.0, font_size, 0.0, event.rise);
42    let tm = ctm_from_array(&event.text_matrix);
43    let ctm = ctm_from_array(&event.ctm);
44    let trm = font_matrix.concat(&tm).concat(&ctm);
45
46    // Character width in glyph-normalized space.
47    // Per PDF spec: advance = ((w0/1000)*Tfs + Tc + Tw) * Th
48    // In glyph-norm space (Trm x-axis scales by Tfs*Th):
49    //   w_norm = advance / (Tfs*Th) = w0/1000 + (Tc + Tw) / Tfs
50    let word_spacing = if event.char_code == 32 {
51        event.word_spacing
52    } else {
53        0.0
54    };
55    let w_norm = if font_size.abs() > f64::EPSILON {
56        event.displacement / 1000.0 + (event.char_spacing + word_spacing) / font_size
57    } else {
58        event.displacement / 1000.0
59    };
60
61    // Ascent/descent in glyph-normalized space (1/1000 units → normalized)
62    let ascent_norm = metrics.ascent() / 1000.0;
63    let descent_norm = metrics.descent() / 1000.0;
64
65    // Four corners of the character rectangle in glyph-normalized space,
66    // transformed through Trm to page space (PDF bottom-left origin).
67    let corners = [
68        trm.transform_point(Point::new(0.0, descent_norm)),
69        trm.transform_point(Point::new(w_norm, descent_norm)),
70        trm.transform_point(Point::new(w_norm, ascent_norm)),
71        trm.transform_point(Point::new(0.0, ascent_norm)),
72    ];
73
74    // Axis-aligned bounding box in PDF page space
75    let min_x = corners.iter().map(|p| p.x).fold(f64::INFINITY, f64::min);
76    let max_x = corners
77        .iter()
78        .map(|p| p.x)
79        .fold(f64::NEG_INFINITY, f64::max);
80    let min_y = corners.iter().map(|p| p.y).fold(f64::INFINITY, f64::min);
81    let max_y = corners
82        .iter()
83        .map(|p| p.y)
84        .fold(f64::NEG_INFINITY, f64::max);
85
86    // Y-flip: PDF bottom-left origin → top-left origin
87    let top = page_height - max_y;
88    let bottom = page_height - min_y;
89
90    let bbox = BBox::new(min_x, top, max_x, bottom);
91
92    // Upright: no rotation/shear in the text rendering matrix
93    let upright = trm.b.abs() < 1e-6 && trm.c.abs() < 1e-6;
94
95    // Text direction from the dominant axis of the text rendering matrix
96    let direction = if trm.a.abs() >= trm.b.abs() {
97        if trm.a >= 0.0 {
98            TextDirection::Ltr
99        } else {
100            TextDirection::Rtl
101        }
102    } else if trm.b > 0.0 {
103        TextDirection::Btt
104    } else {
105        TextDirection::Ttb
106    };
107
108    // Unicode text with fallback
109    let text = event.unicode.clone().unwrap_or_else(|| {
110        char::from_u32(event.char_code)
111            .map(|c| c.to_string())
112            .unwrap_or_else(|| "\u{FFFD}".to_string())
113    });
114
115    Char {
116        text,
117        bbox,
118        fontname: event.font_name.clone(),
119        size: font_size,
120        doctop: top,
121        upright,
122        direction,
123        stroking_color,
124        non_stroking_color,
125        ctm: event.ctm,
126        char_code: event.char_code,
127        mcid: None,
128        tag: None,
129    }
130}
131
132/// Create a [`Ctm`] from a 6-element array `[a, b, c, d, e, f]`.
133fn ctm_from_array(arr: &[f64; 6]) -> Ctm {
134    Ctm::new(arr[0], arr[1], arr[2], arr[3], arr[4], arr[5])
135}
136
#[cfg(test)]
mod tests {
    use super::*;

    /// Page height used throughout the tests (US Letter, in points).
    const PAGE_HEIGHT: f64 = 792.0;

    /// Maximum absolute difference tolerated by [`assert_approx`].
    const TOLERANCE: f64 = 0.01;

    /// Baseline event: a 12pt Helvetica 'A' at (72, 720) under an identity CTM,
    /// with no spacing, no rise, and 100% horizontal scaling.
    fn default_event() -> CharEvent {
        CharEvent {
            char_code: 65, // 'A'
            unicode: Some("A".to_string()),
            font_name: "Helvetica".to_string(),
            font_size: 12.0,
            text_matrix: [1.0, 0.0, 0.0, 1.0, 72.0, 720.0],
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            displacement: 667.0, // glyph width in 1/1000 units
            char_spacing: 0.0,
            word_spacing: 0.0,
            h_scaling: 1.0,
            rise: 0.0,
        }
    }

    /// Metrics covering only char code 65: width 667, missing width 600,
    /// ascent 750, descent -250.
    fn default_metrics() -> FontMetrics {
        FontMetrics::new(vec![667.0], 65, 65, 600.0, 750.0, -250.0, None)
    }

    /// Metrics covering only char code 32 (space) with a width of 250.
    fn space_metrics() -> FontMetrics {
        FontMetrics::new(vec![250.0], 32, 32, 600.0, 750.0, -250.0, None)
    }

    /// Run `char_from_event` with the default metrics and no colors.
    fn render(event: &CharEvent) -> Char {
        char_from_event(event, &default_metrics(), PAGE_HEIGHT, None, None)
    }

    /// Assert that `actual` is within [`TOLERANCE`] of `expected`.
    fn assert_approx(actual: f64, expected: f64, msg: &str) {
        let delta = (actual - expected).abs();
        assert!(
            delta < TOLERANCE,
            "{msg}: expected {expected}, got {actual}"
        );
    }

    /// Check the four bbox edges in the order x0, top, x1, bottom.
    fn assert_edges(glyph: &Char, x0: f64, top: f64, x1: f64, bottom: f64) {
        assert_approx(glyph.bbox.x0, x0, "x0");
        assert_approx(glyph.bbox.top, top, "top");
        assert_approx(glyph.bbox.x1, x1, "x1");
        assert_approx(glyph.bbox.bottom, bottom, "bottom");
    }

    // ----- Plain horizontal text -----

    #[test]
    fn simple_horizontal_text_bbox() {
        let glyph = char_from_event(
            &default_event(),
            &default_metrics(),
            PAGE_HEIGHT,
            None,
            Some(Color::black()),
        );

        // Trm = [12, 0, 0, 12, 72, 720]
        // w_norm = 0.667, ascent_norm = 0.75, descent_norm = -0.25
        // Corners: (72, 717), (80.004, 717), (80.004, 729), (72, 729)
        // After the flip: top = 792 - 729 = 63, bottom = 792 - 717 = 75
        assert_edges(&glyph, 72.0, 63.0, 80.004, 75.0);
        assert_approx(glyph.bbox.width(), 8.004, "width");
        assert_approx(glyph.bbox.height(), 12.0, "height");

        assert_eq!(glyph.text, "A");
        assert_eq!(glyph.fontname, "Helvetica");
        assert_eq!(glyph.size, 12.0);
        assert!(glyph.upright);
        assert_eq!(glyph.direction, TextDirection::Ltr);
        assert_eq!(glyph.char_code, 65);
        assert_eq!(glyph.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
    }

    // ----- Larger font size -----

    #[test]
    fn scaled_text_bbox() {
        let glyph = render(&CharEvent {
            font_size: 24.0,
            ..default_event()
        });

        // Trm = [24, 0, 0, 24, 72, 720]
        // Corners: (72, 714), (88.008, 714), (88.008, 738), (72, 738)
        // After the flip: top = 792 - 738 = 54, bottom = 792 - 714 = 78
        assert_edges(&glyph, 72.0, 54.0, 88.008, 78.0);
        assert_approx(glyph.bbox.width(), 16.008, "width");
        assert_approx(glyph.bbox.height(), 24.0, "height");
        assert_eq!(glyph.size, 24.0);
    }

    // ----- Text rise (superscript) -----

    #[test]
    fn text_with_rise_bbox() {
        let glyph = render(&CharEvent {
            rise: 5.0,
            ..default_event()
        });

        // Text-state matrix = [12, 0, 0, 12, 0, 5], so Trm = [12, 0, 0, 12, 72, 725]
        // Corners: (72, 722), (80.004, 722), (80.004, 734), (72, 734)
        // After the flip: top = 792 - 734 = 58, bottom = 792 - 722 = 70
        assert_edges(&glyph, 72.0, 58.0, 80.004, 70.0);
        // Rise shifts the box by 5 points without resizing it.
        assert_approx(glyph.bbox.height(), 12.0, "height");
    }

    // ----- Text matrix rotated 90 degrees counter-clockwise -----

    #[test]
    fn rotated_text_matrix_bbox() {
        let glyph = render(&CharEvent {
            text_matrix: [0.0, 1.0, -1.0, 0.0, 200.0, 400.0],
            ..default_event()
        });

        // Trm = [12, 0, 0, 12, 0, 0] x [0, 1, -1, 0, 200, 400]
        //     = [0, 12, -12, 0, 200, 400]
        // (0, -0.25) -> (203, 400)        (0.667, -0.25) -> (203, 408.004)
        // (0.667, 0.75) -> (191, 408.004) (0, 0.75) -> (191, 400)
        // min_x = 191, max_x = 203, min_y = 400, max_y = 408.004
        // After the flip: top = 792 - 408.004 = 383.996, bottom = 792 - 400 = 392
        assert_edges(&glyph, 191.0, 383.996, 203.0, 392.0);
        // The rotation swaps the roles of width and height.
        assert_approx(glyph.bbox.width(), 12.0, "width");
        assert_approx(glyph.bbox.height(), 8.004, "height");

        assert!(!glyph.upright);
        // This rotation makes the text run bottom-to-top.
        assert_eq!(glyph.direction, TextDirection::Btt);
    }

    // ----- CTM translation -----

    #[test]
    fn ctm_translation_bbox() {
        let glyph = render(&CharEvent {
            ctm: [1.0, 0.0, 0.0, 1.0, 50.0, 50.0],
            ..default_event()
        });

        // Trm = [12, 0, 0, 12, 72, 720] x [1, 0, 0, 1, 50, 50]
        //     = [12, 0, 0, 12, 122, 770]
        // Corners: (122, 767), (130.004, 767), (130.004, 779), (122, 779)
        // After the flip: top = 792 - 779 = 13, bottom = 792 - 767 = 25
        assert_edges(&glyph, 122.0, 13.0, 130.004, 25.0);
    }

    // ----- Character spacing -----

    #[test]
    fn char_spacing_increases_width() {
        let glyph = render(&CharEvent {
            char_spacing: 2.0, // 2 units of extra spacing
            ..default_event()
        });

        // w_norm = 667/1000 + 2.0/12.0 = 0.667 + 0.1667 = 0.8337
        // Page-space width = 12 * 0.8337 = 10.004 (8.004 without spacing)
        assert_approx(glyph.bbox.width(), 10.004, "width with char_spacing");
        // Spacing must not change the height.
        assert_approx(glyph.bbox.height(), 12.0, "height");
    }

    // ----- Word spacing -----

    #[test]
    fn word_spacing_applied_for_space() {
        let event = CharEvent {
            char_code: 32, // space
            unicode: Some(" ".to_string()),
            displacement: 250.0, // typical space width
            word_spacing: 3.0,
            ..default_event()
        };

        let glyph = char_from_event(&event, &space_metrics(), PAGE_HEIGHT, None, None);

        // w_norm = 250/1000 + (0 + 3)/12 = 0.25 + 0.25 = 0.5
        // Page-space width = 12 * 0.5 = 6.0
        assert_approx(glyph.bbox.width(), 6.0, "width with word_spacing");
    }

    #[test]
    fn word_spacing_not_applied_for_non_space() {
        let glyph = render(&CharEvent {
            word_spacing: 3.0, // must be ignored: char code is not 32
            ..default_event()
        });

        // w_norm stays at 667/1000 = 0.667, exactly as with no spacing.
        assert_approx(glyph.bbox.width(), 8.004, "width without word_spacing");
    }

    // ----- Upright detection -----

    #[test]
    fn upright_for_horizontal_text() {
        let glyph = render(&default_event());
        assert!(glyph.upright);
    }

    #[test]
    fn not_upright_for_rotated_text() {
        let glyph = render(&CharEvent {
            text_matrix: [0.0, 1.0, -1.0, 0.0, 100.0, 500.0],
            ..default_event()
        });
        assert!(!glyph.upright);
    }

    // ----- Text direction detection -----

    #[test]
    fn direction_ltr_for_normal_text() {
        let glyph = render(&default_event());
        assert_eq!(glyph.direction, TextDirection::Ltr);
    }

    #[test]
    fn direction_rtl_for_mirrored_text() {
        let glyph = render(&CharEvent {
            text_matrix: [-1.0, 0.0, 0.0, 1.0, 300.0, 720.0],
            ..default_event()
        });
        assert_eq!(glyph.direction, TextDirection::Rtl);
    }

    #[test]
    fn direction_ttb_for_downward_text() {
        // 90 degrees clockwise: text flows top to bottom.
        let glyph = render(&CharEvent {
            text_matrix: [0.0, -1.0, 1.0, 0.0, 100.0, 700.0],
            ..default_event()
        });
        assert_eq!(glyph.direction, TextDirection::Ttb);
    }

    #[test]
    fn direction_btt_for_upward_text() {
        // 90 degrees counter-clockwise: text flows bottom to top.
        let glyph = render(&CharEvent {
            text_matrix: [0.0, 1.0, -1.0, 0.0, 100.0, 100.0],
            ..default_event()
        });
        assert_eq!(glyph.direction, TextDirection::Btt);
    }

    // ----- Unicode text and fallbacks -----

    #[test]
    fn unicode_from_event() {
        let glyph = render(&CharEvent {
            unicode: Some("B".to_string()),
            char_code: 66,
            ..default_event()
        });
        assert_eq!(glyph.text, "B");
    }

    #[test]
    fn unicode_fallback_to_char_code() {
        let glyph = render(&CharEvent {
            unicode: None,
            char_code: 65, // valid Unicode code point for 'A'
            ..default_event()
        });
        // With no decoded text, char::from_u32(65) yields 'A'.
        assert_eq!(glyph.text, "A");
    }

    #[test]
    fn unicode_fallback_replacement_for_invalid() {
        let glyph = render(&CharEvent {
            unicode: None,
            char_code: 0xFFFFFFFF, // not a valid Unicode code point
            ..default_event()
        });
        assert_eq!(glyph.text, "\u{FFFD}");
    }

    // ----- Y-flip -----

    #[test]
    fn y_flip_converts_to_top_left_origin() {
        // Place the character near the bottom of the page in PDF coords (y = 100).
        let glyph = render(&CharEvent {
            text_matrix: [1.0, 0.0, 0.0, 1.0, 72.0, 100.0],
            ..default_event()
        });

        // PDF space: min_y = 97, max_y = 109
        // Top-left space: top = 792 - 109 = 683, bottom = 792 - 97 = 695
        assert_approx(glyph.bbox.top, 683.0, "top near page bottom");
        assert_approx(glyph.bbox.bottom, 695.0, "bottom near page bottom");
        // doctop is set to the same value as top.
        assert_approx(glyph.doctop, 683.0, "doctop");
    }

    // ----- Colors -----

    #[test]
    fn colors_passed_through() {
        let stroking = Some(Color::Rgb(1.0, 0.0, 0.0));
        let non_stroking = Some(Color::Cmyk(0.0, 0.0, 0.0, 1.0));

        let glyph = char_from_event(
            &default_event(),
            &default_metrics(),
            PAGE_HEIGHT,
            stroking.clone(),
            non_stroking.clone(),
        );

        assert_eq!(glyph.stroking_color, stroking);
        assert_eq!(glyph.non_stroking_color, non_stroking);
    }

    // ----- Horizontal scaling -----

    #[test]
    fn horizontal_scaling_affects_width() {
        let glyph = render(&CharEvent {
            h_scaling: 0.5, // 50% horizontal scaling
            ..default_event()
        });

        // Text-state matrix = [12 * 0.5, 0, 0, 12, 0, 0] = [6, 0, 0, 12, 0, 0]
        // Trm = [6, 0, 0, 12, 72, 720]; w_norm is still 0.667
        // Width = 6 * 0.667 = 4.002
        assert_approx(glyph.bbox.width(), 4.002, "width at 50% h_scaling");
        // Only the x-axis is scaled.
        assert_approx(glyph.bbox.height(), 12.0, "height at 50% h_scaling");
    }

    // ----- Default / missing font metrics -----

    #[test]
    fn default_metrics_produce_reasonable_bbox() {
        let fallback_metrics = FontMetrics::default_metrics();

        let glyph = char_from_event(
            &default_event(),
            &fallback_metrics,
            PAGE_HEIGHT,
            None,
            None,
        );

        // Default ascent = 750, descent = -250, missing width = 600.
        // The event carries displacement = 667, so w_norm = 0.667.
        // Height = (750 + 250)/1000 * 12 = 12.0
        assert_approx(glyph.bbox.height(), 12.0, "height with default metrics");
        // Width = 12 * 0.667 = 8.004
        assert_approx(glyph.bbox.width(), 8.004, "width with default metrics");
    }

    // ----- CTM scaling -----

    #[test]
    fn ctm_scaling_affects_bbox() {
        let glyph = render(&CharEvent {
            text_matrix: [1.0, 0.0, 0.0, 1.0, 36.0, 360.0],
            ctm: [2.0, 0.0, 0.0, 2.0, 0.0, 0.0],
            ..default_event()
        });

        // Trm = [12, 0, 0, 12, 36, 360] x [2, 0, 0, 2, 0, 0] = [24, 0, 0, 24, 72, 720],
        // which matches the font_size = 24 case.
        assert_approx(glyph.bbox.width(), 16.008, "width with 2x CTM");
        assert_approx(glyph.bbox.height(), 24.0, "height with 2x CTM");
    }

    // ----- Zero font size edge case -----

    #[test]
    fn zero_font_size_does_not_panic() {
        // The bbox is degenerate here; the point is that nothing panics.
        let glyph = render(&CharEvent {
            font_size: 0.0,
            ..default_event()
        });
        assert_eq!(glyph.size, 0.0);
    }

    // ----- Char spacing and word spacing together -----

    #[test]
    fn combined_spacing_for_space() {
        let event = CharEvent {
            char_code: 32,
            unicode: Some(" ".to_string()),
            displacement: 250.0,
            char_spacing: 1.0,
            word_spacing: 2.0,
            ..default_event()
        };

        let glyph = char_from_event(&event, &space_metrics(), PAGE_HEIGHT, None, None);

        // w_norm = 250/1000 + (1.0 + 2.0)/12.0 = 0.25 + 0.25 = 0.5
        // Width = 12 * 0.5 = 6.0
        assert_approx(glyph.bbox.width(), 6.0, "width with combined spacing");
    }
}