Skip to main content

pdfplumber_parse/
char_extraction.rs

1//! Character bounding box calculation from content stream events.
2//!
3//! Combines font metrics, text state, and CTM to calculate the final
4//! bounding box for each character in top-left origin coordinates.
5//! This bridges Layer 2 (interpreter) and Layer 3 (object extraction).
6
7use pdfplumber_core::geometry::{BBox, Ctm, Point};
8use pdfplumber_core::painting::Color;
9use pdfplumber_core::text::{Char, TextDirection};
10
11use crate::font_metrics::FontMetrics;
12use crate::handler::CharEvent;
13
14/// Convert a `CharEvent` and font metrics into a fully-populated `Char`
15/// with bounding box in top-left origin page coordinates.
16///
17/// # Arguments
18///
19/// * `event` - Character rendering event from the content stream interpreter.
20/// * `metrics` - Font metrics for width, ascent, and descent lookup.
21/// * `page_height` - Page height in PDF units (for y-flip from bottom-left to top-left origin).
22/// * `stroking_color` - Current stroking color from the graphics state.
23/// * `non_stroking_color` - Current non-stroking color from the graphics state.
24///
25/// # Coordinate System
26///
27/// PDF uses bottom-left origin. This function converts to top-left origin
28/// (pdfplumber convention) by flipping: `top = page_height - max_y`.
29pub fn char_from_event(
30    event: &CharEvent,
31    metrics: &FontMetrics,
32    page_height: f64,
33    stroking_color: Option<Color>,
34    non_stroking_color: Option<Color>,
35) -> Char {
36    let font_size = event.font_size;
37    let h_scaling = event.h_scaling;
38
39    // Build the Text Rendering Matrix (Trm) per PDF spec 9.4.4:
40    // Trm = [Tfs*Th, 0, 0, Tfs, 0, Trise] x Tm x CTM
41    let font_matrix = Ctm::new(font_size * h_scaling, 0.0, 0.0, font_size, 0.0, event.rise);
42    let tm = ctm_from_array(&event.text_matrix);
43    let ctm = ctm_from_array(&event.ctm);
44    let trm = font_matrix.concat(&tm).concat(&ctm);
45
46    // Character width in glyph-normalized space.
47    // Per PDF spec: advance = ((w0/1000)*Tfs + Tc + Tw) * Th
48    // In glyph-norm space (Trm x-axis scales by Tfs*Th):
49    //   w_norm = advance / (Tfs*Th) = w0/1000 + (Tc + Tw) / Tfs
50    let word_spacing = if event.char_code == 32 {
51        event.word_spacing
52    } else {
53        0.0
54    };
55    let w_norm = if font_size.abs() > f64::EPSILON {
56        event.displacement / 1000.0 + (event.char_spacing + word_spacing) / font_size
57    } else {
58        event.displacement / 1000.0
59    };
60
61    // Ascent/descent in glyph-normalized space (1/1000 units → normalized)
62    let ascent_norm = metrics.ascent() / 1000.0;
63    let descent_norm = metrics.descent() / 1000.0;
64
65    // Four corners of the character rectangle in glyph-normalized space,
66    // transformed through Trm to page space (PDF bottom-left origin).
67    let corners = [
68        trm.transform_point(Point::new(0.0, descent_norm)),
69        trm.transform_point(Point::new(w_norm, descent_norm)),
70        trm.transform_point(Point::new(w_norm, ascent_norm)),
71        trm.transform_point(Point::new(0.0, ascent_norm)),
72    ];
73
74    // Axis-aligned bounding box in PDF page space
75    let min_x = corners.iter().map(|p| p.x).fold(f64::INFINITY, f64::min);
76    let max_x = corners
77        .iter()
78        .map(|p| p.x)
79        .fold(f64::NEG_INFINITY, f64::max);
80    let min_y = corners.iter().map(|p| p.y).fold(f64::INFINITY, f64::min);
81    let max_y = corners
82        .iter()
83        .map(|p| p.y)
84        .fold(f64::NEG_INFINITY, f64::max);
85
86    // Y-flip: PDF bottom-left origin → top-left origin
87    let top = page_height - max_y;
88    let bottom = page_height - min_y;
89
90    let bbox = BBox::new(min_x, top, max_x, bottom);
91
92    // Upright: no rotation/shear in the text rendering matrix
93    let upright = trm.b.abs() < 1e-6 && trm.c.abs() < 1e-6;
94
95    // Text direction from the dominant axis of the text rendering matrix
96    let direction = if trm.a.abs() >= trm.b.abs() {
97        if trm.a >= 0.0 {
98            TextDirection::Ltr
99        } else {
100            TextDirection::Rtl
101        }
102    } else if trm.b > 0.0 {
103        TextDirection::Btt
104    } else {
105        TextDirection::Ttb
106    };
107
108    // Unicode text with fallback
109    let text = event.unicode.clone().unwrap_or_else(|| {
110        char::from_u32(event.char_code)
111            .map(|c| c.to_string())
112            .unwrap_or_else(|| "\u{FFFD}".to_string())
113    });
114
115    Char {
116        text,
117        bbox,
118        fontname: event.font_name.clone(),
119        size: font_size,
120        doctop: top,
121        upright,
122        direction,
123        stroking_color,
124        non_stroking_color,
125        ctm: event.ctm,
126        char_code: event.char_code,
127    }
128}
129
130/// Create a [`Ctm`] from a 6-element array `[a, b, c, d, e, f]`.
131fn ctm_from_array(arr: &[f64; 6]) -> Ctm {
132    Ctm::new(arr[0], arr[1], arr[2], arr[3], arr[4], arr[5])
133}
134
#[cfg(test)]
mod tests {
    use super::*;

    /// Page height used by every test (US Letter).
    const PAGE_HEIGHT: f64 = 792.0;

    /// Baseline event: 'A' (code 65) in 12pt Helvetica, text matrix translating
    /// to (72, 720), identity CTM, no spacing, no rise, h_scaling of 1.0.
    fn default_event() -> CharEvent {
        CharEvent {
            char_code: 65,
            unicode: Some("A".to_string()),
            font_name: "Helvetica".to_string(),
            font_size: 12.0,
            text_matrix: [1.0, 0.0, 0.0, 1.0, 72.0, 720.0],
            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
            displacement: 667.0, // glyph width in 1/1000 units
            char_spacing: 0.0,
            word_spacing: 0.0,
            h_scaling: 1.0,
            rise: 0.0,
        }
    }

    /// Baseline metrics: a single width entry for code 65, missing width 600,
    /// ascent 750, descent -250.
    fn default_metrics() -> FontMetrics {
        // The two `65` arguments are presumably the first/last char codes
        // covered by the width table; confirm against `FontMetrics::new`.
        FontMetrics::new(vec![667.0], 65, 65, 600.0, 750.0, -250.0, None)
    }

    /// Metrics whose single width entry (250) belongs to the space character.
    fn space_metrics() -> FontMetrics {
        FontMetrics::new(vec![250.0], 32, 32, 600.0, 750.0, -250.0, None)
    }

    /// Space-character event with the given spacing values; everything else
    /// comes from `default_event`.
    fn space_event(char_spacing: f64, word_spacing: f64) -> CharEvent {
        CharEvent {
            char_code: 32,
            unicode: Some(" ".to_string()),
            displacement: 250.0, // typical space width
            char_spacing,
            word_spacing,
            ..default_event()
        }
    }

    /// Baseline event with only the text matrix replaced.
    fn with_text_matrix(text_matrix: [f64; 6]) -> CharEvent {
        CharEvent {
            text_matrix,
            ..default_event()
        }
    }

    /// Run `char_from_event` with the baseline metrics and no colors.
    fn extract(event: &CharEvent) -> Char {
        char_from_event(event, &default_metrics(), PAGE_HEIGHT, None, None)
    }

    /// Assert that `actual` is within 0.01 of `expected`.
    fn assert_approx(actual: f64, expected: f64, msg: &str) {
        let delta = (actual - expected).abs();
        assert!(delta < 0.01, "{msg}: expected {expected}, got {actual}");
    }

    /// Check all four bbox edges, in the order x0, top, x1, bottom.
    fn assert_bbox(ch: &Char, x0: f64, top: f64, x1: f64, bottom: f64) {
        assert_approx(ch.bbox.x0, x0, "x0");
        assert_approx(ch.bbox.top, top, "top");
        assert_approx(ch.bbox.x1, x1, "x1");
        assert_approx(ch.bbox.bottom, bottom, "bottom");
    }

    // ----- 1. Plain horizontal text -----

    #[test]
    fn simple_horizontal_text_bbox() {
        let ch = char_from_event(
            &default_event(),
            &default_metrics(),
            PAGE_HEIGHT,
            None,
            Some(Color::black()),
        );

        // Trm = [12, 0, 0, 12, 72, 720]; w_norm = 0.667; y range -0.25..0.75.
        // PDF-space box: x 72..80.004, y 717..729.
        // Flipped: top = 792 - 729 = 63, bottom = 792 - 717 = 75.
        assert_bbox(&ch, 72.0, 63.0, 80.004, 75.0);
        assert_approx(ch.bbox.width(), 8.004, "width");
        assert_approx(ch.bbox.height(), 12.0, "height");

        assert_eq!(ch.text, "A");
        assert_eq!(ch.fontname, "Helvetica");
        assert_eq!(ch.size, 12.0);
        assert!(ch.upright);
        assert_eq!(ch.direction, TextDirection::Ltr);
        assert_eq!(ch.char_code, 65);
        assert_eq!(ch.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
    }

    // ----- 2. Larger font size -----

    #[test]
    fn scaled_text_bbox() {
        let ch = extract(&CharEvent {
            font_size: 24.0,
            ..default_event()
        });

        // Trm = [24, 0, 0, 24, 72, 720].
        // PDF-space box: x 72..88.008, y 714..738.
        // Flipped: top = 792 - 738 = 54, bottom = 792 - 714 = 78.
        assert_bbox(&ch, 72.0, 54.0, 88.008, 78.0);
        assert_approx(ch.bbox.width(), 16.008, "width");
        assert_approx(ch.bbox.height(), 24.0, "height");
        assert_eq!(ch.size, 24.0);
    }

    // ----- 3. Text rise (superscript) -----

    #[test]
    fn text_with_rise_bbox() {
        let ch = extract(&CharEvent {
            rise: 5.0,
            ..default_event()
        });

        // The rise lands in the translation: Trm = [12, 0, 0, 12, 72, 725].
        // PDF-space box: x 72..80.004, y 722..734.
        // Flipped: top = 792 - 734 = 58, bottom = 792 - 722 = 70.
        assert_bbox(&ch, 72.0, 58.0, 80.004, 70.0);
        // Box keeps its size; it is only shifted by the 5-unit rise.
        assert_approx(ch.bbox.height(), 12.0, "height");
    }

    // ----- 4. Text matrix rotated 90 degrees CCW -----

    #[test]
    fn rotated_text_matrix_bbox() {
        let ch = extract(&with_text_matrix([0.0, 1.0, -1.0, 0.0, 200.0, 400.0]));

        // Trm = [0, 12, -12, 0, 200, 400].
        // Corners map to (203, 400), (203, 408.004), (191, 408.004), (191, 400).
        // Flipped: top = 792 - 408.004 = 383.996, bottom = 792 - 400 = 392.
        assert_bbox(&ch, 191.0, 383.996, 203.0, 392.0);
        // Width and height trade places under the rotation.
        assert_approx(ch.bbox.width(), 12.0, "width");
        assert_approx(ch.bbox.height(), 8.004, "height");

        assert!(!ch.upright);
        // The advance points up the page.
        assert_eq!(ch.direction, TextDirection::Btt);
    }

    // ----- 5. CTM translation -----

    #[test]
    fn ctm_translation_bbox() {
        let ch = extract(&CharEvent {
            ctm: [1.0, 0.0, 0.0, 1.0, 50.0, 50.0],
            ..default_event()
        });

        // Trm = [12, 0, 0, 12, 122, 770].
        // PDF-space box: x 122..130.004, y 767..779.
        // Flipped: top = 792 - 779 = 13, bottom = 792 - 767 = 25.
        assert_bbox(&ch, 122.0, 13.0, 130.004, 25.0);
    }

    // ----- 6. Character spacing -----

    #[test]
    fn char_spacing_increases_width() {
        let ch = extract(&CharEvent {
            char_spacing: 2.0, // 2 units extra spacing
            ..default_event()
        });

        // w_norm = 0.667 + 2/12 = 0.8337, so width = 12 * 0.8337 = 10.004
        // (8.004 without spacing).
        assert_approx(ch.bbox.width(), 10.004, "width with char_spacing");
        // Spacing is horizontal only.
        assert_approx(ch.bbox.height(), 12.0, "height");
    }

    // ----- 7. Word spacing -----

    #[test]
    fn word_spacing_applied_for_space() {
        let event = space_event(0.0, 3.0);
        let ch = char_from_event(&event, &space_metrics(), PAGE_HEIGHT, None, None);

        // w_norm = 0.25 + (0 + 3)/12 = 0.5, so width = 12 * 0.5 = 6.0.
        assert_approx(ch.bbox.width(), 6.0, "width with word_spacing");
    }

    #[test]
    fn word_spacing_not_applied_for_non_space() {
        let ch = extract(&CharEvent {
            word_spacing: 3.0, // must be ignored: char_code is 65, not 32
            ..default_event()
        });

        // w_norm stays at 0.667, exactly as with no spacing at all.
        assert_approx(ch.bbox.width(), 8.004, "width without word_spacing");
    }

    // ----- 8. Upright flag -----

    #[test]
    fn upright_for_horizontal_text() {
        let ch = extract(&default_event());
        assert!(ch.upright);
    }

    #[test]
    fn not_upright_for_rotated_text() {
        let ch = extract(&with_text_matrix([0.0, 1.0, -1.0, 0.0, 100.0, 500.0]));
        assert!(!ch.upright);
    }

    // ----- 9. Text direction -----

    #[test]
    fn direction_ltr_for_normal_text() {
        let ch = extract(&default_event());
        assert_eq!(ch.direction, TextDirection::Ltr);
    }

    #[test]
    fn direction_rtl_for_mirrored_text() {
        let ch = extract(&with_text_matrix([-1.0, 0.0, 0.0, 1.0, 300.0, 720.0]));
        assert_eq!(ch.direction, TextDirection::Rtl);
    }

    #[test]
    fn direction_ttb_for_downward_text() {
        // 90 degrees CW: the advance points down the page.
        let ch = extract(&with_text_matrix([0.0, -1.0, 1.0, 0.0, 100.0, 700.0]));
        assert_eq!(ch.direction, TextDirection::Ttb);
    }

    #[test]
    fn direction_btt_for_upward_text() {
        // 90 degrees CCW: the advance points up the page.
        let ch = extract(&with_text_matrix([0.0, 1.0, -1.0, 0.0, 100.0, 100.0]));
        assert_eq!(ch.direction, TextDirection::Btt);
    }

    // ----- 10. Unicode text and its fallbacks -----

    #[test]
    fn unicode_from_event() {
        let ch = extract(&CharEvent {
            unicode: Some("B".to_string()),
            char_code: 66,
            ..default_event()
        });
        assert_eq!(ch.text, "B");
    }

    #[test]
    fn unicode_fallback_to_char_code() {
        let ch = extract(&CharEvent {
            unicode: None,
            char_code: 65, // valid code point for 'A'
            ..default_event()
        });
        // With no decoded string, char::from_u32(65) supplies 'A'.
        assert_eq!(ch.text, "A");
    }

    #[test]
    fn unicode_fallback_replacement_for_invalid() {
        let ch = extract(&CharEvent {
            unicode: None,
            char_code: 0xFFFFFFFF, // not a valid Unicode code point
            ..default_event()
        });
        assert_eq!(ch.text, "\u{FFFD}");
    }

    // ----- 11. Vertical flip -----

    #[test]
    fn y_flip_converts_to_top_left_origin() {
        // Baseline at y = 100 in PDF space, i.e. low on the page.
        let ch = extract(&with_text_matrix([1.0, 0.0, 0.0, 1.0, 72.0, 100.0]));

        // PDF space: min_y = 97, max_y = 109.
        // Flipped: top = 792 - 109 = 683, bottom = 792 - 97 = 695.
        assert_approx(ch.bbox.top, 683.0, "top near page bottom");
        assert_approx(ch.bbox.bottom, 695.0, "bottom near page bottom");
        // char_from_event sets doctop to the same value as top.
        assert_approx(ch.doctop, 683.0, "doctop");
    }

    // ----- 12. Colors are forwarded untouched -----

    #[test]
    fn colors_passed_through() {
        let stroking = Some(Color::Rgb(1.0, 0.0, 0.0));
        let non_stroking = Some(Color::Cmyk(0.0, 0.0, 0.0, 1.0));

        let ch = char_from_event(
            &default_event(),
            &default_metrics(),
            PAGE_HEIGHT,
            stroking.clone(),
            non_stroking.clone(),
        );

        assert_eq!(ch.stroking_color, stroking);
        assert_eq!(ch.non_stroking_color, non_stroking);
    }

    // ----- 13. Horizontal scaling -----

    #[test]
    fn horizontal_scaling_affects_width() {
        let ch = extract(&CharEvent {
            h_scaling: 0.5, // 50% horizontal scaling
            ..default_event()
        });

        // Trm = [6, 0, 0, 12, 72, 720]; w_norm = 0.667; width = 6 * 0.667 = 4.002.
        assert_approx(ch.bbox.width(), 4.002, "width at 50% h_scaling");
        // Only the x-scale of Trm changes.
        assert_approx(ch.bbox.height(), 12.0, "height at 50% h_scaling");
    }

    // ----- 14. Library default metrics -----

    #[test]
    fn default_metrics_produce_reasonable_bbox() {
        let event = default_event();
        let metrics = FontMetrics::default_metrics();

        let ch = char_from_event(&event, &metrics, PAGE_HEIGHT, None, None);

        // A 12.0 height matches ascent 750 / descent -250:
        // (750 + 250) / 1000 * 12 = 12.0.
        assert_approx(ch.bbox.height(), 12.0, "height with default metrics");
        // Width comes from the event's displacement (667): 12 * 0.667 = 8.004.
        assert_approx(ch.bbox.width(), 8.004, "width with default metrics");
    }

    // ----- 15. CTM scaling -----

    #[test]
    fn ctm_scaling_affects_bbox() {
        let ch = extract(&CharEvent {
            text_matrix: [1.0, 0.0, 0.0, 1.0, 36.0, 360.0],
            ctm: [2.0, 0.0, 0.0, 2.0, 0.0, 0.0],
            ..default_event()
        });

        // [12, 0, 0, 12, 36, 360] x [2, 0, 0, 2, 0, 0] = [24, 0, 0, 24, 72, 720],
        // the same Trm as the font_size = 24 case.
        assert_approx(ch.bbox.width(), 16.008, "width with 2x CTM");
        assert_approx(ch.bbox.height(), 24.0, "height with 2x CTM");
    }

    // ----- 16. Zero font size -----

    #[test]
    fn zero_font_size_does_not_panic() {
        // The bbox collapses, but the call itself has to complete.
        let ch = extract(&CharEvent {
            font_size: 0.0,
            ..default_event()
        });
        assert_eq!(ch.size, 0.0);
    }

    // ----- 17. Character and word spacing together -----

    #[test]
    fn combined_spacing_for_space() {
        let event = space_event(1.0, 2.0);
        let ch = char_from_event(&event, &space_metrics(), PAGE_HEIGHT, None, None);

        // w_norm = 0.25 + (1 + 2)/12 = 0.5, so width = 12 * 0.5 = 6.0.
        assert_approx(ch.bbox.width(), 6.0, "width with combined spacing");
    }
}