Skip to main content

pdfplumber_parse/
font_metrics.rs

1//! Font metrics extraction from PDF font dictionaries.
2//!
3//! Parses /Widths, /FirstChar, /LastChar, and /FontDescriptor to provide
4//! glyph widths, ascent, and descent for character bounding box calculation.
5
6use crate::error::BackendError;
7
/// Fallback font ascent in glyph space units (750/1000 of text space),
/// used when the font does not provide one.
const DEFAULT_ASCENT: f64 = 750.0;

/// Fallback font descent in glyph space units (-250/1000 of text space),
/// used when the font does not provide one.
const DEFAULT_DESCENT: f64 = -250.0;

/// Fallback glyph width in glyph space units (600/1000 of text space),
/// used when the font does not provide one.
const DEFAULT_WIDTH: f64 = 600.0;
16
17/// Font metrics extracted from a PDF font dictionary.
18///
19/// Stores glyph widths and font descriptor information (ascent, descent,
20/// bounding box) needed to calculate character bounding boxes.
21///
22/// Width values are in glyph space units (1/1000 of text space).
23#[derive(Debug, Clone)]
24pub struct FontMetrics {
25    /// Glyph widths indexed by (char_code - first_char).
26    widths: Vec<f64>,
27    /// First character code in the widths array.
28    first_char: u32,
29    /// Last character code in the widths array.
30    last_char: u32,
31    /// Default width for characters outside [first_char, last_char].
32    missing_width: f64,
33    /// Font ascent in glyph space units (positive, above baseline).
34    ascent: f64,
35    /// Font descent in glyph space units (negative, below baseline).
36    descent: f64,
37    /// Font bounding box [llx, lly, urx, ury] in glyph space units.
38    font_bbox: Option<[f64; 4]>,
39}
40
41impl FontMetrics {
42    /// Create FontMetrics from parsed PDF font dictionary values.
43    pub fn new(
44        widths: Vec<f64>,
45        first_char: u32,
46        last_char: u32,
47        missing_width: f64,
48        ascent: f64,
49        descent: f64,
50        font_bbox: Option<[f64; 4]>,
51    ) -> Self {
52        Self {
53            widths,
54            first_char,
55            last_char,
56            missing_width,
57            ascent,
58            descent,
59            font_bbox,
60        }
61    }
62
63    /// Create default FontMetrics for when font info is unavailable.
64    pub fn default_metrics() -> Self {
65        Self {
66            widths: Vec::new(),
67            first_char: 0,
68            last_char: 0,
69            missing_width: DEFAULT_WIDTH,
70            ascent: DEFAULT_ASCENT,
71            descent: DEFAULT_DESCENT,
72            font_bbox: None,
73        }
74    }
75
76    /// Get the width for a character code in glyph space (1/1000 of text space).
77    pub fn get_width(&self, char_code: u32) -> f64 {
78        if char_code >= self.first_char && char_code <= self.last_char {
79            let index = (char_code - self.first_char) as usize;
80            if index < self.widths.len() {
81                return self.widths[index];
82            }
83        }
84        self.missing_width
85    }
86
87    /// Font ascent in glyph space units (positive, above baseline).
88    pub fn ascent(&self) -> f64 {
89        self.ascent
90    }
91
92    /// Font descent in glyph space units (negative, below baseline).
93    pub fn descent(&self) -> f64 {
94        self.descent
95    }
96
97    /// Font bounding box [llx, lly, urx, ury] in glyph space units.
98    pub fn font_bbox(&self) -> Option<[f64; 4]> {
99        self.font_bbox
100    }
101
102    /// Missing width used for characters outside the widths range.
103    pub fn missing_width(&self) -> f64 {
104        self.missing_width
105    }
106
107    /// First character code in the widths array.
108    pub fn first_char(&self) -> u32 {
109        self.first_char
110    }
111
112    /// Last character code in the widths array.
113    pub fn last_char(&self) -> u32 {
114        self.last_char
115    }
116}
117
118/// Extract [`FontMetrics`] from a lopdf font dictionary.
119///
120/// Reads /Widths, /FirstChar, /LastChar from the font dictionary,
121/// and /Ascent, /Descent, /FontBBox, /MissingWidth from the /FontDescriptor.
122///
123/// Returns default metrics if essential fields are missing.
124pub fn extract_font_metrics(
125    doc: &lopdf::Document,
126    font_dict: &lopdf::Dictionary,
127) -> Result<FontMetrics, BackendError> {
128    // Parse /FirstChar and /LastChar
129    let first_char = font_dict
130        .get(b"FirstChar")
131        .ok()
132        .and_then(object_to_f64_opt)
133        .map(|v| v as u32)
134        .unwrap_or(0);
135
136    let last_char = font_dict
137        .get(b"LastChar")
138        .ok()
139        .and_then(object_to_f64_opt)
140        .map(|v| v as u32)
141        .unwrap_or(0);
142
143    // Parse /Widths array
144    let widths = match font_dict.get(b"Widths") {
145        Ok(obj) => {
146            let obj = resolve_object(doc, obj);
147            match obj.as_array() {
148                Ok(arr) => arr
149                    .iter()
150                    .map(|o| {
151                        let o = resolve_object(doc, o);
152                        object_to_f64_opt(o).unwrap_or(0.0)
153                    })
154                    .collect(),
155                Err(_) => Vec::new(),
156            }
157        }
158        Err(_) => Vec::new(),
159    };
160
161    // Parse /FontDescriptor
162    let desc_info = parse_font_descriptor(doc, font_dict)?;
163
164    Ok(FontMetrics::new(
165        widths,
166        first_char,
167        last_char,
168        desc_info.missing_width,
169        desc_info.ascent,
170        desc_info.descent,
171        desc_info.font_bbox,
172    ))
173}
174
/// Values read from a `/FontDescriptor` dictionary, with defaults already
/// substituted for anything that was absent.
struct FontDescriptorInfo {
    /// Font ascent in glyph space units.
    ascent: f64,
    /// Font descent in glyph space units.
    descent: f64,
    /// Font bounding box `[llx, lly, urx, ury]`, when one was available.
    font_bbox: Option<[f64; 4]>,
    /// Width to use for character codes without an explicit width.
    missing_width: f64,
}
182
183/// Parse /FontDescriptor dictionary for ascent, descent, bbox, and missing width.
184fn parse_font_descriptor(
185    doc: &lopdf::Document,
186    font_dict: &lopdf::Dictionary,
187) -> Result<FontDescriptorInfo, BackendError> {
188    let descriptor_dict = font_dict
189        .get(b"FontDescriptor")
190        .ok()
191        .map(|obj| resolve_object(doc, obj))
192        .and_then(|obj| obj.as_dict().ok());
193
194    let Some(desc) = descriptor_dict else {
195        return Ok(FontDescriptorInfo {
196            ascent: DEFAULT_ASCENT,
197            descent: DEFAULT_DESCENT,
198            font_bbox: None,
199            missing_width: DEFAULT_WIDTH,
200        });
201    };
202
203    let ascent = desc
204        .get(b"Ascent")
205        .ok()
206        .and_then(object_to_f64_opt)
207        .unwrap_or(DEFAULT_ASCENT);
208
209    let descent = desc
210        .get(b"Descent")
211        .ok()
212        .and_then(object_to_f64_opt)
213        .unwrap_or(DEFAULT_DESCENT);
214
215    let missing_width = desc
216        .get(b"MissingWidth")
217        .ok()
218        .and_then(object_to_f64_opt)
219        .unwrap_or(DEFAULT_WIDTH);
220
221    let font_bbox = desc
222        .get(b"FontBBox")
223        .ok()
224        .and_then(|o| {
225            let o = resolve_object(doc, o);
226            o.as_array().ok()
227        })
228        .and_then(|arr| {
229            if arr.len() == 4 {
230                let vals: Vec<f64> = arr.iter().filter_map(object_to_f64_opt).collect();
231                if vals.len() == 4 {
232                    Some([vals[0], vals[1], vals[2], vals[3]])
233                } else {
234                    None
235                }
236            } else {
237                None
238            }
239        });
240
241    Ok(FontDescriptorInfo {
242        ascent,
243        descent,
244        font_bbox,
245        missing_width,
246    })
247}
248
249/// Resolve an indirect reference to the actual object.
250fn resolve_object<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> &'a lopdf::Object {
251    match obj {
252        lopdf::Object::Reference(id) => doc.get_object(*id).unwrap_or(obj),
253        _ => obj,
254    }
255}
256
257/// Convert a lopdf object to f64, returning None if not a number.
258fn object_to_f64_opt(obj: &lopdf::Object) -> Option<f64> {
259    match obj {
260        lopdf::Object::Integer(i) => Some(*i as f64),
261        lopdf::Object::Real(f) => Some(*f as f64),
262        _ => None,
263    }
264}
265
#[cfg(test)]
mod tests {
    use super::*;
    use lopdf::{Document, Object, dictionary};

    // ========== Shared helpers ==========

    /// Metrics with default ascent/descent and no bounding box.
    fn width_only_metrics(
        widths: Vec<f64>,
        first_char: u32,
        last_char: u32,
        missing_width: f64,
    ) -> FontMetrics {
        FontMetrics::new(
            widths,
            first_char,
            last_char,
            missing_width,
            DEFAULT_ASCENT,
            DEFAULT_DESCENT,
            None,
        )
    }

    /// Assert that `actual` is within 1.0 of `expected`.
    ///
    /// Values written as `Object::Real` go through `f32`, so exact equality
    /// is not used for them.
    fn assert_close(actual: f64, expected: f64) {
        assert!((actual - expected).abs() < 1.0);
    }

    // ========== FontMetrics struct tests ==========

    #[test]
    fn width_lookup_within_range() {
        // Codes 65..=67 are 'A'..='C'.
        let metrics = width_only_metrics(vec![250.0, 500.0, 750.0], 65, 67, 0.0);
        for (code, expected) in [(65, 250.0), (66, 500.0), (67, 750.0)] {
            assert_eq!(metrics.get_width(code), expected);
        }
    }

    #[test]
    fn width_lookup_out_of_range_returns_missing_width() {
        let metrics = width_only_metrics(vec![250.0, 500.0], 65, 66, 300.0);
        // One below first_char.
        assert_eq!(metrics.get_width(64), 300.0);
        // One above last_char.
        assert_eq!(metrics.get_width(67), 300.0);
    }

    #[test]
    fn width_lookup_with_zero_missing_width() {
        // Only the space character (32) has a width.
        let metrics = width_only_metrics(vec![600.0], 32, 32, 0.0);
        assert_eq!(metrics.get_width(32), 600.0);
        // Out of range, so the zero missing width is reported.
        assert_eq!(metrics.get_width(65), 0.0);
    }

    #[test]
    fn width_lookup_empty_widths_returns_missing_width() {
        let metrics = width_only_metrics(vec![], 0, 0, 500.0);
        for code in [0, 65] {
            assert_eq!(metrics.get_width(code), 500.0);
        }
    }

    #[test]
    fn width_lookup_widths_shorter_than_range() {
        // The range 65..=70 spans six codes but only two widths are stored.
        let metrics = width_only_metrics(vec![250.0, 500.0], 65, 70, 300.0);
        assert_eq!(metrics.get_width(65), 250.0);
        assert_eq!(metrics.get_width(66), 500.0);
        // Index 2 is past the two stored widths, so the fallback applies.
        assert_eq!(metrics.get_width(67), 300.0);
    }

    #[test]
    fn ascent_and_descent() {
        let metrics = FontMetrics::new(vec![], 0, 0, 0.0, 800.0, -200.0, None);
        assert_eq!(metrics.ascent(), 800.0);
        assert_eq!(metrics.descent(), -200.0);
    }

    #[test]
    fn font_bbox_some() {
        let expected_bbox = [-100.0, -250.0, 1100.0, 900.0];
        let metrics = FontMetrics::new(vec![], 0, 0, 0.0, 0.0, 0.0, Some(expected_bbox));
        assert_eq!(metrics.font_bbox(), Some([-100.0, -250.0, 1100.0, 900.0]));
    }

    #[test]
    fn font_bbox_none() {
        let metrics = FontMetrics::new(vec![], 0, 0, 0.0, 0.0, 0.0, None);
        assert_eq!(metrics.font_bbox(), None);
    }

    #[test]
    fn default_metrics_values() {
        let metrics = FontMetrics::default_metrics();
        assert_eq!(metrics.ascent(), DEFAULT_ASCENT);
        assert_eq!(metrics.descent(), DEFAULT_DESCENT);
        assert_eq!(metrics.missing_width(), DEFAULT_WIDTH);
        assert_eq!(metrics.first_char(), 0);
        assert_eq!(metrics.last_char(), 0);
        assert_eq!(metrics.font_bbox(), None);
        // With an empty width table every code gets the default width.
        assert_eq!(metrics.get_width(65), DEFAULT_WIDTH);
    }

    #[test]
    fn first_char_last_char_accessors() {
        let metrics = FontMetrics::new(vec![500.0], 32, 32, 0.0, 0.0, 0.0, None);
        assert_eq!(metrics.first_char(), 32);
        assert_eq!(metrics.last_char(), 32);
    }

    #[test]
    fn width_lookup_large_char_code() {
        let metrics = FontMetrics::new(vec![600.0], 0xFFFF, 0xFFFF, 0.0, 0.0, 0.0, None);
        assert_eq!(metrics.get_width(0xFFFF), 600.0);
        assert_eq!(metrics.get_width(0xFFFE), 0.0);
    }

    // ========== extract_font_metrics tests (lopdf parsing) ==========

    /// Helper: build a font dictionary whose /Widths array is stored as an
    /// indirect object in `doc`, together with /FirstChar and /LastChar.
    fn create_font_dict_with_widths(
        doc: &mut Document,
        widths: &[f64],
        first_char: i64,
        last_char: i64,
    ) -> lopdf::Dictionary {
        let mut width_objects: Vec<Object> = Vec::with_capacity(widths.len());
        for width in widths {
            width_objects.push(Object::Real(*width as f32));
        }
        let widths_id = doc.add_object(Object::Array(width_objects));

        dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
            "FirstChar" => first_char,
            "LastChar" => last_char,
            "Widths" => widths_id,
        }
    }

    /// Helper: store a /FontDescriptor in `doc` and reference it from
    /// `font_dict`.
    fn add_font_descriptor(
        doc: &mut Document,
        font_dict: &mut lopdf::Dictionary,
        ascent: f64,
        descent: f64,
        missing_width: Option<f64>,
        font_bbox: Option<[f64; 4]>,
    ) {
        let mut descriptor = dictionary! {
            "Type" => "FontDescriptor",
            "FontName" => "Helvetica",
            "Ascent" => Object::Real(ascent as f32),
            "Descent" => Object::Real(descent as f32),
        };
        if let Some(width) = missing_width {
            descriptor.set("MissingWidth", Object::Real(width as f32));
        }
        if let Some(corners) = font_bbox {
            let corner_objects: Vec<Object> = corners
                .iter()
                .map(|corner| Object::Real(*corner as f32))
                .collect();
            descriptor.set("FontBBox", Object::Array(corner_objects));
        }
        let descriptor_id = doc.add_object(Object::Dictionary(descriptor));
        font_dict.set("FontDescriptor", descriptor_id);
    }

    #[test]
    fn extract_metrics_with_widths_and_descriptor() {
        let mut doc = Document::with_version("1.5");
        let mut font_dict = create_font_dict_with_widths(&mut doc, &[278.0, 556.0, 722.0], 65, 67);
        add_font_descriptor(
            &mut doc,
            &mut font_dict,
            718.0,
            -207.0,
            Some(278.0),
            Some([-166.0, -225.0, 1000.0, 931.0]),
        );

        let metrics = extract_font_metrics(&doc, &font_dict).unwrap();

        // 'A', 'B', 'C' come from the table; 'D' gets the missing width.
        for (code, expected) in [(65, 278.0), (66, 556.0), (67, 722.0), (68, 278.0)] {
            assert_eq!(metrics.get_width(code), expected);
        }
        assert_close(metrics.ascent(), 718.0);
        assert_close(metrics.descent(), -207.0);
        assert!(metrics.font_bbox().is_some());
    }

    #[test]
    fn extract_metrics_without_font_descriptor() {
        let mut doc = Document::with_version("1.5");
        // Deliberately no /FontDescriptor.
        let font_dict = create_font_dict_with_widths(&mut doc, &[500.0, 600.0], 32, 33);

        let metrics = extract_font_metrics(&doc, &font_dict).unwrap();

        assert_eq!(metrics.get_width(32), 500.0);
        assert_eq!(metrics.get_width(33), 600.0);
        // Descriptor values fall back to the defaults.
        assert_eq!(metrics.ascent(), DEFAULT_ASCENT);
        assert_eq!(metrics.descent(), DEFAULT_DESCENT);
        assert_eq!(metrics.missing_width(), DEFAULT_WIDTH);
    }

    #[test]
    fn extract_metrics_without_widths() {
        let mut doc = Document::with_version("1.5");
        let mut font_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Helvetica",
        };
        add_font_descriptor(&mut doc, &mut font_dict, 800.0, -200.0, Some(500.0), None);

        let metrics = extract_font_metrics(&doc, &font_dict).unwrap();

        // Without a width table every code reports the missing width.
        assert_eq!(metrics.get_width(65), 500.0);
        assert_close(metrics.ascent(), 800.0);
        assert_close(metrics.descent(), -200.0);
    }

    #[test]
    fn extract_metrics_empty_font_dict() {
        let doc = Document::with_version("1.5");
        let font_dict = dictionary! {};

        let metrics = extract_font_metrics(&doc, &font_dict).unwrap();

        // Nothing is specified, so every value is a default.
        assert_eq!(metrics.ascent(), DEFAULT_ASCENT);
        assert_eq!(metrics.descent(), DEFAULT_DESCENT);
        assert_eq!(metrics.missing_width(), DEFAULT_WIDTH);
        assert_eq!(metrics.get_width(65), DEFAULT_WIDTH);
    }

    #[test]
    fn extract_metrics_descriptor_without_missing_width() {
        let mut doc = Document::with_version("1.5");
        let mut font_dict = create_font_dict_with_widths(&mut doc, &[400.0], 65, 65);
        add_font_descriptor(&mut doc, &mut font_dict, 700.0, -300.0, None, None);

        let metrics = extract_font_metrics(&doc, &font_dict).unwrap();

        assert_eq!(metrics.get_width(65), 400.0);
        // A descriptor without /MissingWidth yields DEFAULT_WIDTH.
        assert_eq!(metrics.missing_width(), DEFAULT_WIDTH);
    }

    #[test]
    fn extract_metrics_with_integer_widths() {
        let mut doc = Document::with_version("1.5");
        // Widths stored as Integer objects rather than Real.
        let widths_id = doc.add_object(Object::Array(vec![
            Object::Integer(250),
            Object::Integer(500),
        ]));

        let font_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "TrueType",
            "BaseFont" => "Arial",
            "FirstChar" => 65i64,
            "LastChar" => 66i64,
            "Widths" => widths_id,
        };

        let metrics = extract_font_metrics(&doc, &font_dict).unwrap();

        assert_eq!(metrics.get_width(65), 250.0);
        assert_eq!(metrics.get_width(66), 500.0);
    }

    #[test]
    fn extract_metrics_with_font_bbox() {
        let mut doc = Document::with_version("1.5");
        let mut font_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Courier",
        };
        add_font_descriptor(
            &mut doc,
            &mut font_dict,
            629.0,
            -157.0,
            Some(600.0),
            Some([-23.0, -250.0, 715.0, 805.0]),
        );

        let metrics = extract_font_metrics(&doc, &font_dict).unwrap();

        let [llx, lly, urx, ury] = metrics.font_bbox().unwrap();
        assert_close(llx, -23.0);
        assert_close(lly, -250.0);
        assert_close(urx, 715.0);
        assert_close(ury, 805.0);
    }

    #[test]
    fn extract_metrics_integer_first_last_char() {
        let mut doc = Document::with_version("1.5");
        let widths_id = doc.add_object(Object::Array(vec![Object::Integer(600)]));

        let font_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Courier",
            "FirstChar" => 32i64,
            "LastChar" => 32i64,
            "Widths" => widths_id,
        };

        let metrics = extract_font_metrics(&doc, &font_dict).unwrap();

        assert_eq!(metrics.first_char(), 32);
        assert_eq!(metrics.last_char(), 32);
        assert_eq!(metrics.get_width(32), 600.0);
    }

    #[test]
    fn extract_metrics_indirect_font_descriptor() {
        let mut doc = Document::with_version("1.5");
        let descriptor_id = doc.add_object(Object::Dictionary(dictionary! {
            "Type" => "FontDescriptor",
            "FontName" => "Times-Roman",
            "Ascent" => Object::Real(683.0),
            "Descent" => Object::Real(-217.0),
            "MissingWidth" => Object::Integer(250),
        }));

        let font_dict = dictionary! {
            "Type" => "Font",
            "Subtype" => "Type1",
            "BaseFont" => "Times-Roman",
            "FontDescriptor" => descriptor_id,
        };

        let metrics = extract_font_metrics(&doc, &font_dict).unwrap();

        assert_close(metrics.ascent(), 683.0);
        assert_close(metrics.descent(), -217.0);
        assert_close(metrics.missing_width(), 250.0);
    }

    #[test]
    fn width_as_get_width_callback() {
        // FontMetrics must be usable behind a `Fn(u32) -> f64` width callback.
        let metrics = FontMetrics::new(
            vec![278.0, 556.0, 722.0],
            65,
            67,
            278.0,
            718.0,
            -207.0,
            None,
        );
        let get_width: &dyn Fn(u32) -> f64 = &|code| metrics.get_width(code);
        assert_eq!(get_width(65), 278.0);
        assert_eq!(get_width(66), 556.0);
        // 68 is outside the table, so the missing width is returned.
        assert_eq!(get_width(68), 278.0);
    }
}