Skip to main content

zpdf_document/
font_loader.rs

1use zpdf_core::{ObjectId, PdfObject, Result};
2use zpdf_font::{CidWidths, FontCache, LoadedFont, PdfFontType};
3use zpdf_parser::PdfFile;
4
5use crate::page::PdfPage;
6
7/// Load all fonts referenced by a page into a FontCache.
8pub fn load_page_fonts(file: &PdfFile, page: &PdfPage) -> FontCache {
9    let mut cache = FontCache::new();
10
11    for (name, &font_ref) in &page.resources.fonts {
12        match load_single_font(file, font_ref) {
13            Ok(font) => {
14                cache.insert(name.clone(), font);
15            }
16            Err(e) => {
17                tracing::debug!("font {name} ({font_ref}): fallback - {e}");
18                cache.insert(name.clone(), LoadedFont::new_placeholder(name.clone()));
19            }
20        }
21    }
22
23    cache
24}
25
26pub fn load_single_font(file: &PdfFile, font_ref: ObjectId) -> Result<LoadedFont> {
27    let obj = file.resolve(font_ref)?;
28    let dict = obj.as_dict()?;
29
30    let subtype = dict.get_name("Subtype").unwrap_or("");
31    let base_font = dict.get_name("BaseFont").unwrap_or("Unknown").to_string();
32
33    let mut font = match subtype {
34        "Type0" => load_type0_font(file, dict, base_font)?,
35        "TrueType" => load_truetype_font(file, dict, base_font)?,
36        "Type3" => load_type3_font(file, dict, base_font)?,
37        "Type1" | "MMType1" => load_type1_font(file, dict, base_font)?,
38        _ => LoadedFont::new_placeholder(base_font),
39    };
40
41    attach_text_mappings(file, dict, subtype, &mut font);
42    // A substituted composite font needs /ToUnicode (attached just above) to
43    // route CIDs through the system face's Unicode cmap.
44    font.build_substitute_cid_to_gid();
45    Ok(font)
46}
47
48/// FontDescriptor-derived hints for system-font substitution.
49fn substitute_hints(
50    file: &PdfFile,
51    dict: &zpdf_core::PdfDict,
52) -> zpdf_font::system::SubstituteHints {
53    let mut hints = zpdf_font::system::SubstituteHints::default();
54    if let Ok(fd_ref) = dict.get_ref("FontDescriptor") {
55        if let Ok(fd) = file.resolve(fd_ref) {
56            if let Ok(fd) = fd.as_dict() {
57                if let Ok(flags) = fd.get_i64("Flags") {
58                    hints.fixed_pitch = flags & 1 != 0;
59                    hints.serif = flags & 2 != 0;
60                    hints.italic = flags & 64 != 0;
61                    hints.bold = flags & (1 << 18) != 0; // ForceBold
62                }
63                if let Ok(w) = fd.get_f64("StemV") {
64                    hints.bold |= w >= 160.0;
65                }
66            }
67        }
68    }
69    hints
70}
71
72/// Try to substitute an installed system font for a non-embedded simple font.
73/// The PDF /Widths stay authoritative for advances when present; otherwise the
74/// standard-14 metrics (if the name matches one) seed the widths.
75fn try_system_substitute_simple(
76    file: &PdfFile,
77    dict: &zpdf_core::PdfDict,
78    base_font: &str,
79    font_type: PdfFontType,
80    mut cid_widths: CidWidths,
81) -> Option<LoadedFont> {
82    let hints = substitute_hints(file, dict);
83    let m = zpdf_font::system::find_system_font(base_font, hints, None)?;
84    if cid_widths.is_empty() {
85        if let Some(metrics) = zpdf_font::standard_fonts::lookup(base_font) {
86            for (code, &w) in metrics.widths.iter().enumerate() {
87                if w > 0 {
88                    cid_widths.set(code as u16, w as f64);
89                }
90            }
91        }
92    }
93    LoadedFont::new_substitute(
94        font_type,
95        base_font.to_string(),
96        m.data,
97        m.face_index,
98        cid_widths,
99    )
100}
101
102/// Attach the simple-font /Encoding, the symbolic flag, and /ToUnicode (for
103/// text extraction) to a freshly-loaded font.
104fn attach_text_mappings(
105    file: &PdfFile,
106    dict: &zpdf_core::PdfDict,
107    subtype: &str,
108    font: &mut LoadedFont,
109) {
110    // /ToUnicode lives at the top-level font dict for both simple and Type0 fonts.
111    if let Ok(tu_ref) = dict.get_ref("ToUnicode") {
112        if let Ok(data) = file.resolve_stream_data(tu_ref) {
113            let map = zpdf_font::cmap::ToUnicodeMap::parse(&data);
114            if !map.is_empty() {
115                font.to_unicode = Some(map);
116            }
117        }
118    }
119
120    // /Encoding and the symbolic flag apply only to simple (non-composite) fonts.
121    if subtype == "Type0" {
122        return;
123    }
124
125    font.symbolic = font_descriptor_symbolic(file, dict);
126
127    let encoding = if dict.get("Encoding").is_none() {
128        // No explicit /Encoding: the Symbol/ZapfDingbats standard fonts carry their
129        // own built-in encoding; other symbolic fonts use the font program's cmap.
130        builtin_symbol_encoding(&font.base_font)
131            .or_else(|| parse_encoding(file, dict, subtype, font.symbolic))
132    } else {
133        parse_encoding(file, dict, subtype, font.symbolic)
134    };
135    if let Some(enc) = encoding {
136        font.encoding = Some(enc);
137    }
138
139    // With encoding and widths in place, recover Quartz-subset glyphs that are
140    // reachable through no declared encoding (charset entries named ".notdef").
141    font.map_unencoded_orphans();
142}
143
144/// The built-in encoding for the Symbol / ZapfDingbats standard fonts, matched by
145/// BaseFont (ignoring any subset prefix). Used when no explicit /Encoding is given,
146/// so symbolic Symbol/Dingbats text is still extractable via the glyph list.
147fn builtin_symbol_encoding(base_font: &str) -> Option<zpdf_font::encoding::Encoding> {
148    use zpdf_font::encoding::{base_encoding_by_name, Encoding};
149    let name = base_font.rsplit('+').next().unwrap_or(base_font);
150    let canonical = if name.contains("ZapfDingbats") || name.contains("Dingbats") {
151        "ZapfDingbats"
152    } else if name.contains("Symbol") {
153        "Symbol"
154    } else {
155        return None;
156    };
157    base_encoding_by_name(canonical).map(Encoding::from_base)
158}
159
160/// Read the FontDescriptor /Flags and decide whether the font is symbolic
161/// (bit 3 set, bit 6 clear).
162fn font_descriptor_symbolic(file: &PdfFile, dict: &zpdf_core::PdfDict) -> bool {
163    let fd_ref = match dict.get_ref("FontDescriptor") {
164        Ok(r) => r,
165        Err(_) => return false,
166    };
167    let flags = file
168        .resolve(fd_ref)
169        .ok()
170        .and_then(|o| o.as_dict().ok().and_then(|d| d.get_i64("Flags").ok()));
171    matches!(flags, Some(f) if (f & 4) != 0 && (f & 32) == 0)
172}
173
174/// Build the effective simple-font encoding from /Encoding (a name, a dict with
175/// /BaseEncoding + /Differences, or absent).
176fn parse_encoding(
177    file: &PdfFile,
178    dict: &zpdf_core::PdfDict,
179    subtype: &str,
180    symbolic: bool,
181) -> Option<zpdf_font::encoding::Encoding> {
182    use zpdf_font::encoding::{base_encoding_by_name, Encoding};
183
184    let enc_obj = match dict.get("Encoding").cloned() {
185        Some(PdfObject::Ref(r)) => file.resolve(r).ok(),
186        other => other,
187    };
188
189    match enc_obj {
190        Some(PdfObject::Name(n)) => base_encoding_by_name(n.as_str()).map(Encoding::from_base),
191        Some(PdfObject::Dict(enc_dict)) => {
192            let base = enc_dict
193                .get_name("BaseEncoding")
194                .ok()
195                .and_then(base_encoding_by_name)
196                .unwrap_or_else(|| default_simple_base(subtype));
197            let mut encoding = Encoding::from_base(base);
198            apply_differences(&enc_dict, &mut encoding);
199            Some(encoding)
200        }
201        // No /Encoding: symbolic fonts use their built-in cmap; others get a default.
202        _ if symbolic => None,
203        _ => Some(Encoding::from_base(default_simple_base(subtype))),
204    }
205}
206
207fn default_simple_base(subtype: &str) -> &'static zpdf_font::encoding::EncodingTable {
208    match subtype {
209        "TrueType" => &zpdf_font::encoding::WIN_ANSI_ENCODING,
210        _ => &zpdf_font::encoding::STANDARD_ENCODING,
211    }
212}
213
214fn apply_differences(enc_dict: &zpdf_core::PdfDict, encoding: &mut zpdf_font::encoding::Encoding) {
215    if let Ok(diffs) = enc_dict.get_array("Differences") {
216        let mut code = 0u32;
217        for obj in diffs {
218            match obj {
219                PdfObject::Integer(n) => code = (*n).max(0) as u32,
220                PdfObject::Name(name) => {
221                    if code <= 255 {
222                        encoding.apply_difference(code as u8, name.as_str());
223                    }
224                    code += 1;
225                }
226                _ => {}
227            }
228        }
229    }
230}
231
232/// Resolve a Type0 font's /Encoding into a code → CID CMap: a predefined
233/// name, or an embedded CMap stream. Unknown legacy CMaps fall back to
234/// Identity-H with a warning.
235fn parse_type0_encoding(file: &PdfFile, dict: &zpdf_core::PdfDict) -> zpdf_font::cmap::CidCMap {
236    use zpdf_font::cmap::CidCMap;
237    // Unknown (legacy byte-encoded) CMaps degrade to Identity, but the
238    // writing mode is still known from the -V suffix and kept.
239    fn identity_fallback(name: &str) -> CidCMap {
240        let wmode = name.ends_with("-V") as u8;
241        tracing::warn!(
242            "unsupported predefined CMap {name}; using Identity-{}",
243            if wmode == 1 { "V" } else { "H" }
244        );
245        CidCMap::identity(wmode)
246    }
247    match dict.get("Encoding") {
248        Some(PdfObject::Name(n)) => {
249            CidCMap::predefined(n.as_str()).unwrap_or_else(|| identity_fallback(n.as_str()))
250        }
251        Some(PdfObject::Ref(r)) => match file.resolve(*r) {
252            Ok(PdfObject::Name(n)) => {
253                CidCMap::predefined(n.as_str()).unwrap_or_else(|| identity_fallback(n.as_str()))
254            }
255            Ok(PdfObject::Stream(s)) => {
256                let data = file
257                    .resolve_stream_data(*r)
258                    .or_else(|_| zpdf_parser::filters::decode_stream(&s.data, &s.dict));
259                let mut cmap = match data {
260                    Ok(d) => CidCMap::parse(&d),
261                    Err(e) => {
262                        tracing::warn!("undecodable embedded CMap: {e}; using Identity-H");
263                        CidCMap::identity(0)
264                    }
265                };
266                // /WMode may also live on the stream dict.
267                if let Ok(1) = s.dict.get_i64("WMode") {
268                    cmap.wmode = 1;
269                }
270                cmap
271            }
272            _ => CidCMap::identity(0),
273        },
274        _ => CidCMap::identity(0),
275    }
276}
277
278/// /DW2 vertical metrics from a CID font dict: [vy w1y], default [880 −1000].
279fn parse_dw2(file: &PdfFile, desc_dict: &zpdf_core::PdfDict) -> (f64, f64) {
280    resolve_array(file, desc_dict, "DW2")
281        .and_then(|arr| {
282            let v: Vec<f64> = arr.iter().filter_map(|o| o.as_f64().ok()).collect();
283            (v.len() >= 2).then(|| (v[0], v[1]))
284        })
285        .unwrap_or((880.0, -1000.0))
286}
287
288fn load_type0_font(
289    file: &PdfFile,
290    dict: &zpdf_core::PdfDict,
291    base_font: String,
292) -> Result<LoadedFont> {
293    // /DescendantFonts is commonly an indirect reference to the array.
294    let descendants = resolve_array(file, dict, "DescendantFonts")
295        .ok_or_else(|| zpdf_core::Error::MissingKey("DescendantFonts".into()))?;
296    let desc_ref = descendants
297        .first()
298        .ok_or_else(|| zpdf_core::Error::MissingKey("DescendantFonts[0]".into()))?
299        .as_ref()?;
300
301    let desc_obj = file.resolve(desc_ref)?;
302    let desc_dict = desc_obj.as_dict()?;
303
304    let mut cid_widths = parse_cid_widths(file, desc_dict);
305    parse_cid_w2(file, desc_dict, &mut cid_widths);
306    let cmap = parse_type0_encoding(file, dict);
307    let dw2 = parse_dw2(file, desc_dict);
308
309    let font_data = extract_font_file(file, desc_dict);
310
311    let mut font = match font_data {
312        Some(data) => {
313            let mut font =
314                LoadedFont::new_with_data(PdfFontType::Type0CidType2, base_font, data, cid_widths);
315            // /CIDToGIDMap stream: explicit CID → GID table, authoritative for
316            // CIDFontType2 (TrueType-based) descendants. A raw-CFF CIDFontType0
317            // descendant keeps its charset-derived map built in new_with_data —
318            // there /CIDToGIDMap is not even a legal key.
319            if let Some(map) = parse_cid_to_gid_stream(file, desc_dict) {
320                let subtype = desc_dict.get_name("Subtype").unwrap_or("");
321                if subtype == "CIDFontType2" || font.cid_to_gid.is_none() {
322                    font.cid_to_gid = Some(map);
323                }
324            }
325            font
326        }
327        None => {
328            // Non-embedded composite font (typically CJK): substitute a system
329            // face. CIDs are remapped through /ToUnicode once it is attached
330            // (see build_substitute_cid_to_gid in load_single_font).
331            let ordering = resolve_dict(file, desc_dict, "CIDSystemInfo").and_then(|csi| match csi
332                .get("Ordering")
333            {
334                Some(PdfObject::String(s)) => Some(s.to_string_lossy()),
335                Some(PdfObject::Name(n)) => Some(n.as_str().to_string()),
336                _ => None,
337            });
338            let hints = substitute_hints(file, desc_dict);
339            let substituted =
340                zpdf_font::system::find_system_font(&base_font, hints, ordering.as_deref())
341                    .and_then(|m| {
342                        LoadedFont::new_substitute(
343                            PdfFontType::Type0CidType2,
344                            base_font.clone(),
345                            m.data,
346                            m.face_index,
347                            cid_widths,
348                        )
349                    });
350            substituted.unwrap_or_else(|| LoadedFont::new_placeholder(base_font))
351        }
352    };
353    font.cid_cmap = Some(cmap);
354    font.dw2 = dw2;
355    // A Unicode-coded CMap is only usable when the font program can resolve
356    // Unicode; otherwise fall back to Identity (codes pass through as CIDs).
357    font.validate_cid_cmap();
358    Ok(font)
359}
360
361/// Decode a /CIDToGIDMap stream into a CID → GID table: two bytes per CID,
362/// big-endian, indexed by CID. Returns `None` for /Identity, absence, or any
363/// non-stream form, which keeps the identity (or charset-derived) behavior.
364/// CIDs mapped to GID 0 (.notdef) are omitted — `glyph_outline` treats a
365/// missing entry as "no glyph", which matches the spec semantics.
366fn parse_cid_to_gid_stream(
367    file: &PdfFile,
368    desc_dict: &zpdf_core::PdfDict,
369) -> Option<std::collections::HashMap<u16, u16>> {
370    let stream_ref = match desc_dict.get("CIDToGIDMap") {
371        Some(PdfObject::Ref(r)) => *r,
372        // /Identity (the common name form), absent, or malformed.
373        _ => return None,
374    };
375    let data = match file.resolve_stream_data(stream_ref) {
376        Ok(d) => d,
377        Err(e) => {
378            // e.g. an indirect /Identity name, or an undecodable stream.
379            tracing::debug!("CIDToGIDMap {stream_ref}: not a decodable stream - {e}");
380            return None;
381        }
382    };
383    let mut map = std::collections::HashMap::new();
384    for (cid, gid_bytes) in data.chunks_exact(2).enumerate().take(u16::MAX as usize + 1) {
385        let gid = u16::from_be_bytes([gid_bytes[0], gid_bytes[1]]);
386        if gid != 0 {
387            map.insert(cid as u16, gid);
388        }
389    }
390    if map.is_empty() {
391        None
392    } else {
393        Some(map)
394    }
395}
396
397fn load_truetype_font(
398    file: &PdfFile,
399    dict: &zpdf_core::PdfDict,
400    base_font: String,
401) -> Result<LoadedFont> {
402    let cid_widths = parse_simple_widths(file, dict);
403    let font_data = extract_font_file_from_descriptor(file, dict);
404
405    match font_data {
406        Some(data) => Ok(LoadedFont::new_with_data(
407            PdfFontType::TrueType,
408            base_font,
409            data,
410            cid_widths,
411        )),
412        None => Ok(try_system_substitute_simple(
413            file,
414            dict,
415            &base_font,
416            PdfFontType::TrueType,
417            cid_widths,
418        )
419        .or_else(|| LoadedFont::new_standard(base_font.clone()))
420        .unwrap_or_else(|| LoadedFont::new_placeholder(base_font))),
421    }
422}
423
424fn load_type3_font(
425    file: &PdfFile,
426    dict: &zpdf_core::PdfDict,
427    base_font: String,
428) -> Result<LoadedFont> {
429    use std::sync::Arc;
430
431    // All four Type3 keys are commonly emitted as indirect objects; a direct-only
432    // read would silently drop every glyph, so resolve one level of indirection.
433
434    // FontMatrix: typically [0.001 0 0 -0.001 0 0] for 1000-unit glyph space
435    let font_matrix = {
436        let mut m = [0.001, 0.0, 0.0, -0.001, 0.0, 0.0];
437        if let Some(arr) = resolve_array(file, dict, "FontMatrix") {
438            for (i, obj) in arr.iter().enumerate().take(6) {
439                if let Ok(v) = obj.as_f64() {
440                    m[i] = v;
441                }
442            }
443        }
444        m
445    };
446
447    // Encoding/Differences → glyph name list
448    let mut encoding = Vec::new();
449    if let Some(enc_dict) = resolve_dict(file, dict, "Encoding") {
450        if let Some(diffs) = resolve_array(file, &enc_dict, "Differences") {
451            let mut current_code = 0usize;
452            for obj in &diffs {
453                match obj {
454                    PdfObject::Integer(n) => {
455                        current_code = *n as usize;
456                        while encoding.len() < current_code {
457                            encoding.push(String::new());
458                        }
459                    }
460                    PdfObject::Name(n) => {
461                        while encoding.len() <= current_code {
462                            encoding.push(String::new());
463                        }
464                        encoding[current_code] = n.0.clone();
465                        current_code += 1;
466                    }
467                    _ => {}
468                }
469            }
470        }
471    }
472
473    // CharProcs: name → stream ref
474    let mut char_procs = std::collections::HashMap::new();
475    if let Some(cp_dict) = resolve_dict(file, dict, "CharProcs") {
476        for (name, obj) in &cp_dict.0 {
477            if let PdfObject::Ref(r) = obj {
478                if let Ok(data) = file.resolve_stream_data(*r) {
479                    char_procs.insert(name.0.clone(), Arc::from(data));
480                }
481            }
482        }
483    }
484
485    // Widths
486    let first_char = dict.get_i64("FirstChar").unwrap_or(0) as u16;
487    let widths: Vec<f64> = resolve_array(file, dict, "Widths")
488        .unwrap_or_default()
489        .iter()
490        .map(|o| o.as_f64().unwrap_or(0.0))
491        .collect();
492
493    let font = LoadedFont {
494        font_type: zpdf_font::PdfFontType::Type3 {
495            font_matrix,
496            char_procs,
497            encoding,
498            widths,
499            first_char,
500        },
501        base_font,
502        font_data: None,
503        face_index: 0,
504        is_substitute: false,
505        cid_widths: CidWidths::new(1000.0),
506        units_per_em: 1000.0,
507        ascent: 880.0,
508        descent: -120.0,
509        cid_to_gid: None,
510        builtin_encoding_gids: None,
511        orphan_gids: Vec::new(),
512        encoding: None,
513        to_unicode: None,
514        symbolic: false,
515        type1: None,
516        cid_cmap: None,
517        dw2: (880.0, -1000.0),
518    };
519
520    Ok(font)
521}
522
523fn load_type1_font(
524    file: &PdfFile,
525    dict: &zpdf_core::PdfDict,
526    base_font: String,
527) -> Result<LoadedFont> {
528    let cid_widths = parse_simple_widths(file, dict);
529    let font_data = extract_font_file_from_descriptor(file, dict);
530
531    match font_data {
532        Some(data) => Ok(LoadedFont::new_with_data(
533            PdfFontType::Type1,
534            base_font,
535            data,
536            cid_widths,
537        )),
538        None => Ok(try_system_substitute_simple(
539            file,
540            dict,
541            &base_font,
542            PdfFontType::Type1,
543            cid_widths,
544        )
545        .or_else(|| LoadedFont::new_standard(base_font.clone()))
546        .unwrap_or_else(|| LoadedFont::new_placeholder(base_font))),
547    }
548}
549
550/// Extract embedded font binary from FontDescriptor → FontFile2 (TrueType).
551fn extract_font_file(file: &PdfFile, cid_dict: &zpdf_core::PdfDict) -> Option<Vec<u8>> {
552    let fd_ref = cid_dict.get_ref("FontDescriptor").ok()?;
553    let fd_obj = file.resolve(fd_ref).ok()?;
554    let fd_dict = fd_obj.as_dict().ok()?;
555
556    // Try FontFile2 (TrueType), then FontFile3 (OpenType/CFF), then FontFile (Type1)
557    for key in &["FontFile2", "FontFile3", "FontFile"] {
558        if let Ok(ff_ref) = fd_dict.get_ref(key) {
559            if let Ok(data) = file.resolve_stream_data(ff_ref) {
560                if !data.is_empty() {
561                    return Some(data);
562                }
563            }
564        }
565    }
566    None
567}
568
569fn extract_font_file_from_descriptor(
570    file: &PdfFile,
571    font_dict: &zpdf_core::PdfDict,
572) -> Option<Vec<u8>> {
573    let fd_ref = font_dict.get_ref("FontDescriptor").ok()?;
574    let fd_obj = file.resolve(fd_ref).ok()?;
575    let fd_dict = fd_obj.as_dict().ok()?;
576
577    for key in &["FontFile2", "FontFile3", "FontFile"] {
578        if let Ok(ff_ref) = fd_dict.get_ref(key) {
579            if let Ok(data) = file.resolve_stream_data(ff_ref) {
580                if !data.is_empty() {
581                    return Some(data);
582                }
583            }
584        }
585    }
586    None
587}
588
589/// Fetch an array value, resolving one level of indirect reference. pdftex (and
590/// many other producers) commonly emit `/Widths` and `/W` as indirect objects,
591/// which a plain `get_array` would miss (leaving every glyph at the default width).
592fn resolve_array(file: &PdfFile, dict: &zpdf_core::PdfDict, key: &str) -> Option<Vec<PdfObject>> {
593    match dict.get(key) {
594        Some(PdfObject::Array(a)) => Some(a.clone()),
595        Some(PdfObject::Ref(id)) => file
596            .resolve(*id)
597            .ok()
598            .and_then(|o| o.as_array().ok().map(|a| a.to_vec())),
599        _ => None,
600    }
601}
602
603/// Fetch a dictionary value, resolving one level of indirect reference, in the
604/// same spirit as [`resolve_array`] (Type3 producers commonly emit /CharProcs
605/// and /Encoding as indirect objects).
606fn resolve_dict(
607    file: &PdfFile,
608    dict: &zpdf_core::PdfDict,
609    key: &str,
610) -> Option<zpdf_core::PdfDict> {
611    match dict.get(key) {
612        Some(PdfObject::Dict(d)) => Some(d.clone()),
613        Some(PdfObject::Ref(id)) => file
614            .resolve(*id)
615            .ok()
616            .and_then(|o| o.as_dict().ok().cloned()),
617        _ => None,
618    }
619}
620
621/// Parse CID /W array: format is [cid [w1 w2 ...]] or [cid_first cid_last w]
622fn parse_cid_widths(file: &PdfFile, dict: &zpdf_core::PdfDict) -> CidWidths {
623    let dw = dict.get_f64("DW").unwrap_or(1000.0);
624    let mut widths = CidWidths::new(dw);
625
626    let w_array = match resolve_array(file, dict, "W") {
627        Some(arr) => arr,
628        None => return widths,
629    };
630
631    let mut i = 0;
632    while i < w_array.len() {
633        let cid_start = match w_array[i].as_i64() {
634            Ok(v) => v as u16,
635            Err(_) => break,
636        };
637        i += 1;
638        if i >= w_array.len() {
639            break;
640        }
641
642        match &w_array[i] {
643            PdfObject::Array(arr) => {
644                // [cid_start [w1 w2 w3 ...]]
645                for (j, obj) in arr.iter().enumerate() {
646                    let Some(cid) = cid_start.checked_add(j as u16) else {
647                        break;
648                    };
649                    if let Ok(w) = obj.as_f64() {
650                        widths.set(cid, w);
651                    }
652                }
653                i += 1;
654            }
655            PdfObject::Integer(_) | PdfObject::Real(_) => {
656                // [cid_start cid_end width]
657                let cid_end = w_array[i].as_i64().unwrap_or(cid_start as i64) as u16;
658                i += 1;
659                if i < w_array.len() {
660                    let w = w_array[i].as_f64().unwrap_or(dw);
661                    for cid in cid_start..=cid_end {
662                        widths.set(cid, w);
663                    }
664                    i += 1;
665                }
666            }
667            _ => {
668                i += 1;
669            }
670        }
671    }
672
673    widths
674}
675
676/// Parse the CID /W2 array (PDF 9.7.4.3) into per-CID vertical metrics.
677/// Two element forms, mirroring /W but with THREE numbers per glyph:
678///   `c [ w1y_1 vx_1 vy_1  w1y_2 vx_2 vy_2 ... ]`   (list form)
679///   `cFirst cLast w1y vx vy`                         (range form)
680/// where `w1y` is the vertical displacement and `(vx, vy)` the position vector.
681fn parse_cid_w2(file: &PdfFile, dict: &zpdf_core::PdfDict, widths: &mut CidWidths) {
682    if let Some(arr) = resolve_array(file, dict, "W2") {
683        apply_w2_array(&arr, widths);
684    }
685}
686
687fn apply_w2_array(w2_array: &[PdfObject], widths: &mut CidWidths) {
688    let mut i = 0;
689    while i < w2_array.len() {
690        let cid_start = match w2_array[i].as_i64() {
691            Ok(v) => v as u16,
692            Err(_) => break,
693        };
694        i += 1;
695        if i >= w2_array.len() {
696            break;
697        }
698
699        match &w2_array[i] {
700            PdfObject::Array(arr) => {
701                // List form: triples (w1y, vx, vy) starting at cid_start.
702                let mut k = 0;
703                while k + 2 < arr.len() {
704                    let (Ok(w1y), Ok(vx), Ok(vy)) =
705                        (arr[k].as_f64(), arr[k + 1].as_f64(), arr[k + 2].as_f64())
706                    else {
707                        break;
708                    };
709                    let Some(cid) = cid_start.checked_add((k / 3) as u16) else {
710                        break;
711                    };
712                    widths.set_v(cid, w1y, vx, vy);
713                    k += 3;
714                }
715                i += 1;
716            }
717            PdfObject::Integer(_) | PdfObject::Real(_) => {
718                // Range form: cFirst cLast w1y vx vy.
719                let cid_end = w2_array[i].as_i64().unwrap_or(cid_start as i64) as u16;
720                if i + 3 < w2_array.len() {
721                    let (Ok(w1y), Ok(vx), Ok(vy)) = (
722                        w2_array[i + 1].as_f64(),
723                        w2_array[i + 2].as_f64(),
724                        w2_array[i + 3].as_f64(),
725                    ) else {
726                        break;
727                    };
728                    for cid in cid_start..=cid_end {
729                        widths.set_v(cid, w1y, vx, vy);
730                    }
731                    i += 4;
732                } else {
733                    break;
734                }
735            }
736            _ => {
737                i += 1;
738            }
739        }
740    }
741}
742
743fn parse_simple_widths(file: &PdfFile, dict: &zpdf_core::PdfDict) -> CidWidths {
744    let first_char = dict.get_i64("FirstChar").unwrap_or(0) as u16;
745    let mut widths = CidWidths::new(1000.0);
746
747    if let Some(arr) = resolve_array(file, dict, "Widths") {
748        for (j, obj) in arr.iter().enumerate() {
749            let Some(code) = first_char.checked_add(j as u16) else {
750                break;
751            };
752            if let Ok(w) = obj.as_f64() {
753                widths.set(code, w);
754            }
755        }
756    }
757
758    widths
759}
760
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for an integer PDF object.
    fn int(value: i64) -> PdfObject {
        PdfObject::Integer(value)
    }

    /// Shorthand for a real-number PDF object.
    fn real(value: f64) -> PdfObject {
        PdfObject::Real(value)
    }

    /// Feed `entries` to `apply_w2_array` against a fresh width table and
    /// hand the table back for inspection.
    fn vertical_metrics(entries: &[PdfObject]) -> CidWidths {
        let mut table = CidWidths::new(1000.0);
        apply_w2_array(entries, &mut table);
        table
    }

    #[test]
    fn w2_list_form_assigns_consecutive_cids() {
        // 120 [w1y vx vy  w1y vx vy] → CIDs 120 and 121.
        let triples = PdfObject::Array(vec![
            real(-1000.0),
            real(500.0),
            real(880.0),
            int(-900),
            int(450),
            int(820),
        ]);
        let table = vertical_metrics(&[int(120), triples]);

        assert_eq!(table.get_v(120), Some((-1000.0, 500.0, 880.0)));
        assert_eq!(table.get_v(121), Some((-900.0, 450.0, 820.0)));
        assert_eq!(table.get_v(122), None);
    }

    #[test]
    fn w2_range_form_assigns_inclusive_range() {
        // cFirst cLast w1y vx vy
        let table = vertical_metrics(&[int(10), int(12), int(-1000), int(500), int(880)]);

        for cid in [10u16, 11, 12] {
            assert_eq!(table.get_v(cid), Some((-1000.0, 500.0, 880.0)));
        }
        // Neighbours just outside the range stay untouched.
        assert_eq!(table.get_v(9), None);
        assert_eq!(table.get_v(13), None);
    }

    #[test]
    fn w2_truncated_entry_is_ignored_not_panic() {
        // Range header without the trailing metric numbers must not panic.
        let table = vertical_metrics(&[int(10), int(12), int(-1000)]);

        assert_eq!(table.get_v(10), None);
    }
}