edgeparse_core/pdf/
font.rs

1//! PDF font handling — font resolution, glyph widths, and Unicode mapping.
2
3use std::collections::HashMap;
4use std::sync::LazyLock;
5
6/// Adobe Glyph List data, embedded at compile time.
7static AGL_DATA: &str = include_str!("data/glyphlist.txt");
8
9/// Lazily-initialized AGL lookup: glyph name → Unicode string.
10static AGL_MAP: LazyLock<HashMap<&'static str, String>> = LazyLock::new(|| {
11    let mut map = HashMap::with_capacity(4300);
12    for line in AGL_DATA.lines() {
13        let line = line.trim();
14        if line.is_empty() || line.starts_with('#') {
15            continue;
16        }
17        if let Some((name, hex_part)) = line.split_once(';') {
18            // hex_part may be a single value "0041" or multi-codepoint "05D3 05B2"
19            let mut s = String::new();
20            for hex_str in hex_part.split_whitespace() {
21                if let Ok(cp) = u32::from_str_radix(hex_str, 16) {
22                    if let Some(c) = char::from_u32(cp) {
23                        s.push(c);
24                    }
25                }
26            }
27            if !s.is_empty() {
28                map.insert(name, s);
29            }
30        }
31    }
32    map
33});
34
/// Glyph names emitted by TeX Computer Modern fonts (CMSY, CMMI, CMEX, CMR)
/// that the Adobe Glyph List does not define, mapped to their Unicode text.
static TEX_GLYPH_MAP: LazyLock<HashMap<&'static str, &'static str>> = LazyLock::new(|| {
    const ENTRIES: [(&str, &str); 30] = [
        // CMSY (Computer Modern Symbol) extras
        ("asteriskmath", "\u{2217}"),    // ∗
        ("diamondmath", "\u{22C4}"),     // ⋄
        ("minusplus", "\u{2213}"),       // ∓
        ("circleminus", "\u{2296}"),     // ⊖
        ("circledivide", "\u{2298}"),    // ⊘
        ("circledot", "\u{2299}"),       // ⊙
        ("circlecopyrt", "\u{00A9}"),    // ©
        ("equivasymptotic", "\u{224D}"), // ≍
        ("precedesequal", "\u{227C}"),   // ≼
        ("followsequal", "\u{227D}"),    // ≽
        ("similarequal", "\u{2243}"),    // ≃
        ("lessmuch", "\u{226A}"),        // ≪
        ("greatermuch", "\u{226B}"),     // ≫
        ("follows", "\u{227B}"),         // ≻
        ("arrownortheast", "\u{2197}"),  // ↗
        ("arrowsoutheast", "\u{2198}"),  // ↘
        ("arrownorthwest", "\u{2196}"),  // ↖
        ("arrowsouthwest", "\u{2199}"),  // ↙
        ("negationslash", "\u{0338}"),   // combining long solidus
        ("owner", "\u{220B}"),           // ∋
        ("triangleinv", "\u{25BD}"),     // ▽
        ("latticetop", "\u{22A4}"),      // ⊤
        // CMMI (Computer Modern Math Italic) extras
        ("tie", "\u{2040}"),      // ⁀  (character tie)
        ("dotlessj", "\u{0237}"), // ȷ
        ("vector", "\u{20D7}"),   // combining right arrow above
        // CMEX (Computer Modern Extension) and CMR extras
        ("bardbl", "\u{2016}"),       // ‖
        ("mapsto", "\u{21A6}"),       // ↦
        ("lscript", "\u{2113}"),      // ℓ
        ("weierstrass", "\u{2118}"),  // ℘
        ("visiblespace", "\u{2423}"), // ␣
    ];
    ENTRIES.into_iter().collect()
});
74
75/// Represents a resolved PDF font with metrics and encoding info.
76#[derive(Debug, Clone)]
77pub struct PdfFont {
78    /// Resource name (e.g., "F1")
79    pub name: String,
80    /// Base font name (e.g., "Helvetica", "ArialMT")
81    pub base_font: String,
82    /// Font subtype (Type1, TrueType, Type0, Type3)
83    pub subtype: String,
84    /// Glyph widths indexed by character code
85    pub widths: HashMap<u32, f64>,
86    /// Default width for missing glyphs
87    pub default_width: f64,
88    /// ToUnicode mapping: character code → Unicode string
89    pub to_unicode: HashMap<u32, String>,
90    /// Encoding name
91    pub encoding: String,
92    /// Whether this is a standard 14 font
93    pub is_standard: bool,
94    /// Font descriptor flags
95    pub flags: u32,
96    /// Italic angle
97    pub italic_angle: f64,
98    /// Font weight (estimated)
99    pub weight: f64,
100    /// Bytes per character code (1 for Type1/TrueType, 2 for Type0/CID)
101    pub bytes_per_code: u8,
102    /// Font ascent in glyph-space units (per-mille, i.e. 1000 = 1 em)
103    pub ascent: f64,
104    /// Font descent in glyph-space units (negative, per-mille)
105    pub descent: f64,
106    /// Font bounding box [llx, lly, urx, ury] in glyph-space units
107    pub font_bbox: [f64; 4],
108}
109
110impl PdfFont {
111    /// Create a default font with standard metrics.
112    pub fn default_font(name: &str) -> Self {
113        Self {
114            name: name.to_string(),
115            base_font: "Unknown".to_string(),
116            subtype: "Type1".to_string(),
117            widths: HashMap::new(),
118            default_width: 600.0, // Monospace default
119            to_unicode: HashMap::new(),
120            encoding: "WinAnsiEncoding".to_string(),
121            is_standard: false,
122            flags: 0,
123            italic_angle: 0.0,
124            weight: 400.0,
125            bytes_per_code: 1,
126            ascent: 800.0,
127            descent: -200.0,
128            font_bbox: [0.0, -200.0, 1000.0, 800.0],
129        }
130    }
131
132    /// Get glyph width for a character code (in glyph space, typically 1/1000 of text space).
133    pub fn glyph_width(&self, char_code: u32) -> f64 {
134        *self.widths.get(&char_code).unwrap_or(&self.default_width)
135    }
136
137    /// Get Unicode string for a character code.
138    pub fn decode_char(&self, char_code: u32) -> String {
139        if let Some(unicode) = self.to_unicode.get(&char_code) {
140            return unicode.clone();
141        }
142        // Fallback: use the font's declared encoding
143        match self.encoding.as_str() {
144            "MacRomanEncoding" => decode_macroman(char_code),
145            _ => decode_winansi(char_code),
146        }
147    }
148
149    /// Whether the font is bold (estimated from name or flags).
150    pub fn is_bold(&self) -> bool {
151        let name_lower = self.base_font.to_lowercase();
152        name_lower.contains("bold")
153            || name_lower.contains("black")
154            || name_lower.contains("heavy")
155            || (self.flags & 0x40000) != 0
156            || self.weight >= 700.0
157    }
158
159    /// Whether the font is italic (estimated from name or italic angle).
160    pub fn is_italic(&self) -> bool {
161        let name_lower = self.base_font.to_lowercase();
162        name_lower.contains("italic")
163            || name_lower.contains("oblique")
164            || (self.flags & 0x40) != 0
165            || self.italic_angle.abs() > 0.5
166    }
167}
168
169/// Font cache for resolved fonts per page.
170#[derive(Debug, Default, Clone)]
171pub struct FontCache {
172    fonts: HashMap<String, PdfFont>,
173}
174
175impl FontCache {
176    /// Get a font by resource name, or create a default placeholder.
177    pub fn get_or_default(&mut self, name: &str) -> &PdfFont {
178        if !self.fonts.contains_key(name) {
179            self.fonts
180                .insert(name.to_string(), PdfFont::default_font(name));
181        }
182        &self.fonts[name]
183    }
184
185    /// Insert a resolved font.
186    pub fn insert(&mut self, name: String, font: PdfFont) {
187        self.fonts.insert(name, font);
188    }
189
190    /// Get a font by name.
191    pub fn get(&self, name: &str) -> Option<&PdfFont> {
192        self.fonts.get(name)
193    }
194
195    /// Iterate over all fonts.
196    pub fn iter(&self) -> impl Iterator<Item = (&String, &PdfFont)> {
197        self.fonts.iter()
198    }
199}
200
201/// Resolve fonts from a page's /Resources /Font dictionary.
202pub fn resolve_page_fonts(doc: &lopdf::Document, page_id: lopdf::ObjectId) -> FontCache {
203    let mut cache = FontCache::default();
204
205    // Get page dictionary
206    let page_dict = match doc.get_object(page_id).and_then(|o| o.as_dict()) {
207        Ok(d) => d,
208        Err(_) => return cache,
209    };
210
211    // Get Resources dict
212    let resources = match page_dict.get(b"Resources") {
213        Ok(r) => resolve_object(doc, r),
214        Err(_) => return cache,
215    };
216
217    let resources_dict = match resources.as_dict() {
218        Ok(d) => d,
219        Err(_) => return cache,
220    };
221
222    // Get Font dict from resources
223    let font_dict = match resources_dict.get(b"Font") {
224        Ok(f) => resolve_object(doc, f),
225        Err(_) => return cache,
226    };
227
228    let font_dict = match font_dict.as_dict() {
229        Ok(d) => d,
230        Err(_) => return cache,
231    };
232
233    // Resolve each font
234    for (name_bytes, font_ref) in font_dict.iter() {
235        let name = String::from_utf8_lossy(name_bytes).to_string();
236        let font_obj = resolve_object(doc, font_ref);
237
238        if let Ok(fd) = font_obj.as_dict() {
239            let pdf_font = resolve_font_dict(doc, &name, fd);
240            cache.insert(name, pdf_font);
241        }
242    }
243
244    cache
245}
246
247/// Resolve a font dictionary into a PdfFont.
248pub(crate) fn resolve_font_dict(
249    doc: &lopdf::Document,
250    name: &str,
251    dict: &lopdf::Dictionary,
252) -> PdfFont {
253    let base_font = dict
254        .get(b"BaseFont")
255        .ok()
256        .and_then(|o| {
257            if let lopdf::Object::Name(n) = o {
258                Some(String::from_utf8_lossy(n).to_string())
259            } else {
260                None
261            }
262        })
263        .unwrap_or_else(|| "Unknown".to_string());
264
265    let subtype = dict
266        .get(b"Subtype")
267        .ok()
268        .and_then(|o| {
269            if let lopdf::Object::Name(n) = o {
270                Some(String::from_utf8_lossy(n).to_string())
271            } else {
272                None
273            }
274        })
275        .unwrap_or_else(|| "Type1".to_string());
276
277    let encoding = dict
278        .get(b"Encoding")
279        .ok()
280        .and_then(|o| {
281            let resolved = resolve_object(doc, o);
282            match resolved {
283                lopdf::Object::Name(n) => Some(String::from_utf8_lossy(&n).to_string()),
284                lopdf::Object::Dictionary(ref d) => {
285                    // Dictionary encoding — extract /BaseEncoding if present
286                    d.get(b"BaseEncoding").ok().and_then(|be| {
287                        if let lopdf::Object::Name(n) = be {
288                            Some(String::from_utf8_lossy(n).to_string())
289                        } else {
290                            None
291                        }
292                    })
293                }
294                _ => None,
295            }
296        })
297        .unwrap_or_else(|| "WinAnsiEncoding".to_string());
298
299    let is_standard = is_standard_font(&base_font);
300    let mut default_width = if is_standard {
301        standard_font_default_width(&base_font)
302    } else {
303        1000.0
304    };
305
306    // Determine if this is a Type0 (CID) font
307    let is_type0 = subtype == "Type0";
308    let bytes_per_code: u8 = if is_type0 { 2 } else { 1 };
309
310    // Resolve widths — for Type0, use descendant font's /W array
311    let mut widths = resolve_widths(doc, dict);
312
313    // Resolve ToUnicode CMap
314    let mut to_unicode = resolve_tounicode(doc, dict);
315
316    // For Type0 fonts, resolve DescendantFonts for widths and font descriptor
317    let mut flags = 0u32;
318    let mut italic_angle = 0.0f64;
319    let mut weight = 400.0f64;
320    let mut ascent = 800.0f64;
321    let mut descent = -200.0f64;
322    let mut font_bbox = [0.0f64, -200.0, 1000.0, 800.0];
323
324    if is_type0 {
325        if let Ok(desc_ref) = dict.get(b"DescendantFonts") {
326            let desc_obj = resolve_object(doc, desc_ref);
327            if let Ok(desc_arr) = desc_obj.as_array() {
328                if let Some(first) = desc_arr.first() {
329                    let desc_font_obj = resolve_object(doc, first);
330                    if let Ok(desc_dict) = desc_font_obj.as_dict() {
331                        // Get DW (default width) from descendant
332                        if let Ok(dw) = desc_dict.get(b"DW") {
333                            if let Some(dw_val) = obj_to_f64(resolve_object(doc, dw)) {
334                                default_width = dw_val;
335                            }
336                        }
337                        // Get W array from descendant
338                        resolve_cid_widths(doc, desc_dict, &mut widths);
339                        // Get font descriptor from descendant
340                        let (f, ia, w, a, d, fb) = resolve_font_descriptor(doc, desc_dict);
341                        flags = f;
342                        italic_angle = ia;
343                        weight = w;
344                        ascent = a;
345                        descent = d;
346                        font_bbox = fb;
347                    }
348                }
349            }
350        }
351    } else {
352        // Parse /Encoding /Differences array — maps char codes to glyph names → Unicode
353        resolve_encoding_differences(doc, dict, &mut to_unicode);
354        // For Type1 fonts without /Encoding or /ToUnicode, try extracting
355        // encoding from the embedded font program (/FontFile stream).
356        // This handles TeX fonts (CMSY, CMMI, CMEX, etc.) that use custom
357        // encodings without explicit /Encoding dictionaries.
358        if subtype == "Type1" {
359            resolve_type1_font_program_encoding(doc, dict, &mut to_unicode);
360        }
361        // Resolve font descriptor
362        let (f, ia, w, a, d, fb) = resolve_font_descriptor(doc, dict);
363        flags = f;
364        italic_angle = ia;
365        weight = w;
366        ascent = a;
367        descent = d;
368        font_bbox = fb;
369    }
370
371    // If the font name indicates bold but StemV gave a low weight, override.
372    // Many PDFs have "Bold" in the font name but StemV in the 100-140 range
373    // which our heuristic maps to 500 (Medium).  The font name is more reliable.
374    let name_lower = base_font.to_lowercase();
375    let is_name_bold =
376        name_lower.contains("bold") || name_lower.contains("black") || name_lower.contains("heavy");
377    if is_name_bold && weight < 700.0 {
378        weight = 700.0;
379    }
380
381    PdfFont {
382        name: name.to_string(),
383        base_font,
384        subtype,
385        widths,
386        default_width,
387        to_unicode,
388        encoding,
389        is_standard,
390        flags,
391        italic_angle,
392        weight,
393        bytes_per_code,
394        ascent,
395        descent,
396        font_bbox,
397    }
398}
399
400/// Resolve font widths from /Widths array.
401fn resolve_widths(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> HashMap<u32, f64> {
402    let mut widths = HashMap::new();
403
404    let first_char = dict
405        .get(b"FirstChar")
406        .ok()
407        .and_then(|o| obj_to_i64(resolve_object(doc, o)))
408        .unwrap_or(0) as u32;
409
410    if let Ok(widths_ref) = dict.get(b"Widths") {
411        let widths_obj = resolve_object(doc, widths_ref);
412        if let Ok(arr) = widths_obj.as_array() {
413            for (i, w) in arr.iter().enumerate() {
414                if let Some(width) = obj_to_f64(resolve_object(doc, w)) {
415                    widths.insert(first_char + i as u32, width);
416                }
417            }
418        }
419    }
420
421    widths
422}
423
424/// Resolve CID font widths from /W array.
425///
426/// The /W array format: [cid [w1 w2 ...] cid_start cid_end w ...]
427/// Two forms:
428/// - `cid [w1, w2, w3, ...]` — consecutive CIDs starting at cid
429/// - `cid_start cid_end w` — range of CIDs all with same width
430fn resolve_cid_widths(
431    doc: &lopdf::Document,
432    dict: &lopdf::Dictionary,
433    widths: &mut HashMap<u32, f64>,
434) {
435    let w_obj = match dict.get(b"W") {
436        Ok(o) => resolve_object(doc, o),
437        Err(_) => return,
438    };
439    let w_arr = match w_obj.as_array() {
440        Ok(a) => a,
441        Err(_) => return,
442    };
443
444    let mut i = 0;
445    while i < w_arr.len() {
446        let first = resolve_object(doc, &w_arr[i]);
447        if let Some(cid_start) = obj_to_i64(first) {
448            let cid_start = cid_start as u32;
449            i += 1;
450            if i >= w_arr.len() {
451                break;
452            }
453            let next = resolve_object(doc, &w_arr[i]);
454            if let Ok(arr) = next.as_array() {
455                // Form 1: cid [w1 w2 w3 ...]
456                for (j, w) in arr.iter().enumerate() {
457                    if let Some(width) = obj_to_f64(resolve_object(doc, w)) {
458                        widths.insert(cid_start + j as u32, width);
459                    }
460                }
461                i += 1;
462            } else if let Some(cid_end) = obj_to_i64(next) {
463                // Form 2: cid_start cid_end w
464                let cid_end = cid_end as u32;
465                i += 1;
466                if i >= w_arr.len() {
467                    break;
468                }
469                let w_val = resolve_object(doc, &w_arr[i]);
470                if let Some(width) = obj_to_f64(w_val) {
471                    for cid in cid_start..=cid_end {
472                        widths.insert(cid, width);
473                    }
474                }
475                i += 1;
476            } else {
477                i += 1;
478            }
479        } else {
480            i += 1;
481        }
482    }
483}
484
485/// Resolve ToUnicode CMap.
486fn resolve_tounicode(doc: &lopdf::Document, dict: &lopdf::Dictionary) -> HashMap<u32, String> {
487    let mut mapping = HashMap::new();
488
489    if let Ok(tounicode_ref) = dict.get(b"ToUnicode") {
490        let tounicode_ref = match tounicode_ref {
491            lopdf::Object::Reference(r) => *r,
492            _ => return mapping,
493        };
494
495        if let Ok(stream) = doc.get_object(tounicode_ref) {
496            if let Ok(stream) = stream.as_stream() {
497                if let Ok(data) = stream.decompressed_content() {
498                    parse_cmap(&data, &mut mapping);
499                }
500            }
501        }
502    }
503
504    mapping
505}
506
507/// Parse /Encoding dictionary's /Differences array and add mappings to to_unicode.
508///
509/// The /Differences array format is: [code, name, name, ..., code, name, ...]
510/// Numbers reset the current character code; names are applied sequentially.
511fn resolve_encoding_differences(
512    doc: &lopdf::Document,
513    dict: &lopdf::Dictionary,
514    to_unicode: &mut HashMap<u32, String>,
515) {
516    let enc_obj = match dict.get(b"Encoding") {
517        Ok(o) => resolve_object(doc, o),
518        Err(_) => return,
519    };
520
521    let enc_dict = match enc_obj.as_dict() {
522        Ok(d) => d,
523        Err(_) => return, // Not a dictionary (just a name like "WinAnsiEncoding")
524    };
525
526    let diffs_obj = match enc_dict.get(b"Differences") {
527        Ok(o) => resolve_object(doc, o),
528        Err(_) => return,
529    };
530
531    let diffs = match diffs_obj.as_array() {
532        Ok(a) => a,
533        Err(_) => return,
534    };
535
536    let mut current_code: u32 = 0;
537    for item in diffs {
538        let resolved = resolve_object(doc, item);
539        match resolved {
540            lopdf::Object::Integer(i) => {
541                current_code = i as u32;
542            }
543            lopdf::Object::Name(ref name_bytes) => {
544                let glyph_name = String::from_utf8_lossy(name_bytes).to_string();
545                // Only add if not already mapped by ToUnicode (ToUnicode takes priority)
546                if let std::collections::hash_map::Entry::Vacant(e) = to_unicode.entry(current_code)
547                {
548                    if let Some(unicode) = glyph_name_to_unicode(&glyph_name) {
549                        e.insert(unicode);
550                    }
551                }
552                current_code += 1;
553            }
554            _ => {}
555        }
556    }
557}
558
559/// Extract encoding from an embedded Type1 font program (/FontFile stream).
560///
561/// Type1 font programs contain an Encoding array defined in PostScript.
562/// The pattern is: `dup <code> /<glyphname> put`
563/// This function parses those lines and maps glyph names to Unicode.
564fn resolve_type1_font_program_encoding(
565    doc: &lopdf::Document,
566    dict: &lopdf::Dictionary,
567    to_unicode: &mut HashMap<u32, String>,
568) {
569    // Get FontDescriptor
570    let fd_obj = match dict.get(b"FontDescriptor") {
571        Ok(o) => resolve_object(doc, o),
572        Err(_) => return,
573    };
574    let fd = match fd_obj.as_dict() {
575        Ok(d) => d,
576        Err(_) => return,
577    };
578
579    // Try /FontFile (Type1), /FontFile2 (TrueType), /FontFile3 (CFF/OpenType)
580    let stream_data = None
581        .or_else(|| get_font_file_data(doc, fd, b"FontFile"))
582        .or_else(|| get_font_file_data(doc, fd, b"FontFile3"));
583
584    let data = match stream_data {
585        Some(d) => d,
586        None => return,
587    };
588
589    // Parse the Type1 font program's Encoding vector
590    // Look for patterns: dup <number> /<name> put
591    let text = String::from_utf8_lossy(&data);
592
593    // Check if this font uses StandardEncoding (no custom encoding)
594    if text.contains("/Encoding StandardEncoding def") {
595        return; // Standard encoding, handled by WinAnsi fallback
596    }
597
598    for line in text.lines() {
599        let trimmed = line.trim();
600        // Pattern: "dup 121 /dagger put" or "dup 3 /asteriskmath put"
601        if !trimmed.starts_with("dup ") || !trimmed.ends_with(" put") {
602            continue;
603        }
604        let inner = &trimmed[4..trimmed.len() - 4].trim();
605        // Split: "121 /dagger"
606        let parts: Vec<&str> = inner.splitn(2, ' ').collect();
607        if parts.len() != 2 {
608            continue;
609        }
610        let code: u32 = match parts[0].trim().parse() {
611            Ok(c) => c,
612            Err(_) => continue,
613        };
614        let glyph = parts[1].trim().trim_start_matches('/');
615        if glyph.is_empty() || glyph == ".notdef" {
616            continue;
617        }
618        // Only add if not already mapped by ToUnicode or Encoding/Differences
619        if let std::collections::hash_map::Entry::Vacant(e) = to_unicode.entry(code) {
620            if let Some(unicode) = glyph_name_to_unicode(glyph) {
621                e.insert(unicode);
622            }
623        }
624    }
625}
626
627/// Get decompressed data from a font file stream.
628fn get_font_file_data(
629    doc: &lopdf::Document,
630    fd: &lopdf::Dictionary,
631    key: &[u8],
632) -> Option<Vec<u8>> {
633    let ff_ref = fd.get(key).ok()?;
634    let ff_id = match ff_ref {
635        lopdf::Object::Reference(r) => *r,
636        _ => return None,
637    };
638    let ff_obj = doc.get_object(ff_id).ok()?;
639    let stream = ff_obj.as_stream().ok()?;
640    stream.decompressed_content().ok()
641}
642
643/// Map a PostScript/PDF glyph name to its Unicode string representation.
644///
645/// Uses the full Adobe Glyph List (4281 entries) with fallbacks for
646/// uniXXXX, uXXXXXX, and single-character names.
647/// Ligatures are decomposed to plain text (e.g., "fi" → "fi" not U+FB01).
648fn glyph_name_to_unicode(name: &str) -> Option<String> {
649    // Override: decompose ligatures to plain text for readability
650    match name {
651        "fi" => return Some("fi".to_string()),
652        "fl" => return Some("fl".to_string()),
653        "ff" => return Some("ff".to_string()),
654        "ffi" => return Some("ffi".to_string()),
655        "ffl" => return Some("ffl".to_string()),
656        "IJ" => return Some("IJ".to_string()),
657        "ij" => return Some("ij".to_string()),
658        _ => {}
659    }
660
661    // 1. Try direct lookup (AGL, TeX, uniXXXX, uXXXXXX, single-char)
662    if let Some(s) = resolve_glyph_component(name) {
663        return Some(s);
664    }
665
666    // 2. AGL Specification: strip variant suffix after first period
667    //    e.g. "c.sc" → "c", "seven.oldstyle" → "seven", "A.swash" → "A"
668    let base = if let Some(dot_pos) = name.find('.') {
669        &name[..dot_pos]
670    } else {
671        name
672    };
673
674    if base != name {
675        if let Some(s) = resolve_glyph_component(base) {
676            return Some(s);
677        }
678    }
679
680    // 3. AGL Specification: underscore ligature decomposition
681    //    e.g. "f_l" → "fl", "T_h" → "Th", "f_f_i" → "ffi"
682    if base.contains('_') {
683        let mut result = String::new();
684        for component in base.split('_') {
685            if let Some(s) = resolve_glyph_component(component) {
686                result.push_str(&s);
687            } else {
688                return None; // If any component fails, give up
689            }
690        }
691        if !result.is_empty() {
692            return Some(result);
693        }
694    }
695
696    None
697}
698
699/// Resolve a single glyph name component to Unicode (no period/underscore decomposition).
700fn resolve_glyph_component(name: &str) -> Option<String> {
701    if name.is_empty() {
702        return None;
703    }
704
705    // Fast path: look up in the full AGL
706    if let Some(s) = AGL_MAP.get(name) {
707        return Some(s.clone());
708    }
709
710    // TeX-specific glyph names not in AGL (CMSY, CMMI, CMEX, etc.)
711    if let Some(s) = TEX_GLYPH_MAP.get(name) {
712        return Some((*s).to_string());
713    }
714
715    // Single-character glyph names (letter names match directly)
716    if name.len() == 1 {
717        return Some(name.to_string());
718    }
719
720    // "uniXXXX" format (Adobe convention for BMP codepoints)
721    if let Some(hex) = name.strip_prefix("uni") {
722        if hex.len() == 4 {
723            if let Ok(cp) = u32::from_str_radix(hex, 16) {
724                if let Some(c) = char::from_u32(cp) {
725                    return Some(c.to_string());
726                }
727            }
728        }
729        // uniXXXXYYYY... — sequence of BMP codepoints in groups of 4
730        if hex.len() > 4 && hex.len() % 4 == 0 {
731            let mut s = String::new();
732            for chunk in hex.as_bytes().chunks(4) {
733                if let Ok(h) = std::str::from_utf8(chunk) {
734                    if let Ok(cp) = u32::from_str_radix(h, 16) {
735                        if let Some(c) = char::from_u32(cp) {
736                            s.push(c);
737                        }
738                    }
739                }
740            }
741            if !s.is_empty() {
742                return Some(s);
743            }
744        }
745    }
746
747    // "uXXXXXX" format (Adobe convention for supplementary codepoints)
748    if let Some(hex) = name.strip_prefix('u') {
749        if (4..=6).contains(&hex.len()) && hex.chars().all(|c| c.is_ascii_hexdigit()) {
750            if let Ok(cp) = u32::from_str_radix(hex, 16) {
751                if let Some(c) = char::from_u32(cp) {
752                    return Some(c.to_string());
753                }
754            }
755        }
756    }
757
758    None
759}
760
761/// Parse a basic CMap for bfchar/bfrange mappings.
762fn parse_cmap(data: &[u8], mapping: &mut HashMap<u32, String>) {
763    let text = String::from_utf8_lossy(data);
764
765    // Parse beginbfchar ... endbfchar blocks
766    let mut in_bfchar = false;
767    for line in text.lines() {
768        let trimmed = line.trim();
769        if trimmed.contains("beginbfchar") {
770            in_bfchar = true;
771            continue;
772        }
773        if trimmed.contains("endbfchar") {
774            in_bfchar = false;
775            continue;
776        }
777        if in_bfchar {
778            // Format: <XXXX> <YYYY>
779            let parts: Vec<&str> = trimmed.split('>').collect();
780            if parts.len() >= 2 {
781                if let (Some(src), Some(dst)) =
782                    (parse_hex_value(parts[0]), parse_hex_unicode(parts[1]))
783                {
784                    mapping.insert(src, dst);
785                }
786            }
787        }
788    }
789
790    // Parse beginbfrange ... endbfrange blocks
791    // Supports two formats:
792    //   <start> <end> <dst>       — single hex value, incremented across range
793    //   <start> <end> [<v1> <v2> ...] — array of Unicode values, one per code
794    let mut in_bfrange = false;
795    for line in text.lines() {
796        let trimmed = line.trim();
797        if trimmed.contains("beginbfrange") {
798            in_bfrange = true;
799            continue;
800        }
801        if trimmed.contains("endbfrange") {
802            in_bfrange = false;
803            continue;
804        }
805        if in_bfrange {
806            // Check if this line has an array format: <start> <end> [<v1> <v2> ...]
807            if let Some(bracket_start) = trimmed.find('[') {
808                // Array format
809                let before_bracket = &trimmed[..bracket_start];
810                let parts: Vec<&str> = before_bracket.split('>').collect();
811                if parts.len() >= 2 {
812                    if let (Some(start), Some(end)) =
813                        (parse_hex_value(parts[0]), parse_hex_value(parts[1]))
814                    {
815                        // Extract values inside brackets [ ... ]
816                        let bracket_end = trimmed.rfind(']').unwrap_or(trimmed.len());
817                        let inside = &trimmed[bracket_start + 1..bracket_end];
818                        let values: Vec<String> = inside
819                            .split('>')
820                            .filter_map(|s| {
821                                let s = s.trim().trim_start_matches('<');
822                                if s.is_empty() {
823                                    None
824                                } else {
825                                    parse_hex_unicode_str(s)
826                                }
827                            })
828                            .collect();
829                        for (i, val) in values.iter().enumerate() {
830                            let code = start + i as u32;
831                            if code > end {
832                                break;
833                            }
834                            mapping.insert(code, val.clone());
835                        }
836                    }
837                }
838            } else {
839                // Standard format: <XXXX> <YYYY> <ZZZZ>
840                let parts: Vec<&str> = trimmed.split('>').collect();
841                if parts.len() >= 3 {
842                    if let (Some(start), Some(end), Some(dst_start)) = (
843                        parse_hex_value(parts[0]),
844                        parse_hex_value(parts[1]),
845                        parse_hex_value(parts[2]),
846                    ) {
847                        for code in start..=end {
848                            let unicode_point = dst_start + (code - start);
849                            if let Some(c) = char::from_u32(unicode_point) {
850                                mapping.insert(code, c.to_string());
851                            }
852                        }
853                    }
854                }
855            }
856        }
857    }
858}
859
/// Parse the hexadecimal number in a CMap token such as `"<0041"`.
///
/// Surrounding whitespace and any leading `<` characters are discarded before
/// parsing. Returns `None` when nothing is left or when the remainder is not a
/// valid base-16 `u32` (a trailing `>` is *not* stripped and makes the parse fail).
fn parse_hex_value(s: &str) -> Option<u32> {
    let digits = s.trim().trim_start_matches('<').trim();
    match digits {
        "" => None,
        hex => u32::from_str_radix(hex, 16).ok(),
    }
}
868
869/// Parse a hex Unicode value from a CMap entry like " <0041".
870fn parse_hex_unicode(s: &str) -> Option<String> {
871    let cleaned = s
872        .trim()
873        .trim_start_matches('<')
874        .trim_end_matches('>')
875        .trim();
876    parse_hex_unicode_str(cleaned)
877}
878
/// Parse a hex Unicode string from pre-cleaned hex digits like "0041" or "D800DC00".
///
/// The digits are read in groups of four (a shorter final group is accepted),
/// each group being one UTF-16 code unit. The code units are then decoded as
/// UTF-16, so a high/low surrogate pair such as `D83DDE00` yields the single
/// supplementary-plane character U+1F600 instead of being discarded.
///
/// Groups that are not valid hexadecimal and unpaired surrogates are skipped.
/// Returns `None` when the input is empty or no character could be decoded.
fn parse_hex_unicode_str(cleaned: &str) -> Option<String> {
    if cleaned.is_empty() {
        return None;
    }

    // Chunk on bytes: valid input is pure ASCII hex. A chunk that splits a
    // multi-byte character fails UTF-8 validation and is skipped.
    let code_units = cleaned
        .as_bytes()
        .chunks(4)
        .filter_map(|chunk| std::str::from_utf8(chunk).ok())
        .filter_map(|hex| u16::from_str_radix(hex, 16).ok());

    // `decode_utf16` combines surrogate pairs; lone surrogates come back as
    // `Err` and are dropped.
    let result: String = char::decode_utf16(code_units)
        .filter_map(Result::ok)
        .collect();

    if result.is_empty() {
        None
    } else {
        Some(result)
    }
}
907
908/// Resolve font descriptor fields.
909fn resolve_font_descriptor(
910    doc: &lopdf::Document,
911    dict: &lopdf::Dictionary,
912) -> (u32, f64, f64, f64, f64, [f64; 4]) {
913    let mut flags = 0u32;
914    let mut italic_angle = 0.0f64;
915    let mut weight = 400.0f64;
916    let mut ascent = 800.0f64;
917    let mut descent = -200.0f64;
918    let mut font_bbox = [0.0f64, -200.0, 1000.0, 800.0];
919
920    if let Ok(fd_ref) = dict.get(b"FontDescriptor") {
921        let fd_obj = resolve_object(doc, fd_ref);
922        if let Ok(fd) = fd_obj.as_dict() {
923            flags = fd
924                .get(b"Flags")
925                .ok()
926                .and_then(|o| obj_to_i64(resolve_object(doc, o)))
927                .unwrap_or(0) as u32;
928
929            italic_angle = fd
930                .get(b"ItalicAngle")
931                .ok()
932                .and_then(|o| obj_to_f64(resolve_object(doc, o)))
933                .unwrap_or(0.0);
934
935            // StemV can approximate weight
936            let stem_v = fd
937                .get(b"StemV")
938                .ok()
939                .and_then(|o| obj_to_f64(resolve_object(doc, o)))
940                .unwrap_or(0.0);
941
942            weight = if stem_v >= 140.0 {
943                700.0 // Bold
944            } else if stem_v >= 100.0 {
945                500.0 // Medium
946            } else {
947                400.0 // Normal
948            };
949
950            // Read font bounding box
951            if let Ok(bbox_ref) = fd.get(b"FontBBox") {
952                let bbox_obj = resolve_object(doc, bbox_ref);
953                if let Ok(bbox_arr) = bbox_obj.as_array() {
954                    if bbox_arr.len() >= 4 {
955                        let vals: Vec<f64> = bbox_arr
956                            .iter()
957                            .filter_map(|o| obj_to_f64(resolve_object(doc, o)))
958                            .collect();
959                        if vals.len() >= 4 {
960                            font_bbox = [vals[0], vals[1], vals[2], vals[3]];
961                        }
962                    }
963                }
964            }
965
966            // Read ascent (with fallback to font bbox)
967            if let Ok(a_ref) = fd.get(b"Ascent") {
968                if let Some(a) = obj_to_f64(resolve_object(doc, a_ref)) {
969                    ascent = a;
970                }
971            } else {
972                ascent = font_bbox[3]; // fallback to font bbox ury
973            }
974
975            // Read descent (with fallback to font bbox)
976            if let Ok(d_ref) = fd.get(b"Descent") {
977                if let Some(d) = obj_to_f64(resolve_object(doc, d_ref)) {
978                    descent = d;
979                }
980            } else {
981                descent = font_bbox[1]; // fallback to font bbox lly
982            }
983        }
984    }
985
986    (flags, italic_angle, weight, ascent, descent, font_bbox)
987}
988
989/// Resolve a PDF object reference.
990fn resolve_object<'a>(doc: &'a lopdf::Document, obj: &'a lopdf::Object) -> lopdf::Object {
991    match obj {
992        lopdf::Object::Reference(id) => doc.get_object(*id).cloned().unwrap_or(lopdf::Object::Null),
993        other => other.clone(),
994    }
995}
996
997/// Convert a PDF object to f64.
998fn obj_to_f64(obj: lopdf::Object) -> Option<f64> {
999    match obj {
1000        lopdf::Object::Integer(i) => Some(i as f64),
1001        lopdf::Object::Real(f) => Some(f),
1002        _ => None,
1003    }
1004}
1005
1006/// Convert a PDF object to i64.
1007fn obj_to_i64(obj: lopdf::Object) -> Option<i64> {
1008    match obj {
1009        lopdf::Object::Integer(i) => Some(i),
1010        lopdf::Object::Real(f) => Some(f as i64),
1011        _ => None,
1012    }
1013}
1014
/// Report whether `name` is exactly one of the 14 standard PDF font names.
///
/// The comparison is case-sensitive and matches the whole name.
fn is_standard_font(name: &str) -> bool {
    const STANDARD_14: [&str; 14] = [
        "Courier",
        "Courier-Bold",
        "Courier-Oblique",
        "Courier-BoldOblique",
        "Helvetica",
        "Helvetica-Bold",
        "Helvetica-Oblique",
        "Helvetica-BoldOblique",
        "Times-Roman",
        "Times-Bold",
        "Times-Italic",
        "Times-BoldItalic",
        "Symbol",
        "ZapfDingbats",
    ];
    STANDARD_14.contains(&name)
}
1035
/// Default glyph width for a standard font.
///
/// Names beginning with `Courier` get 600; every other name gets 500 as a
/// generic default for variable-width fonts.
fn standard_font_default_width(name: &str) -> f64 {
    if !name.starts_with("Courier") {
        // Variable-width fonts: use a reasonable default.
        return 500.0;
    }
    600.0
}
1045
/// Decode a character code using PDF `MacRomanEncoding`.
///
/// Codes 0x80-0xFF are looked up in a table of Mac OS Roman Unicode
/// equivalents. Every other code is returned as the Unicode scalar with the
/// same value, or as an empty string when that value is not a valid scalar.
/// Used when a font's /Encoding is explicitly set to MacRomanEncoding.
///
/// Code 0xDB maps to the currency sign U+00A4, not the Euro sign: Mac OS
/// later reassigned this code to the Euro, but PDF's MacRomanEncoding
/// deliberately keeps `currency` there.
fn decode_macroman(code: u32) -> String {
    /// Unicode equivalents for codes 0x80..=0xFF, indexed by `code - 0x80`.
    #[rustfmt::skip]
    const HIGH_HALF: [char; 128] = [
        // 0x80: Ä Å Ç É Ñ Ö Ü á
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}',
        // 0x88: à â ä ã å ç é è
        '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}',
        // 0x90: ê ë í ì î ï ñ ó
        '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}',
        // 0x98: ò ô ö õ ú ù û ü
        '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}',
        // 0xA0: † ° ¢ £ § • ¶ ß
        '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}',
        // 0xA8: ® © ™ ´ ¨ ≠ Æ Ø
        '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}',
        // 0xB0: ∞ ± ≤ ≥ ¥ µ ∂ ∑
        '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
        // 0xB8: ∏ π ∫ ª º Ω æ ø
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}', '\u{00F8}',
        // 0xC0: ¿ ¡ ¬ √ ƒ ≈ ∆ «
        '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}',
        // 0xC8: » … (non-breaking space) À Ã Õ Œ œ
        '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}',
        // 0xD0: – — “ ” ‘ ’ ÷ ◊
        '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}',
        // 0xD8: ÿ Ÿ ⁄ ¤ ‹ › (fi ligature) (fl ligature)
        '\u{00FF}', '\u{0178}', '\u{2044}', '\u{00A4}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
        // 0xE0: ‡ · ‚ „ ‰ Â Ê Á
        '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}',
        // 0xE8: Ë È Í Î Ï Ì Ó Ô
        '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
        // 0xF0: (Apple logo, private use) Ò Ú Û Ù ı ˆ ˜
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}',
        // 0xF8: ¯ ˘ ˙ ˚ ¸ ˝ ˛ ˇ
        '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}',
    ];

    match code {
        0x80..=0xFF => HIGH_HALF[(code - 0x80) as usize].to_string(),
        // ASCII and anything above one byte: identity mapping when valid.
        _ => char::from_u32(code)
            .map(String::from)
            .unwrap_or_default(),
    }
}
1193
/// Decode a character code using WinAnsiEncoding (Windows-1252 superset).
///
/// Codes in 0x80-0x9F that have a Windows-1252 assignment are remapped to the
/// corresponding Unicode scalar. Every other code (ASCII, the unassigned slots
/// in 0x80-0x9F, 0xA0-0xFF, and anything larger) is taken as the Unicode scalar
/// with the same value. An invalid scalar produces an empty string.
fn decode_winansi(code: u32) -> String {
    let scalar: u32 = match code {
        0x80 => 0x20AC, // Euro sign
        0x82 => 0x201A, // Single low-9 quotation mark
        0x83 => 0x0192, // Latin small letter f with hook
        0x84 => 0x201E, // Double low-9 quotation mark
        0x85 => 0x2026, // Horizontal ellipsis
        0x86 => 0x2020, // Dagger
        0x87 => 0x2021, // Double dagger
        0x88 => 0x02C6, // Modifier letter circumflex accent
        0x89 => 0x2030, // Per mille sign
        0x8A => 0x0160, // Latin capital letter S with caron
        0x8B => 0x2039, // Single left-pointing angle quotation mark
        0x8C => 0x0152, // Latin capital ligature OE
        0x8E => 0x017D, // Latin capital letter Z with caron
        0x91 => 0x2018, // Left single quotation mark
        0x92 => 0x2019, // Right single quotation mark
        0x93 => 0x201C, // Left double quotation mark
        0x94 => 0x201D, // Right double quotation mark
        0x95 => 0x2022, // Bullet
        0x96 => 0x2013, // En dash
        0x97 => 0x2014, // Em dash
        0x98 => 0x02DC, // Small tilde
        0x99 => 0x2122, // Trade mark sign
        0x9A => 0x0161, // Latin small letter s with caron
        0x9B => 0x203A, // Single right-pointing angle quotation mark
        0x9C => 0x0153, // Latin small ligature oe
        0x9E => 0x017E, // Latin small letter z with caron
        0x9F => 0x0178, // Latin capital letter Y with diaeresis
        // ASCII, Latin-1 Supplement (0xA0-0xFF) and everything else: identity.
        unchanged => unchanged,
    };
    char::from_u32(scalar)
        .map(String::from)
        .unwrap_or_default()
}
1245
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that each `(glyph name, expected text)` pair resolves as expected.
    fn assert_glyphs(cases: &[(&str, &str)]) {
        for &(glyph, expected) in cases {
            assert_eq!(
                glyph_name_to_unicode(glyph),
                Some(expected.to_string()),
                "glyph name {glyph:?}"
            );
        }
    }

    /// Assert that each `(code, expected text)` pair decodes as expected under WinAnsi.
    fn assert_winansi(cases: &[(u32, &str)]) {
        for &(code, expected) in cases {
            assert_eq!(decode_winansi(code), expected, "code {code:#04X}");
        }
    }

    /// Run `parse_cmap` over `data` and return the resulting code → text map.
    fn cmap_of(data: &[u8]) -> HashMap<u32, String> {
        let mut mapping = HashMap::new();
        parse_cmap(data, &mut mapping);
        mapping
    }

    #[test]
    fn test_default_font() {
        let font = PdfFont::default_font("F1");
        assert_eq!(font.name, "F1");
        assert!((font.default_width - 600.0).abs() < 1e-10);
    }

    #[test]
    fn test_bold_detection() {
        let mut font = PdfFont::default_font("F1");

        font.base_font = "Helvetica-Bold".to_string();
        assert!(font.is_bold());

        font.base_font = "Helvetica".to_string();
        assert!(!font.is_bold());
    }

    #[test]
    fn test_decode_winansi() {
        assert_winansi(&[(65, "A"), (0x93, "\u{201C}")]);
    }

    #[test]
    fn test_parse_hex_value() {
        for (token, expected) in [("<0041", 0x41), ("<00FF", 0xFF)] {
            assert_eq!(parse_hex_value(token), Some(expected));
        }
    }

    #[test]
    fn test_standard_font() {
        assert!(is_standard_font("Helvetica"));
        assert!(is_standard_font("Courier-Bold"));
        assert!(!is_standard_font("ArialMT"));
    }

    #[test]
    fn test_glyph_name_to_unicode_ligatures() {
        assert_glyphs(&[
            ("fi", "fi"),
            ("fl", "fl"),
            ("ff", "ff"),
            ("ffi", "ffi"),
            ("ffl", "ffl"),
        ]);
    }

    #[test]
    fn test_glyph_name_to_unicode_common() {
        assert_glyphs(&[
            ("percent", "%"),
            ("ampersand", "&"),
            ("parenleft", "("),
            ("endash", "\u{2013}"),
            ("A", "A"),
            ("uni0041", "A"),
        ]);
    }

    #[test]
    fn test_glyph_name_to_unicode_unknown() {
        assert_eq!(glyph_name_to_unicode("nonexistent_glyph_xyz"), None);
    }

    #[test]
    fn test_glyph_name_to_unicode_agl_extended() {
        // Characters only available via the full AGL, not the old hand-coded list
        assert_glyphs(&[
            ("Dcroat", "\u{0110}"),
            ("dcroat", "\u{0111}"),
            ("Emacron", "\u{0112}"),
            ("afii10017", "\u{0410}"), // Cyrillic А
            ("afii57636", "\u{20AA}"), // New Sheqel sign
            // Multi-codepoint entry (Hebrew dalet + hataf patah)
            ("dalethatafpatah", "\u{05D3}\u{05B2}"),
        ]);
    }

    #[test]
    fn test_glyph_name_to_unicode_uni_formats() {
        assert_glyphs(&[
            // uniXXXX format
            ("uni0041", "A"),
            ("uni00E9", "é"),
            // uniXXXXYYYY (sequence of BMP codepoints)
            ("uni00410042", "AB"),
            // uXXXXXX format (supplementary)
            ("u1F600", "\u{1F600}"),
        ]);
    }

    #[test]
    fn test_parse_cmap_bfrange_array() {
        // bfrange with array format: <start> <end> [<v1> <v2> <v3>]
        let mapping = cmap_of(b"beginbfrange\n<0001> <0003> [<0041> <0042> <0043>]\nendbfrange\n");
        for (code, expected) in [(1u32, "A"), (2, "B"), (3, "C")] {
            assert_eq!(mapping.get(&code), Some(&expected.to_string()));
        }
    }

    #[test]
    fn test_parse_cmap_bfrange_incremented() {
        // bfrange with incrementing hex: <start> <end> <dst>
        let mapping = cmap_of(b"beginbfrange\n<0041> <0043> <0061>\nendbfrange\n");
        for (code, expected) in [(0x41u32, "a"), (0x42, "b"), (0x43, "c")] {
            assert_eq!(mapping.get(&code), Some(&expected.to_string()));
        }
    }

    #[test]
    fn test_decode_winansi_extended() {
        assert_winansi(&[
            (0x82, "\u{201A}"), // Single low-9 quotation
            (0x83, "\u{0192}"), // f with hook
            (0x8A, "\u{0160}"), // S with caron
            (0x8C, "\u{0152}"), // OE ligature
            (0x99, "\u{2122}"), // Trademark
            (0x9C, "\u{0153}"), // oe ligature
            (0xA0, "\u{00A0}"), // NBSP
            (0xE9, "\u{00E9}"), // é
        ]);
    }

    #[test]
    fn test_tex_glyph_names() {
        // CMSY font glyph names (TeX-specific, not in AGL)
        assert_glyphs(&[
            ("asteriskmath", "\u{2217}"), // ∗
            ("diamondmath", "\u{22C4}"),  // ⋄
            ("minusplus", "\u{2213}"),    // ∓
            ("circleminus", "\u{2296}"),  // ⊖
            ("circledot", "\u{2299}"),    // ⊙
            ("follows", "\u{227B}"),      // ≻
            ("lessmuch", "\u{226A}"),     // ≪
            ("greatermuch", "\u{226B}"),  // ≫
            ("latticetop", "\u{22A4}"),   // ⊤
            ("mapsto", "\u{21A6}"),       // ↦
        ]);
        // These should still resolve via AGL
        assert_glyphs(&[
            ("dagger", "\u{2020}"),     // †
            ("daggerdbl", "\u{2021}"),  // ‡
            ("braceleft", "\u{007B}"),  // {
            ("braceright", "\u{007D}"), // }
        ]);
    }
}
edgeparse_core/pdf/font.rs

edgeparse_core/pdf/
font.rs