Skip to main content

folio_font/
encoding.rs

1//! PDF text encoding tables and decoding.
2//!
3//! PDF uses several legacy encodings for mapping character codes to glyphs.
4//! The most common are WinAnsiEncoding and MacRomanEncoding.
5
/// Known PDF text encodings.
///
/// The unit variants name predefined encodings; [`PdfEncoding::Custom`]
/// carries an explicit table of optional strings.
///
/// `Eq` is derived alongside `PartialEq` because every payload is `Eq`,
/// which lets the type be used wherever full equivalence is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PdfEncoding {
    /// Adobe's standard encoding for Type 1 fonts.
    StandardEncoding,
    /// The Mac OS Roman based encoding.
    MacRomanEncoding,
    /// The Windows code page 1252 based encoding.
    WinAnsiEncoding,
    /// The encoding named `PDFDocEncoding`.
    PDFDocEncoding,
    /// The encoding named `MacExpertEncoding`.
    MacExpertEncoding,
    /// Identity mapping.
    Identity,
    /// An explicit table of optional strings.
    // NOTE(review): presumably one slot per character code, like
    // `Encoding::table` — nothing in this file constructs it; confirm with callers.
    Custom(Vec<Option<String>>),
}
17
/// An encoding that can map character codes to Unicode strings.
///
/// `table` is indexed by character code; a `None` slot means the code has no
/// mapping. The type is `Clone` so a base encoding can be copied before it is
/// customised in place, and `PartialEq`/`Eq` so tables can be compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Encoding {
    table: Vec<Option<String>>,
}
23
24impl Encoding {
25    /// Create an encoding from a PDF encoding name.
26    pub fn from_name(name: &[u8]) -> Self {
27        match name {
28            b"WinAnsiEncoding" => Self::win_ansi(),
29            b"MacRomanEncoding" => Self::mac_roman(),
30            b"StandardEncoding" => Self::standard(),
31            b"ZapfDingbatsEncoding" => Self::zapf_dingbats(),
32            b"SymbolEncoding" => Self::symbol(),
33            _ => Self::win_ansi(),
34        }
35    }
36
37    /// Create WinAnsiEncoding (Windows code page 1252).
38    pub fn win_ansi() -> Self {
39        let mut table = vec![None; 256];
40        // Use Windows-1252 (cp1252) for the full 0-255 range.
41        // encoding_rs handles this correctly.
42        let cp1252_bytes: Vec<u8> = (0..=255u16).map(|i| i as u8).collect();
43        let (decoded, _, _) = encoding_rs::WINDOWS_1252.decode(&cp1252_bytes);
44        for (i, ch) in decoded.chars().enumerate() {
45            if i < 256 && (ch != '\0' || i == 0) {
46                table[i] = Some(ch.to_string());
47            }
48        }
49        // Ensure control characters are handled
50        table[0x09] = Some("\t".into());
51        table[0x0A] = Some("\n".into());
52        table[0x0D] = Some("\r".into());
53        table[0x20] = Some(" ".into());
54        Self { table }
55    }
56
57    /// Create MacRomanEncoding.
58    pub fn mac_roman() -> Self {
59        let mut table = vec![None; 256];
60        // Use macOS Roman encoding via encoding_rs.
61        let mac_bytes: Vec<u8> = (0..=255u16).map(|i| i as u8).collect();
62        let (decoded, _, _) = encoding_rs::MACINTOSH.decode(&mac_bytes);
63        for (i, ch) in decoded.chars().enumerate() {
64            if i < 256 {
65                table[i] = Some(ch.to_string());
66            }
67        }
68        table[0x09] = Some("\t".into());
69        table[0x0A] = Some("\n".into());
70        table[0x0D] = Some("\r".into());
71        table[0x20] = Some(" ".into());
72        Self { table }
73    }
74
75    /// Create StandardEncoding (Adobe's standard encoding for Type 1 fonts).
76    pub fn standard() -> Self {
77        // StandardEncoding: ASCII 0x20-0x7E same as ASCII, plus specific mappings
78        // for the 0x80-0xFF range. Use WinAnsi as base and override specific positions.
79        let mut enc = Self::win_ansi();
80        // Key differences from WinAnsi in the upper range:
81        enc.table[0x60] = Some("\u{2018}".into()); // quoteleft
82        enc.table[0x27] = Some("\u{2019}".into()); // quoteright
83        enc.table[0xA1] = Some("\u{00A1}".into()); // exclamdown
84        enc.table[0xA2] = Some("\u{00A2}".into()); // cent
85        enc.table[0xA3] = Some("\u{00A3}".into()); // sterling
86        enc.table[0xA4] = Some("\u{2044}".into()); // fraction
87        enc.table[0xA5] = Some("\u{00A5}".into()); // yen
88        enc.table[0xA6] = Some("\u{0192}".into()); // florin
89        enc.table[0xA7] = Some("\u{00A7}".into()); // section
90        enc.table[0xA8] = Some("\u{00A4}".into()); // currency
91        enc.table[0xAC] = Some("\u{FB01}".into()); // fi
92        enc.table[0xAD] = Some("\u{FB02}".into()); // fl
93        enc.table[0xB0] = Some("\u{2013}".into()); // endash
94        enc.table[0xB1] = Some("\u{2020}".into()); // dagger
95        enc.table[0xB2] = Some("\u{2021}".into()); // daggerdbl
96        enc.table[0xB3] = Some("\u{00B7}".into()); // periodcentered
97        enc.table[0xB7] = Some("\u{2022}".into()); // bullet
98        enc.table[0xB8] = Some("\u{201A}".into()); // quotesinglbase
99        enc.table[0xB9] = Some("\u{201E}".into()); // quotedblbase
100        enc.table[0xBA] = Some("\u{201D}".into()); // quotedblright
101        enc.table[0xBB] = Some("\u{00BB}".into()); // guillemotright
102        enc.table[0xBC] = Some("\u{2026}".into()); // ellipsis
103        enc.table[0xBD] = Some("\u{2030}".into()); // perthousand
104        enc.table[0xC1] = Some("\u{0060}".into()); // grave
105        enc.table[0xC2] = Some("\u{00B4}".into()); // acute
106        enc.table[0xC3] = Some("\u{02C6}".into()); // circumflex
107        enc.table[0xC4] = Some("\u{02DC}".into()); // tilde
108        enc.table[0xC5] = Some("\u{00AF}".into()); // macron
109        enc.table[0xC6] = Some("\u{02D8}".into()); // breve
110        enc.table[0xC7] = Some("\u{02D9}".into()); // dotaccent
111        enc.table[0xC8] = Some("\u{00A8}".into()); // dieresis
112        enc.table[0xCA] = Some("\u{02DA}".into()); // ring
113        enc.table[0xCB] = Some("\u{00B8}".into()); // cedilla
114        enc.table[0xCD] = Some("\u{02DD}".into()); // hungarumlaut
115        enc.table[0xCE] = Some("\u{02DB}".into()); // ogonek
116        enc.table[0xCF] = Some("\u{02C7}".into()); // caron
117        enc.table[0xD0] = Some("\u{2014}".into()); // emdash
118        enc.table[0xE1] = Some("\u{00C6}".into()); // AE
119        enc.table[0xE3] = Some("\u{00AA}".into()); // ordfeminine
120        enc.table[0xE8] = Some("\u{0141}".into()); // Lslash
121        enc.table[0xE9] = Some("\u{00D8}".into()); // Oslash
122        enc.table[0xEA] = Some("\u{0152}".into()); // OE
123        enc.table[0xEB] = Some("\u{00BA}".into()); // ordmasculine
124        enc.table[0xF1] = Some("\u{00E6}".into()); // ae
125        enc.table[0xF5] = Some("\u{0131}".into()); // dotlessi
126        enc.table[0xF8] = Some("\u{0142}".into()); // lslash
127        enc.table[0xF9] = Some("\u{00F8}".into()); // oslash
128        enc.table[0xFA] = Some("\u{0153}".into()); // oe
129        enc.table[0xFB] = Some("\u{00DF}".into()); // germandbls
130        enc
131    }
132
133    /// Create ZapfDingbats encoding.
134    ///
135    /// ZapfDingbats uses its own character set where ASCII letters map to
136    /// various symbols (arrows, stars, diamonds, etc.).
137    pub fn zapf_dingbats() -> Self {
138        let mut table = vec![None; 256];
139        // Space
140        table[0x20] = Some(" ".into());
141        // Key ZapfDingbats mappings (0x21-0x7E range = symbol glyphs)
142        let mappings: &[(u8, char)] = &[
143            (0x21, '\u{2701}'), // upper blade scissors
144            (0x22, '\u{2702}'), // black scissors
145            (0x23, '\u{2703}'), // lower blade scissors
146            (0x24, '\u{2704}'), // white scissors
147            (0x25, '\u{260E}'), // black telephone
148            (0x26, '\u{2706}'), // telephone location sign
149            (0x27, '\u{2707}'), // tape drive
150            (0x28, '\u{2708}'), // airplane
151            (0x29, '\u{2709}'), // envelope
152            (0x2A, '\u{261B}'), // black right pointing index
153            (0x2B, '\u{261E}'), // white right pointing index
154            (0x2C, '\u{270C}'), // victory hand
155            (0x2D, '\u{270D}'), // writing hand
156            (0x2E, '\u{270E}'), // lower right pencil
157            (0x2F, '\u{270F}'), // pencil
158            (0x30, '\u{2710}'), // upper right pencil
159            (0x31, '\u{2711}'), // white nib
160            (0x32, '\u{2712}'), // black nib
161            (0x33, '\u{2713}'), // check mark
162            (0x34, '\u{2714}'), // heavy check mark
163            (0x35, '\u{2715}'), // multiplication x
164            (0x36, '\u{2716}'), // heavy multiplication x
165            (0x37, '\u{2717}'), // ballot x
166            (0x38, '\u{2718}'), // heavy ballot x
167            (0x39, '\u{2719}'), // outlined Greek cross
168            (0x3A, '\u{271A}'), // heavy Greek cross
169            (0x3B, '\u{271B}'), // open centre cross
170            (0x3C, '\u{271C}'), // heavy open centre cross
171            (0x3D, '\u{271D}'), // Latin cross
172            (0x3E, '\u{271E}'), // shadowed white Latin cross
173            (0x3F, '\u{271F}'), // outlined Latin cross
174            (0x40, '\u{2720}'), // Maltese cross
175            (0x41, '\u{2721}'), // star of David
176            (0x42, '\u{2722}'), // four teardrop-spoked asterisk
177            (0x43, '\u{2723}'), // four balloon-spoked asterisk
178            (0x44, '\u{2724}'), // heavy four balloon-spoked asterisk
179            (0x45, '\u{2725}'), // four club-spoked asterisk
180            (0x46, '\u{2726}'), // black four pointed star
181            (0x47, '\u{2727}'), // white four pointed star
182            (0x48, '\u{2605}'), // black star
183            (0x49, '\u{2729}'), // stress outlined white star
184            (0x4A, '\u{272A}'), // circled white star
185            (0x4B, '\u{272B}'), // open centre black star
186            (0x4C, '\u{272C}'), // black centre white star
187            (0x4D, '\u{272D}'), // outlined black star
188            (0x4E, '\u{272E}'), // heavy outlined black star
189            (0x4F, '\u{272F}'), // pinwheel star
190            (0x50, '\u{2730}'), // shadowed white star
191            (0x51, '\u{2731}'), // heavy asterisk
192            (0x52, '\u{2732}'), // open centre asterisk
193            (0x53, '\u{2733}'), // eight spoked asterisk
194            (0x54, '\u{2734}'), // eight pointed black star
195            (0x55, '\u{2735}'), // eight pointed pinwheel star
196            (0x56, '\u{2736}'), // six pointed black star
197            (0x57, '\u{2737}'), // eight pointed rectilinear black star
198            (0x58, '\u{2738}'), // heavy eight pointed rectilinear black star
199            (0x59, '\u{2739}'), // twelve pointed black star
200            (0x5A, '\u{273A}'), // sixteen pointed asterisk
201            (0x5B, '\u{273B}'), // teardrop-spoked asterisk
202            (0x5C, '\u{273C}'), // open centre teardrop-spoked asterisk
203            (0x5D, '\u{273D}'), // heavy teardrop-spoked asterisk
204            (0x5E, '\u{273E}'), // six petalled black and white florette
205            (0x5F, '\u{273F}'), // black florette
206            (0x60, '\u{2740}'), // white florette
207            (0x61, '\u{2741}'), // eight petalled outlined black florette
208            (0x62, '\u{2742}'), // circled open centre eight pointed star
209            (0x63, '\u{2743}'), // heavy teardrop-spoked pinwheel asterisk
210            (0x64, '\u{2744}'), // snowflake
211            (0x65, '\u{2745}'), // tight trifoliate snowflake
212            (0x66, '\u{2746}'), // heavy chevron snowflake
213            (0x67, '\u{2747}'), // sparkle
214            (0x68, '\u{2748}'), // heavy sparkle
215            (0x69, '\u{2749}'), // balloon-spoked asterisk
216            (0x6A, '\u{274A}'), // eight teardrop-spoked propeller asterisk
217            (0x6B, '\u{274B}'), // heavy eight teardrop-spoked propeller asterisk
218            // Key mapping: 0x6C-0x7E
219            (0x6C, '\u{25CF}'), // black circle
220            (0x6D, '\u{274D}'), // shadowed white circle
221            (0x6E, '\u{25A0}'), // black square
222            (0x6F, '\u{274F}'), // lower right drop-shadowed white square
223            (0x70, '\u{2750}'), // upper right drop-shadowed white square
224            (0x71, '\u{2751}'), // lower right shadowed white square
225            (0x72, '\u{2752}'), // upper right shadowed white square
226            (0x73, '\u{25B2}'), // black up-pointing triangle
227            (0x74, '\u{25BC}'), // black down-pointing triangle
228            (0x75, '\u{25C6}'), // BLACK DIAMOND — the key mapping!
229            (0x76, '\u{2756}'), // black diamond minus white x
230            (0x77, '\u{25D7}'), // right half black circle
231            (0x78, '\u{2758}'), // light vertical bar
232            (0x79, '\u{2759}'), // medium vertical bar
233            (0x7A, '\u{275A}'), // heavy vertical bar
234            (0x7B, '\u{275B}'), // heavy single turned comma quotation mark ornament
235            (0x7C, '\u{275C}'), // heavy single comma quotation mark ornament
236            (0x7D, '\u{275D}'), // heavy double turned comma quotation mark ornament
237            (0x7E, '\u{275E}'), // heavy double comma quotation mark ornament
238        ];
239        for &(code, ch) in mappings {
240            table[code as usize] = Some(ch.to_string());
241        }
242        // 0x80-0x9F range (some dingbats continue here)
243        let high_mappings: &[(u8, char)] = &[
244            (0x80, '\u{2768}'),
245            (0x81, '\u{2769}'),
246            (0x82, '\u{276A}'),
247            (0x83, '\u{276B}'),
248            (0x84, '\u{276C}'),
249            (0x85, '\u{276D}'),
250            (0x86, '\u{276E}'),
251            (0x87, '\u{276F}'),
252            (0x88, '\u{2770}'),
253            (0x89, '\u{2771}'),
254            (0x8A, '\u{2772}'),
255            (0x8B, '\u{2773}'),
256            (0x8C, '\u{2774}'),
257            (0x8D, '\u{2775}'),
258            (0xA1, '\u{2761}'),
259            (0xA2, '\u{2762}'),
260            (0xA3, '\u{2763}'),
261            (0xA4, '\u{2764}'),
262            (0xA5, '\u{2765}'),
263            (0xA6, '\u{2766}'),
264            (0xA7, '\u{2767}'),
265            (0xB1, '\u{2460}'),
266            (0xB2, '\u{2461}'),
267            (0xB3, '\u{2462}'),
268            (0xB4, '\u{2463}'),
269            (0xB5, '\u{2464}'),
270            (0xB6, '\u{2465}'),
271            (0xB7, '\u{2466}'),
272            (0xB8, '\u{2467}'),
273            (0xB9, '\u{2468}'),
274            (0xBA, '\u{2469}'),
275            (0xC0, '\u{2776}'),
276            (0xC1, '\u{2777}'),
277            (0xC2, '\u{2778}'),
278            (0xC3, '\u{2779}'),
279            (0xC4, '\u{277A}'),
280            (0xC5, '\u{277B}'),
281            (0xC6, '\u{277C}'),
282            (0xC7, '\u{277D}'),
283            (0xC8, '\u{277E}'),
284            (0xC9, '\u{277F}'),
285            (0xD1, '\u{2780}'),
286            (0xD2, '\u{2781}'),
287            (0xD3, '\u{2782}'),
288            (0xD4, '\u{2783}'),
289            (0xD5, '\u{2784}'),
290            (0xD6, '\u{2785}'),
291            (0xD7, '\u{2786}'),
292            (0xD8, '\u{2787}'),
293            (0xD9, '\u{2788}'),
294            (0xDA, '\u{2789}'),
295            (0xE1, '\u{278A}'),
296            (0xE2, '\u{278B}'),
297            (0xE3, '\u{278C}'),
298            (0xE4, '\u{278D}'),
299            (0xE5, '\u{278E}'),
300            (0xE6, '\u{278F}'),
301            (0xE7, '\u{2790}'),
302            (0xE8, '\u{2791}'),
303            (0xE9, '\u{2792}'),
304            (0xEA, '\u{2793}'),
305            (0xF1, '\u{2794}'),
306            (0xF2, '\u{2192}'), // rightwards arrow
307            (0xF3, '\u{2194}'), // left right arrow
308            (0xF4, '\u{2195}'), // up down arrow
309        ];
310        for &(code, ch) in high_mappings {
311            table[code as usize] = Some(ch.to_string());
312        }
313        Self { table }
314    }
315
316    /// Create Symbol encoding (Adobe Symbol font).
317    pub fn symbol() -> Self {
318        let mut table = vec![None; 256];
319        // ASCII printable range is mostly identity for Symbol
320        // but with Greek letters and math symbols instead of Latin
321        table[0x20] = Some(" ".into());
322        let mappings: &[(u8, char)] = &[
323            (0x21, '!'),
324            (0x22, '\u{2200}'), // for all
325            (0x23, '#'),
326            (0x24, '\u{2203}'), // there exists
327            (0x25, '%'),
328            (0x26, '&'),
329            (0x27, '\u{220B}'), // contains
330            (0x28, '('),
331            (0x29, ')'),
332            (0x2A, '*'),
333            (0x2B, '+'),
334            (0x2C, ','),
335            (0x2D, '\u{2212}'), // minus sign
336            (0x2E, '.'),
337            (0x2F, '/'),
338            (0x30, '0'),
339            (0x31, '1'),
340            (0x32, '2'),
341            (0x33, '3'),
342            (0x34, '4'),
343            (0x35, '5'),
344            (0x36, '6'),
345            (0x37, '7'),
346            (0x38, '8'),
347            (0x39, '9'),
348            (0x3A, ':'),
349            (0x3B, ';'),
350            (0x3C, '<'),
351            (0x3D, '='),
352            (0x3E, '>'),
353            (0x3F, '?'),
354            (0x40, '\u{2245}'), // approximately equal
355            (0x41, '\u{0391}'), // Alpha
356            (0x42, '\u{0392}'), // Beta
357            (0x43, '\u{03A7}'), // Chi
358            (0x44, '\u{0394}'), // Delta
359            (0x45, '\u{0395}'), // Epsilon
360            (0x46, '\u{03A6}'), // Phi
361            (0x47, '\u{0393}'), // Gamma
362            (0x48, '\u{0397}'), // Eta
363            (0x49, '\u{0399}'), // Iota
364            (0x4B, '\u{039A}'), // Kappa
365            (0x4C, '\u{039B}'), // Lambda
366            (0x4D, '\u{039C}'), // Mu
367            (0x4E, '\u{039D}'), // Nu
368            (0x4F, '\u{039F}'), // Omicron
369            (0x50, '\u{03A0}'), // Pi
370            (0x51, '\u{0398}'), // Theta
371            (0x52, '\u{03A1}'), // Rho
372            (0x53, '\u{03A3}'), // Sigma
373            (0x54, '\u{03A4}'), // Tau
374            (0x55, '\u{03A5}'), // Upsilon
375            (0x57, '\u{03A9}'), // Omega
376            (0x58, '\u{039E}'), // Xi
377            (0x59, '\u{03A8}'), // Psi
378            (0x5A, '\u{0396}'), // Zeta
379            (0x5B, '['),
380            (0x5D, ']'),
381            (0x5E, '\u{22A5}'), // perpendicular
382            (0x5F, '_'),
383            (0x61, '\u{03B1}'), // alpha
384            (0x62, '\u{03B2}'), // beta
385            (0x63, '\u{03C7}'), // chi
386            (0x64, '\u{03B4}'), // delta
387            (0x65, '\u{03B5}'), // epsilon
388            (0x66, '\u{03C6}'), // phi
389            (0x67, '\u{03B3}'), // gamma
390            (0x68, '\u{03B7}'), // eta
391            (0x69, '\u{03B9}'), // iota
392            (0x6B, '\u{03BA}'), // kappa
393            (0x6C, '\u{03BB}'), // lambda
394            (0x6D, '\u{03BC}'), // mu
395            (0x6E, '\u{03BD}'), // nu
396            (0x6F, '\u{03BF}'), // omicron
397            (0x70, '\u{03C0}'), // pi
398            (0x71, '\u{03B8}'), // theta
399            (0x72, '\u{03C1}'), // rho
400            (0x73, '\u{03C3}'), // sigma
401            (0x74, '\u{03C4}'), // tau
402            (0x75, '\u{03C5}'), // upsilon
403            (0x77, '\u{03C9}'), // omega
404            (0x78, '\u{03BE}'), // xi
405            (0x79, '\u{03C8}'), // psi
406            (0x7A, '\u{03B6}'), // zeta
407            (0x7B, '{'),
408            (0x7C, '|'),
409            (0x7D, '}'),
410            (0x7E, '\u{223C}'), // tilde operator
411            (0xA0, '\u{20AC}'), // Euro
412            (0xB1, '\u{00B1}'), // plus-minus
413            (0xB4, '\u{00D7}'), // multiplication
414            (0xB5, '\u{221D}'), // proportional to
415            (0xB6, '\u{2202}'), // partial differential
416            (0xB7, '\u{2022}'), // bullet
417            (0xB8, '\u{00F7}'), // division
418            (0xB9, '\u{2260}'), // not equal
419            (0xBA, '\u{2261}'), // identical
420            (0xBB, '\u{2248}'), // almost equal
421            (0xBC, '\u{2026}'), // ellipsis
422            (0xC0, '\u{2135}'), // alef
423            (0xC1, '\u{2111}'), // imaginary part
424            (0xC2, '\u{211C}'), // real part
425            (0xC3, '\u{2118}'), // Weierstrass p
426            (0xC5, '\u{2297}'), // circled times
427            (0xC6, '\u{2295}'), // circled plus
428            (0xC7, '\u{2205}'), // empty set
429            (0xC8, '\u{2229}'), // intersection
430            (0xC9, '\u{222A}'), // union
431            (0xCA, '\u{2283}'), // superset
432            (0xCB, '\u{2287}'), // superset or equal
433            (0xCC, '\u{2284}'), // not subset
434            (0xCD, '\u{2282}'), // subset
435            (0xCE, '\u{2286}'), // subset or equal
436            (0xCF, '\u{2208}'), // element of
437            (0xD0, '\u{2209}'), // not element of
438            (0xD1, '\u{2220}'), // angle
439            (0xD2, '\u{2207}'), // nabla
440            (0xD5, '\u{220F}'), // product
441            (0xD6, '\u{221A}'), // square root
442            (0xE0, '\u{25CA}'), // lozenge
443            (0xE5, '\u{2211}'), // summation
444            (0xF2, '\u{222B}'), // integral
445            (0xF5, '\u{221E}'), // infinity
446        ];
447        for &(code, ch) in mappings {
448            table[code as usize] = Some(ch.to_string());
449        }
450        Self { table }
451    }
452
453    /// Apply a Differences array to modify this encoding.
454    pub fn apply_differences(&mut self, differences: &[folio_cos::PdfObject]) {
455        let mut code = 0u16;
456        for item in differences {
457            match item {
458                folio_cos::PdfObject::Integer(n) => code = *n as u16,
459                folio_cos::PdfObject::Name(name) => {
460                    if (code as usize) < self.table.len() {
461                        let unicode = glyph_name_to_unicode(name);
462                        self.table[code as usize] = Some(unicode);
463                    }
464                    code += 1;
465                }
466                _ => {}
467            }
468        }
469    }
470
471    /// Map a character code to a Unicode string.
472    pub fn decode_char(&self, code: u8) -> Option<&str> {
473        self.table.get(code as usize).and_then(|s| s.as_deref())
474    }
475
476    /// Decode a byte sequence to a Unicode string.
477    pub fn decode_bytes(&self, data: &[u8]) -> String {
478        let mut result = String::new();
479        for &byte in data {
480            match self.decode_char(byte) {
481                Some(s) => result.push_str(s),
482                None => result.push(char::REPLACEMENT_CHARACTER),
483            }
484        }
485        result
486    }
487}
488
/// Map a glyph name to its Unicode string representation.
///
/// Covers the Adobe Glyph List (AGL) entries most commonly found in PDFs.
///
/// Resolution order:
/// 1. the fixed name table in [`named_glyph`];
/// 2. `uni` followed by one or more groups of four hex digits
///    (`uni0041` -> "A", `uni00660069` -> "fi");
/// 3. `u` followed by four or more hex digits naming one scalar value
///    (`u1F600`);
/// 4. otherwise the name itself is returned unchanged, which also covers
///    single-letter names such as `A`.
///
/// A name that is not valid UTF-8 yields the empty string.
fn glyph_name_to_unicode(name: &[u8]) -> String {
    let name_str = std::str::from_utf8(name).unwrap_or("");
    if let Some(mapped) = named_glyph(name_str) {
        return mapped.to_owned();
    }
    parse_uni_glyph_name(name_str)
        .or_else(|| parse_u_glyph_name(name_str))
        .unwrap_or_else(|| name_str.to_owned())
}

/// Look up a glyph name in the fixed name table.
fn named_glyph(name: &str) -> Option<&'static str> {
    let mapped = match name {
        // Basic ASCII-named glyphs
        "space" | "nbspace" => " ",
        "exclam" => "!",
        "quotedbl" => "\"",
        "numbersign" => "#",
        "dollar" => "$",
        "percent" => "%",
        "ampersand" => "&",
        "quotesingle" => "'",
        "parenleft" => "(",
        "parenright" => ")",
        "asterisk" => "*",
        "plus" => "+",
        "comma" => ",",
        "hyphen" | "minus" | "hyphenchar" => "-",
        "period" => ".",
        "slash" => "/",
        "zero" => "0",
        "one" => "1",
        "two" => "2",
        "three" => "3",
        "four" => "4",
        "five" => "5",
        "six" => "6",
        "seven" => "7",
        "eight" => "8",
        "nine" => "9",
        "colon" => ":",
        "semicolon" => ";",
        "less" => "<",
        "equal" => "=",
        "greater" => ">",
        "question" => "?",
        "at" => "@",
        "bracketleft" => "[",
        "backslash" => "\\",
        "bracketright" => "]",
        "asciicircum" => "^",
        "underscore" => "_",
        // `grave` is the spacing grave accent, not the left single quote.
        "grave" => "\u{0060}",
        "quoteleft" => "\u{2018}",
        "braceleft" => "{",
        "bar" => "|",
        "braceright" => "}",
        "asciitilde" => "~",

        // Typographic quotes and dashes
        "quoteright" => "\u{2019}",
        "quotedblleft" => "\u{201C}",
        "quotedblright" => "\u{201D}",
        "quotedblbase" => "\u{201E}",
        "quotesinglbase" => "\u{201A}",
        "guillemotleft" => "\u{00AB}",
        "guillemotright" => "\u{00BB}",
        // The single guillemets have their own code points.
        "guilsinglleft" => "\u{2039}",
        "guilsinglright" => "\u{203A}",

        "endash" => "\u{2013}",
        "emdash" => "\u{2014}",
        "bullet" => "\u{2022}",
        "ellipsis" => "\u{2026}",
        "dagger" => "\u{2020}",
        "daggerdbl" => "\u{2021}",
        "perthousand" => "\u{2030}",
        "trademark" => "\u{2122}",
        "copyright" => "\u{00A9}",
        "registered" => "\u{00AE}",

        // Special characters
        "fi" => "\u{FB01}",
        "fl" => "\u{FB02}",
        "ff" => "\u{FB00}",
        "ffi" => "\u{FB03}",
        "ffl" => "\u{FB04}",
        "lozenge" => "\u{25CA}",
        "Euro" => "\u{20AC}",
        "degree" => "\u{00B0}",
        "section" => "\u{00A7}",
        "paragraph" | "pilcrow" => "\u{00B6}",
        "fraction" => "\u{2044}",
        "florin" => "\u{0192}",

        // Latin-1 punctuation, currency and fractions
        "exclamdown" => "\u{00A1}",
        "cent" => "\u{00A2}",
        "sterling" => "\u{00A3}",
        "currency" => "\u{00A4}",
        "yen" => "\u{00A5}",
        "brokenbar" => "\u{00A6}",
        "ordfeminine" => "\u{00AA}",
        "ordmasculine" => "\u{00BA}",
        "onesuperior" => "\u{00B9}",
        "twosuperior" => "\u{00B2}",
        "threesuperior" => "\u{00B3}",
        "onequarter" => "\u{00BC}",
        "onehalf" => "\u{00BD}",
        "threequarters" => "\u{00BE}",
        "questiondown" => "\u{00BF}",

        // Accented Latin characters
        "Agrave" => "\u{00C0}",
        "Aacute" => "\u{00C1}",
        "Acircumflex" => "\u{00C2}",
        "Atilde" => "\u{00C3}",
        "Adieresis" => "\u{00C4}",
        "Aring" => "\u{00C5}",
        "AE" => "\u{00C6}",
        "Ccedilla" => "\u{00C7}",
        "Egrave" => "\u{00C8}",
        "Eacute" => "\u{00C9}",
        "Ecircumflex" => "\u{00CA}",
        "Edieresis" => "\u{00CB}",
        "Igrave" => "\u{00CC}",
        "Iacute" => "\u{00CD}",
        "Icircumflex" => "\u{00CE}",
        "Idieresis" => "\u{00CF}",
        "Eth" => "\u{00D0}",
        "Ntilde" => "\u{00D1}",
        "Ograve" => "\u{00D2}",
        "Oacute" => "\u{00D3}",
        "Ocircumflex" => "\u{00D4}",
        "Otilde" => "\u{00D5}",
        "Odieresis" => "\u{00D6}",
        "Oslash" => "\u{00D8}",
        "Ugrave" => "\u{00D9}",
        "Uacute" => "\u{00DA}",
        "Ucircumflex" => "\u{00DB}",
        "Udieresis" => "\u{00DC}",
        "Yacute" => "\u{00DD}",
        "Thorn" => "\u{00DE}",
        "germandbls" => "\u{00DF}",
        "agrave" => "\u{00E0}",
        "aacute" => "\u{00E1}",
        "acircumflex" => "\u{00E2}",
        "atilde" => "\u{00E3}",
        "adieresis" => "\u{00E4}",
        "aring" => "\u{00E5}",
        "ae" => "\u{00E6}",
        "ccedilla" => "\u{00E7}",
        "egrave" => "\u{00E8}",
        "eacute" => "\u{00E9}",
        "ecircumflex" => "\u{00EA}",
        "edieresis" => "\u{00EB}",
        "igrave" => "\u{00EC}",
        "iacute" => "\u{00ED}",
        "icircumflex" => "\u{00EE}",
        "idieresis" => "\u{00EF}",
        "eth" => "\u{00F0}",
        "ntilde" => "\u{00F1}",
        "ograve" => "\u{00F2}",
        "oacute" => "\u{00F3}",
        "ocircumflex" => "\u{00F4}",
        "otilde" => "\u{00F5}",
        "odieresis" => "\u{00F6}",
        "oslash" => "\u{00F8}",
        "ugrave" => "\u{00F9}",
        "uacute" => "\u{00FA}",
        "ucircumflex" => "\u{00FB}",
        "udieresis" => "\u{00FC}",
        "yacute" => "\u{00FD}",
        "thorn" => "\u{00FE}",
        "ydieresis" => "\u{00FF}",

        // Greek uppercase
        "Alpha" => "\u{0391}",
        "Beta" => "\u{0392}",
        "Gamma" => "\u{0393}",
        "Delta" => "\u{0394}",
        "Epsilon" => "\u{0395}",
        "Zeta" => "\u{0396}",
        "Eta" => "\u{0397}",
        "Theta" => "\u{0398}",
        "Iota" => "\u{0399}",
        "Kappa" => "\u{039A}",
        "Lambda" => "\u{039B}",
        "Mu" => "\u{039C}",
        "Nu" => "\u{039D}",
        "Xi" => "\u{039E}",
        "Omicron" => "\u{039F}",
        "Pi" => "\u{03A0}",
        "Rho" => "\u{03A1}",
        "Sigma" => "\u{03A3}",
        "Tau" => "\u{03A4}",
        "Upsilon" => "\u{03A5}",
        "Phi" => "\u{03A6}",
        "Chi" => "\u{03A7}",
        "Psi" => "\u{03A8}",
        "Omega" => "\u{03A9}",

        // Greek lowercase
        "alpha" => "\u{03B1}",
        "beta" => "\u{03B2}",
        "gamma" => "\u{03B3}",
        "delta" => "\u{03B4}",
        "epsilon" | "varepsilon" => "\u{03B5}",
        "zeta" => "\u{03B6}",
        "eta" => "\u{03B7}",
        "theta" => "\u{03B8}",
        "iota" => "\u{03B9}",
        "kappa" => "\u{03BA}",
        "lambda" => "\u{03BB}",
        "mu" => "\u{03BC}",
        "nu" => "\u{03BD}",
        "xi" => "\u{03BE}",
        "omicron" => "\u{03BF}",
        "pi" => "\u{03C0}",
        "rho" => "\u{03C1}",
        "sigma" => "\u{03C3}",
        "varsigma" | "sigmafinal" => "\u{03C2}",
        "tau" => "\u{03C4}",
        "upsilon" => "\u{03C5}",
        "phi" | "varphi" => "\u{03C6}",
        "chi" => "\u{03C7}",
        "psi" => "\u{03C8}",
        "omega" => "\u{03C9}",
        "vartheta" => "\u{03D1}",
        "varpi" => "\u{03D6}",

        // Math / symbol
        "multiply" | "times" => "\u{00D7}",
        "divide" => "\u{00F7}",
        "plusminus" => "\u{00B1}",
        "minusmath" => "\u{2212}",
        "notequal" => "\u{2260}",
        "lessequal" | "leq" => "\u{2264}",
        "greaterequal" | "geq" => "\u{2265}",
        "infinity" => "\u{221E}",
        "summation" => "\u{2211}",
        "product" => "\u{220F}",
        "integral" => "\u{222B}",
        "radical" | "sqrt" => "\u{221A}",
        "approxequal" | "approx" => "\u{2248}",
        "nabla" | "gradient" => "\u{2207}",
        "partial" | "partialdiff" => "\u{2202}",
        "element" | "in" => "\u{2208}",
        "notelement" | "notin" => "\u{2209}",
        "propersubset" | "subset" => "\u{2282}",
        "propersuperset" | "superset" => "\u{2283}",
        "reflexsubset" | "subseteq" => "\u{2286}",
        "reflexsuperset" | "supseteq" => "\u{2287}",
        "union" | "cup" => "\u{222A}",
        "intersection" | "cap" => "\u{2229}",
        "emptyset" => "\u{2205}",
        "forall" | "universal" => "\u{2200}",
        "existential" | "exists" => "\u{2203}",
        "logicaland" | "wedge" => "\u{2227}",
        "logicalor" | "vee" => "\u{2228}",
        "logicalnot" | "neg" => "\u{00AC}",
        "therefore" => "\u{2234}",
        "because" => "\u{2235}",
        "equivalence" | "equiv" => "\u{2261}",
        "proportional" | "propto" => "\u{221D}",
        "perpendicular" | "perp" => "\u{22A5}",
        "angle" => "\u{2220}",
        "arrowleft" | "leftarrow" => "\u{2190}",
        "arrowup" | "uparrow" => "\u{2191}",
        "arrowright" | "rightarrow" => "\u{2192}",
        "arrowdown" | "downarrow" => "\u{2193}",
        "arrowboth" | "leftrightarrow" => "\u{2194}",
        "Arrowleft" | "Leftarrow" => "\u{21D0}",
        "Arrowright" | "Rightarrow" => "\u{21D2}",
        "Arrowboth" | "Leftrightarrow" => "\u{21D4}",
        "aleph" => "\u{2135}",
        "Ifraktur" | "Im" => "\u{2111}",
        "Rfraktur" | "Re" => "\u{211C}",
        "weierstrass" | "wp" => "\u{2118}",
        "circleplus" | "oplus" => "\u{2295}",
        "circlemultiply" | "otimes" => "\u{2297}",
        "dotmath" | "cdot" | "periodcentered" | "middot" => "\u{00B7}",
        "star" => "\u{22C6}",

        // Other Latin
        "Lslash" => "\u{0141}",
        "lslash" => "\u{0142}",
        "OE" => "\u{0152}",
        "oe" => "\u{0153}",
        "Scaron" => "\u{0160}",
        "scaron" => "\u{0161}",
        "Zcaron" => "\u{017D}",
        "zcaron" => "\u{017E}",
        "Ydieresis" => "\u{0178}",
        "dotlessi" => "\u{0131}",
        "circumflex" => "\u{02C6}",
        "tilde" => "\u{02DC}",
        "breve" => "\u{02D8}",
        "dotaccent" => "\u{02D9}",
        "ring" => "\u{02DA}",
        "cedilla" => "\u{00B8}",
        "hungarumlaut" => "\u{02DD}",
        "ogonek" => "\u{02DB}",
        "caron" => "\u{02C7}",
        "macron" => "\u{00AF}",
        "dieresis" => "\u{00A8}",
        "acute" => "\u{00B4}",

        // Shapes
        "blackdiamond" | "diamond" => "\u{25C6}",
        "filledbox" | "blacksquare" => "\u{25A0}",
        "filledcircle" | "blackcircle" => "\u{25CF}",
        "opendiamond" => "\u{25C7}",
        "openbox" | "square" => "\u{25A1}",
        "circle" | "opencircle" => "\u{25CB}",
        "triagup" => "\u{25B2}",
        "triagdn" => "\u{25BC}",

        _ => return None,
    };
    Some(mapped)
}

/// Decode the `uniXXXX[XXXX...]` form: one scalar value per group of four
/// hex digits. Returns `None` unless every group is a valid scalar value.
fn parse_uni_glyph_name(name: &str) -> Option<String> {
    let digits = name.strip_prefix("uni")?;
    // Checking the digits up front rejects a sign, which the integer parser
    // below would otherwise accept.
    let well_formed = !digits.is_empty()
        && digits.len() % 4 == 0
        && digits.bytes().all(|b| b.is_ascii_hexdigit());
    if !well_formed {
        return None;
    }
    digits
        .as_bytes()
        .chunks(4)
        .map(|group| {
            let group = std::str::from_utf8(group).ok()?;
            let scalar = u32::from_str_radix(group, 16).ok()?;
            char::from_u32(scalar)
        })
        .collect()
}

/// Decode the `uXXXX` form: `u` plus four or more hex digits naming a single
/// scalar value. Returns `None` for non-hex input, overflow or an invalid
/// scalar value.
fn parse_u_glyph_name(name: &str) -> Option<String> {
    let digits = name.strip_prefix('u')?;
    if digits.len() < 4 || !digits.bytes().all(|b| b.is_ascii_hexdigit()) {
        return None;
    }
    let scalar = u32::from_str_radix(digits, 16).ok()?;
    char::from_u32(scalar).map(|c| c.to_string())
}
805
806/// Decode text bytes using the appropriate encoding for a font.
807pub fn decode_text(
808    data: &[u8],
809    encoding: &Encoding,
810    tounicode: Option<&super::cmap::ToUnicodeCMap>,
811) -> String {
812    if let Some(cmap) = tounicode {
813        return cmap.decode(data);
814    }
815    encoding.decode_bytes(data)
816}