pdfplumber_core/
encoding.rs

//! Standard PDF text encodings and encoding resolution.
//!
//! Implements WinAnsiEncoding, MacRomanEncoding, MacExpertEncoding,
//! StandardEncoding, Differences array handling, and the encoding
//! resolution order: ToUnicode > explicit Encoding > implicit/default.
6
7use std::collections::HashMap;
8
9/// A named standard PDF encoding.
10#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11pub enum StandardEncoding {
12    /// WinAnsiEncoding — Windows code page 1252 superset.
13    WinAnsi,
14    /// MacRomanEncoding — Classic Mac OS Roman.
15    MacRoman,
16    /// MacExpertEncoding — Mac expert glyph set (small caps, fractions, etc.).
17    MacExpert,
18    /// StandardEncoding — Adobe standard Latin encoding.
19    Standard,
20}
21
22impl StandardEncoding {
23    /// Look up the Unicode character for a given byte code in this encoding.
24    ///
25    /// Returns `None` for undefined code points (e.g., 0x80–0x8F in WinAnsi
26    /// that are undefined, or codes with no mapping).
27    pub fn decode(&self, code: u8) -> Option<char> {
28        let table = match self {
29            StandardEncoding::WinAnsi => &WIN_ANSI_TABLE,
30            StandardEncoding::MacRoman => &MAC_ROMAN_TABLE,
31            StandardEncoding::MacExpert => &MAC_EXPERT_TABLE,
32            StandardEncoding::Standard => &STANDARD_TABLE,
33        };
34        table[code as usize]
35    }
36
37    /// Decode a byte string into a Unicode string using this encoding.
38    ///
39    /// Bytes with no mapping are replaced with U+FFFD (replacement character).
40    pub fn decode_bytes(&self, bytes: &[u8]) -> String {
41        bytes
42            .iter()
43            .map(|&b| self.decode(b).unwrap_or('\u{FFFD}'))
44            .collect()
45    }
46}
47
48/// An encoding table that may be a standard encoding modified by a Differences array.
49///
50/// This represents the /Encoding entry in a PDF font dictionary, which can be:
51/// - A name referring to a standard encoding (e.g., /WinAnsiEncoding)
52/// - A dictionary with /BaseEncoding + /Differences array
53#[derive(Debug, Clone)]
54pub struct FontEncoding {
55    /// The base encoding table (256 entries, indexed by character code).
56    table: [Option<char>; 256],
57}
58
59impl FontEncoding {
60    /// Create a `FontEncoding` from a standard encoding.
61    pub fn from_standard(encoding: StandardEncoding) -> Self {
62        let table = match encoding {
63            StandardEncoding::WinAnsi => WIN_ANSI_TABLE,
64            StandardEncoding::MacRoman => MAC_ROMAN_TABLE,
65            StandardEncoding::MacExpert => MAC_EXPERT_TABLE,
66            StandardEncoding::Standard => STANDARD_TABLE,
67        };
68        Self { table }
69    }
70
71    /// Create a `FontEncoding` from a standard encoding with Differences applied.
72    ///
73    /// The `differences` slice contains pairs of `(code, character)` that override
74    /// the base encoding. This matches the PDF /Differences array format:
75    /// `[code1 /name1 /name2 ... codeN /nameN ...]` where each code starts a run
76    /// of consecutive overrides.
77    pub fn from_standard_with_differences(
78        encoding: StandardEncoding,
79        differences: &[(u8, char)],
80    ) -> Self {
81        let mut enc = Self::from_standard(encoding);
82        enc.apply_differences(differences);
83        enc
84    }
85
86    /// Create a `FontEncoding` from a custom table (256 entries).
87    pub fn from_table(table: [Option<char>; 256]) -> Self {
88        Self { table }
89    }
90
91    /// Apply Differences array overrides to this encoding.
92    pub fn apply_differences(&mut self, differences: &[(u8, char)]) {
93        for &(code, ch) in differences {
94            self.table[code as usize] = Some(ch);
95        }
96    }
97
98    /// Decode a single byte code to a Unicode character.
99    pub fn decode(&self, code: u8) -> Option<char> {
100        self.table[code as usize]
101    }
102
103    /// Decode a byte string into a Unicode string.
104    ///
105    /// Bytes with no mapping are replaced with U+FFFD (replacement character).
106    pub fn decode_bytes(&self, bytes: &[u8]) -> String {
107        bytes
108            .iter()
109            .map(|&b| self.decode(b).unwrap_or('\u{FFFD}'))
110            .collect()
111    }
112}
113
114/// Resolved encoding for a font, following PDF encoding resolution order.
115///
116/// Resolution order:
117/// 1. ToUnicode CMap (highest priority — direct CID/code → Unicode mapping)
118/// 2. Explicit /Encoding from font dictionary (standard name or dictionary with Differences)
119/// 3. Implicit/default encoding (typically StandardEncoding for Type1, identity for TrueType)
120#[derive(Debug, Clone)]
121pub struct EncodingResolver {
122    /// ToUnicode mappings (code → Unicode string). Highest priority.
123    to_unicode: Option<HashMap<u16, String>>,
124    /// Explicit font encoding (from /Encoding entry). Second priority.
125    font_encoding: Option<FontEncoding>,
126    /// Default/fallback encoding. Lowest priority.
127    default_encoding: FontEncoding,
128}
129
130impl EncodingResolver {
131    /// Create a resolver with only a default encoding.
132    pub fn new(default_encoding: FontEncoding) -> Self {
133        Self {
134            to_unicode: None,
135            font_encoding: None,
136            default_encoding,
137        }
138    }
139
140    /// Set the ToUnicode CMap mappings (highest priority).
141    pub fn with_to_unicode(mut self, to_unicode: HashMap<u16, String>) -> Self {
142        self.to_unicode = Some(to_unicode);
143        self
144    }
145
146    /// Set the explicit font encoding (second priority).
147    pub fn with_font_encoding(mut self, encoding: FontEncoding) -> Self {
148        self.font_encoding = Some(encoding);
149        self
150    }
151
152    /// Resolve a character code to a Unicode string.
153    ///
154    /// Follows the resolution order:
155    /// 1. ToUnicode CMap (if present and has mapping for this code)
156    /// 2. Explicit font encoding (if present)
157    /// 3. Default encoding
158    ///
159    /// Returns `None` only if no encoding level has a mapping.
160    pub fn resolve(&self, code: u16) -> Option<String> {
161        // 1. ToUnicode CMap (highest priority)
162        if let Some(ref to_unicode) = self.to_unicode {
163            if let Some(s) = to_unicode.get(&code) {
164                return Some(s.clone());
165            }
166        }
167
168        // For single-byte codes, try font encoding and default
169        if code <= 255 {
170            let byte = code as u8;
171
172            // 2. Explicit font encoding
173            if let Some(ref enc) = self.font_encoding {
174                if let Some(ch) = enc.decode(byte) {
175                    return Some(ch.to_string());
176                }
177            }
178
179            // 3. Default encoding
180            if let Some(ch) = self.default_encoding.decode(byte) {
181                return Some(ch.to_string());
182            }
183        }
184
185        None
186    }
187
188    /// Decode a byte string using the resolution chain.
189    ///
190    /// Each byte is resolved independently. Unresolved bytes become U+FFFD.
191    pub fn decode_bytes(&self, bytes: &[u8]) -> String {
192        bytes
193            .iter()
194            .map(|&b| {
195                self.resolve(b as u16)
196                    .unwrap_or_else(|| "\u{FFFD}".to_string())
197            })
198            .collect()
199    }
200}
201
// =============================================================================
// Encoding tables
// =============================================================================
205
/// WinAnsiEncoding lookup table, indexed by byte code (based on Windows code
/// page 1252).
///
/// Layout:
/// - 0x00–0x7E decode to the Unicode code point with the same value
///   (C0 controls followed by printable ASCII).
/// - 0x7F has no mapping.
/// - 0x80–0x9F hold the Windows-1252 extensions; 0x81, 0x8D, 0x8F, 0x90 and
///   0x9D have no mapping.
/// - 0xA0–0xFF decode to the Unicode code point with the same value
///   (the ISO 8859-1 upper half).
static WIN_ANSI_TABLE: [Option<char>; 256] = {
    // Entries for codes 0x80–0x9F, in code order.
    const CP1252_EXTENSIONS: [Option<char>; 32] = [
        Some('\u{20AC}'), // 0x80 Euro sign
        None,             // 0x81 undefined
        Some('\u{201A}'), // 0x82 Single low-9 quotation mark
        Some('\u{0192}'), // 0x83 Latin small letter f with hook
        Some('\u{201E}'), // 0x84 Double low-9 quotation mark
        Some('\u{2026}'), // 0x85 Horizontal ellipsis
        Some('\u{2020}'), // 0x86 Dagger
        Some('\u{2021}'), // 0x87 Double dagger
        Some('\u{02C6}'), // 0x88 Modifier letter circumflex accent
        Some('\u{2030}'), // 0x89 Per mille sign
        Some('\u{0160}'), // 0x8A Latin capital letter S with caron
        Some('\u{2039}'), // 0x8B Single left-pointing angle quotation mark
        Some('\u{0152}'), // 0x8C Latin capital ligature OE
        None,             // 0x8D undefined
        Some('\u{017D}'), // 0x8E Latin capital letter Z with caron
        None,             // 0x8F undefined
        None,             // 0x90 undefined
        Some('\u{2018}'), // 0x91 Left single quotation mark
        Some('\u{2019}'), // 0x92 Right single quotation mark
        Some('\u{201C}'), // 0x93 Left double quotation mark
        Some('\u{201D}'), // 0x94 Right double quotation mark
        Some('\u{2022}'), // 0x95 Bullet
        Some('\u{2013}'), // 0x96 En dash
        Some('\u{2014}'), // 0x97 Em dash
        Some('\u{02DC}'), // 0x98 Small tilde
        Some('\u{2122}'), // 0x99 Trade mark sign
        Some('\u{0161}'), // 0x9A Latin small letter s with caron
        Some('\u{203A}'), // 0x9B Single right-pointing angle quotation mark
        Some('\u{0153}'), // 0x9C Latin small ligature oe
        None,             // 0x9D undefined
        Some('\u{017E}'), // 0x9E Latin small letter z with caron
        Some('\u{0178}'), // 0x9F Latin capital letter Y with diaeresis
    ];

    let mut table = [None; 256];

    // 0x00–0x7E: identity mapping. `for` is unavailable in const evaluation,
    // hence the explicit `while` loops.
    let mut code = 0x00;
    while code <= 0x7E {
        table[code] = Some(code as u8 as char);
        code += 1;
    }

    // 0x7F (DELETE) stays `None`.

    // 0x80–0x9F: Windows-1252 extensions.
    let mut offset = 0;
    while offset < CP1252_EXTENSIONS.len() {
        table[0x80 + offset] = CP1252_EXTENSIONS[offset];
        offset += 1;
    }

    // 0xA0–0xFF: identity mapping onto U+00A0–U+00FF.
    let mut code = 0xA0;
    while code <= 0xFF {
        table[code] = Some(code as u8 as char);
        code += 1;
    }

    table
};
475
/// MacRomanEncoding lookup table, indexed by byte code (classic Macintosh
/// character set).
///
/// Layout:
/// - 0x00–0x7E decode to the Unicode code point with the same value (ASCII).
/// - 0x7F has no mapping.
/// - 0x80–0xFF hold the Macintosh extended characters; every code in this
///   range is mapped.
///
/// NOTE(review): 0xDB decodes to the Euro sign (U+20AC) here, as in later
/// Mac OS Roman; the PDF specification's MacRomanEncoding appears to list
/// `currency` (U+00A4) at that code — confirm which is intended.
static MAC_ROMAN_TABLE: [Option<char>; 256] = {
    // Characters for codes 0x80–0xFF, four codes per row, in code order.
    const EXTENDED: [char; 128] = [
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', // 0x80 Ä Å Ç É
        '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}', // 0x84 Ñ Ö Ü á
        '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', // 0x88 à â ä ã
        '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}', // 0x8C å ç é è
        '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', // 0x90 ê ë í ì
        '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}', // 0x94 î ï ñ ó
        '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', // 0x98 ò ô ö õ
        '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}', // 0x9C ú ù û ü
        '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', // 0xA0 † ° ¢ £
        '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}', // 0xA4 § • ¶ ß
        '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', // 0xA8 ® © ™ ´
        '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}', // 0xAC ¨ ≠ Æ Ø
        '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', // 0xB0 ∞ ± ≤ ≥
        '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}', // 0xB4 ¥ µ ∂ ∑
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', // 0xB8 ∏ π ∫ ª
        '\u{00BA}', '\u{2126}', '\u{00E6}', '\u{00F8}', // 0xBC º Ω æ ø
        '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', // 0xC0 ¿ ¡ ¬ √
        '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}', // 0xC4 ƒ ≈ ∆ «
        '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', // 0xC8 » … NBSP À
        '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}', // 0xCC Ã Õ Œ œ
        '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', // 0xD0 en dash, em dash, left/right double quote
        '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}', // 0xD4 left/right single quote, ÷ ◊
        '\u{00FF}', '\u{0178}', '\u{2044}', '\u{20AC}', // 0xD8 ÿ Ÿ ⁄ €
        '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}', // 0xDC ‹ › fi fl
        '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', // 0xE0 ‡ · ‚ „
        '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}', // 0xE4 ‰ Â Ê Á
        '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', // 0xE8 Ë È Í Î
        '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}', // 0xEC Ï Ì Ó Ô
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', // 0xF0 Apple logo (private use), Ò Ú Û
        '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}', // 0xF4 Ù ı (dotless i) ˆ ˜
        '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', // 0xF8 ¯ ˘ ˙ ˚
        '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}', // 0xFC ¸ ˝ ˛ ˇ
    ];

    let mut table = [None; 256];

    // 0x00–0x7E: identity mapping. `for` is unavailable in const evaluation,
    // hence the explicit `while` loops.
    let mut code = 0x00;
    while code <= 0x7E {
        table[code] = Some(code as u8 as char);
        code += 1;
    }

    // 0x7F (DELETE) stays `None`.

    // 0x80–0xFF: MacRoman extended characters.
    let mut offset = 0;
    while offset < EXTENDED.len() {
        table[0x80 + offset] = Some(EXTENDED[offset]);
        offset += 1;
    }

    table
};
739
740/// MacExpertEncoding — Mac expert character set (small caps, fractions, etc.).
741///
742/// This encoding is used for expert glyph sets in Type1 fonts. Many positions
743/// contain special typographic characters like small caps, fractions, and
744/// old-style numerals. Undefined positions are None.
745static MAC_EXPERT_TABLE: [Option<char>; 256] = {
746    let mut t = [None; 256];
747    t[0x20] = Some(' '); // space
748    // Expert glyphs — small caps, fractions, special characters
749    t[0x21] = Some('\u{F721}'); // exclamsmall
750    t[0x22] = Some('\u{F6C9}'); // Hungarumlautsmall
751    t[0x23] = Some('\u{F7A2}'); // centoldstyle
752    t[0x24] = Some('\u{F724}'); // dollaroldstyle
753    t[0x25] = Some('\u{F6DC}'); // dollarsuperior
754    t[0x26] = Some('\u{F726}'); // ampersandsmall
755    t[0x27] = Some('\u{F7B4}'); // Acutesmall
756    t[0x28] = Some('\u{207D}'); // parenleftsuperior
757    t[0x29] = Some('\u{207E}'); // parenrightsuperior
758    t[0x2A] = Some('\u{2025}'); // twodotenleader
759    t[0x2B] = Some('\u{2024}'); // onedotenleader
760    t[0x2C] = Some(','); // comma
761    t[0x2D] = Some('\u{002D}'); // hyphen
762    t[0x2E] = Some('.'); // period
763    t[0x2F] = Some('\u{2044}'); // fraction
764    t[0x30] = Some('\u{F730}'); // zerooldstyle
765    t[0x31] = Some('\u{F731}'); // oneoldstyle
766    t[0x32] = Some('\u{F732}'); // twooldstyle
767    t[0x33] = Some('\u{F733}'); // threeoldstyle
768    t[0x34] = Some('\u{F734}'); // fouroldstyle
769    t[0x35] = Some('\u{F735}'); // fiveoldstyle
770    t[0x36] = Some('\u{F736}'); // sixoldstyle
771    t[0x37] = Some('\u{F737}'); // sevenoldstyle
772    t[0x38] = Some('\u{F738}'); // eightoldstyle
773    t[0x39] = Some('\u{F739}'); // nineoldstyle
774    t[0x3A] = Some(':'); // colon
775    t[0x3B] = Some(';'); // semicolon
776    // 0x3C–0x3E undefined
777    t[0x3F] = Some('\u{F73F}'); // questionsmall
778    // 0x40 undefined
779    // 0x41–0x5A: undefined
780    // 0x5B–0x60: various
781    t[0x5B] = Some('\u{F6E2}'); // commainferior
782    // 0x5C undefined
783    // 0x5D undefined
784    // 0x5E undefined
785    // 0x5F undefined
786    // 0x60 undefined
787    t[0x61] = Some('\u{F6F1}'); // Asmall
788    t[0x62] = Some('\u{F6F2}'); // Bsmall
789    // Small caps A-Z (using Adobe PUA range as per PDF spec)
790    t[0x63] = Some('\u{F7A3}'); // Csmall
791    t[0x64] = Some('\u{F6F4}'); // Dsmall
792    t[0x65] = Some('\u{F6F5}'); // Esmall
793    t[0x66] = Some('\u{F6F6}'); // Fsmall
794    t[0x67] = Some('\u{F6F7}'); // Gsmall
795    t[0x68] = Some('\u{F6F8}'); // Hsmall
796    t[0x69] = Some('\u{F6E3}'); // Ismall
797    t[0x6A] = Some('\u{F6FA}'); // Jsmall
798    t[0x6B] = Some('\u{F6FB}'); // Ksmall
799    t[0x6C] = Some('\u{F6FC}'); // Lsmall
800    t[0x6D] = Some('\u{F6FD}'); // Msmall
801    t[0x6E] = Some('\u{F6FE}'); // Nsmall
802    t[0x6F] = Some('\u{F6FF}'); // Osmall
803    t[0x70] = Some('\u{F700}'); // Psmall
804    t[0x71] = Some('\u{F701}'); // Qsmall
805    t[0x72] = Some('\u{F702}'); // Rsmall
806    t[0x73] = Some('\u{F703}'); // Ssmall
807    t[0x74] = Some('\u{F704}'); // Tsmall
808    t[0x75] = Some('\u{F705}'); // Usmall
809    t[0x76] = Some('\u{F706}'); // Vsmall
810    t[0x77] = Some('\u{F707}'); // Wsmall
811    t[0x78] = Some('\u{F708}'); // Xsmall
812    t[0x79] = Some('\u{F709}'); // Ysmall
813    t[0x7A] = Some('\u{F70A}'); // Zsmall
814    t[0x7B] = Some('\u{20A1}'); // colonmonetary
815    t[0x7C] = Some('\u{F6DC}'); // onefitted
816    t[0x7D] = Some('\u{F6DD}'); // rupiah
817    t[0x7E] = Some('\u{F6DE}'); // Tildesmall
818    // 0x7F undefined
819    // 0x80–0x86 undefined
820    t[0x87] = Some('\u{F6E4}'); // exclamdownsmall
821    t[0x88] = Some('\u{F7A8}'); // Dieresissmall
822    // 0x89 undefined
823    t[0x8A] = Some('\u{F6E5}'); // centinferior
824    t[0x8B] = Some('\u{F6E6}'); // Lslashsmall
825    // 0x8C undefined
826    t[0x8D] = Some('\u{F7AF}'); // Macronsmall
827    // 0x8E–0x8F undefined
828    t[0x90] = Some('\u{F6E7}'); // Scaronsmall
829    // 0x91–0x92 undefined
830    t[0x93] = Some('\u{F6E8}'); // Zcaronsmall
831    // 0x94 undefined
832    t[0x95] = Some('\u{F6EA}'); // Dieresissmall (alternate)
833    t[0x96] = Some('\u{F7B8}'); // Cedillasmall
834    // 0x97–0x99 undefined
835    t[0x9A] = Some('\u{F6EB}'); // OEsmall
836    t[0x9B] = Some('\u{F6EC}'); // figuredash
837    // 0x9C undefined
838    t[0x9D] = Some('\u{F6ED}'); // habornarrowi (variant)
839    // 0x9E–0x9F undefined
840    t[0xA0] = Some('\u{F6EE}'); // spacehackcyrillic (variant)
841    t[0xA1] = Some('\u{F6EF}'); // Agravesmall
842    t[0xA2] = Some('\u{F6F0}'); // Aacutesmall
843    t[0xA3] = Some('\u{F7A3}'); // Acircumflexsmall
844    t[0xA4] = Some('\u{F7A4}'); // Atildesmall
845    t[0xA5] = Some('\u{F7A5}'); // Adieresissmall
846    t[0xA6] = Some('\u{F7A6}'); // Aringsmall
847    t[0xA7] = Some('\u{F7A7}'); // AEsmall
848    t[0xA8] = Some('\u{F7A9}'); // Ccedillasmall
849    t[0xA9] = Some('\u{F7AA}'); // Egravesmall
850    t[0xAA] = Some('\u{F7AB}'); // Eacutesmall
851    t[0xAB] = Some('\u{F7AC}'); // Ecircumflexsmall
852    t[0xAC] = Some('\u{F7AD}'); // Edieresissmall
853    t[0xAD] = Some('\u{F7AE}'); // Igravesmall
854    t[0xAE] = Some('\u{F7AF}'); // Iacutesmall
855    t[0xAF] = Some('\u{F7B0}'); // Icircumflexsmall
856    t[0xB0] = Some('\u{F7B1}'); // Idieresissmall
857    t[0xB1] = Some('\u{F7B2}'); // Ethsmall
858    t[0xB2] = Some('\u{F7B3}'); // Ntildesmall
859    t[0xB3] = Some('\u{F7B4}'); // Ogravesmall
860    t[0xB4] = Some('\u{F7B5}'); // Oacutesmall
861    t[0xB5] = Some('\u{F7B6}'); // Ocircumflexsmall
862    t[0xB6] = Some('\u{F7B7}'); // Otildesmall
863    t[0xB7] = Some('\u{F7B8}'); // Odieresissmall
864    t[0xB8] = Some('\u{F7B9}'); // OEsmall (alternate)
865    t[0xB9] = Some('\u{F7BA}'); // Oslashsmall
866    t[0xBA] = Some('\u{F7BB}'); // Ugravesmall
867    t[0xBB] = Some('\u{F7BC}'); // Uacutesmall
868    t[0xBC] = Some('\u{F7BD}'); // Ucircumflexsmall
869    t[0xBD] = Some('\u{F7BE}'); // Udieresissmall
870    t[0xBE] = Some('\u{F7BF}'); // Yacutesmall
871    t[0xBF] = Some('\u{F7C0}'); // Thornsmall
872    t[0xC0] = Some('\u{F7C1}'); // Ydieresissmall
873    t[0xC1] = Some('\u{2153}'); // onethird
874    t[0xC2] = Some('\u{2154}'); // twothirds
875    // 0xC3 undefined
876    t[0xC4] = Some('\u{215B}'); // oneeighth
877    t[0xC5] = Some('\u{215C}'); // threeeighths
878    t[0xC6] = Some('\u{215D}'); // fiveeighths
879    t[0xC7] = Some('\u{215E}'); // seveneighths
880    t[0xC8] = Some('\u{2070}'); // zerosuperior
881    // 0xC9 undefined
882    t[0xCA] = Some('\u{F6F3}'); // foursuperior
883    // 0xCB–0xCC undefined
884    t[0xCD] = Some('\u{2074}'); // foursuperior (standard)
885    // 0xCE undefined
886    t[0xCF] = Some('\u{2075}'); // fivesuperior
887    // 0xD0 undefined
888    t[0xD1] = Some('\u{2076}'); // sixsuperior
889    t[0xD2] = Some('\u{2077}'); // sevensuperior
890    t[0xD3] = Some('\u{2078}'); // eightsuperior
891    t[0xD4] = Some('\u{2079}'); // ninesuperior
892    t[0xD5] = Some('\u{2080}'); // zeroinferior
893    t[0xD6] = Some('\u{2081}'); // oneinferior
894    t[0xD7] = Some('\u{2082}'); // twoinferior
895    t[0xD8] = Some('\u{2083}'); // threeinferior
896    t[0xD9] = Some('\u{2084}'); // fourinferior
897    t[0xDA] = Some('\u{2085}'); // fiveinferior
898    t[0xDB] = Some('\u{2086}'); // sixinferior
899    t[0xDC] = Some('\u{2087}'); // seveninferior
900    t[0xDD] = Some('\u{2088}'); // eightinferior
901    t[0xDE] = Some('\u{2089}'); // nineinferior
902    // 0xDF undefined
903    t[0xE0] = Some('\u{2215}'); // centinferior (variant) / division slash
904    t[0xE1] = Some('\u{F6F4}'); // ff ligature (PUA)
905    t[0xE2] = Some('\u{F6F5}'); // fi ligature (PUA)
906    t[0xE3] = Some('\u{F6F6}'); // fl ligature (PUA)
907    t[0xE4] = Some('\u{F6F7}'); // ffi ligature (PUA)
908    t[0xE5] = Some('\u{F6F8}'); // ffl ligature (PUA)
909    t[0xE6] = Some('\u{F7E6}'); // parenleftinferior
910    // 0xE7 undefined
911    t[0xE8] = Some('\u{F7E8}'); // parenrightinferior
912    // 0xE9 undefined
913    t[0xEA] = Some('\u{F6F9}'); // Circumflexsmall
914    t[0xEB] = Some('\u{F6FA}'); // habornarrow
915    // 0xEC undefined
916    t[0xED] = Some('\u{F6E9}'); // colonmonetary
917    // 0xEE–0xEF undefined
918    t[0xF0] = Some('\u{F7F0}'); // Gravesmall
919    t[0xF1] = Some('\u{00BC}'); // onequarter
920    t[0xF2] = Some('\u{00BD}'); // onehalf
921    t[0xF3] = Some('\u{00BE}'); // threequarters
922    t[0xF4] = Some('\u{215F}'); // fraction1 (variant)
923    // 0xF5–0xF7 undefined
924    t[0xF8] = Some('\u{F6FA}'); // Commasmall (variant)
925    // 0xF9–0xFF undefined
926    t
927};
928
/// StandardEncoding — Adobe standard Latin character encoding (PDF Reference Table D.1).
///
/// This is the default encoding for Type1 fonts when no explicit /Encoding is specified.
///
/// The table is indexed by byte code. `None` marks a code with no glyph
/// assigned: 0x00–0x1F, 0x7F–0xA0, and the individual gaps noted inline
/// in the 0xB0–0xFF range.
static STANDARD_TABLE: [Option<char>; 256] = {
    let mut t = [None; 256];

    // 0x20–0x7E: identical to ASCII except for the two quote glyphs patched
    // right after the loop. (`while` because `for` is not usable in a static
    // initializer.)
    let mut code = 0x20usize;
    while code <= 0x7E {
        t[code] = Some(code as u8 as char);
        code += 1;
    }
    t[0x27] = Some('\u{2019}'); // quoteright (right single quote, NOT ASCII apostrophe)
    t[0x60] = Some('\u{2018}'); // quoteleft (left single quote, NOT ASCII grave)

    // 0x7F–0xA0: undefined in StandardEncoding

    // 0xA1–0xAF: punctuation, currency and the fi/fl ligatures
    t[0xA1] = Some('\u{00A1}'); // exclamdown
    t[0xA2] = Some('\u{00A2}'); // cent
    t[0xA3] = Some('\u{00A3}'); // sterling
    t[0xA4] = Some('\u{2044}'); // fraction
    t[0xA5] = Some('\u{00A5}'); // yen
    t[0xA6] = Some('\u{0192}'); // florin
    t[0xA7] = Some('\u{00A7}'); // section
    t[0xA8] = Some('\u{00A4}'); // currency
    t[0xA9] = Some('\''); // quotesingle (ASCII apostrophe)
    t[0xAA] = Some('\u{201C}'); // quotedblleft
    t[0xAB] = Some('\u{00AB}'); // guillemotleft
    t[0xAC] = Some('\u{2039}'); // guilsinglleft
    t[0xAD] = Some('\u{203A}'); // guilsinglright
    t[0xAE] = Some('\u{FB01}'); // fi
    t[0xAF] = Some('\u{FB02}'); // fl

    // 0xB0–0xBF (0xB0, 0xB5, 0xBE undefined)
    t[0xB1] = Some('\u{2013}'); // endash
    t[0xB2] = Some('\u{2020}'); // dagger
    t[0xB3] = Some('\u{2021}'); // daggerdbl
    t[0xB4] = Some('\u{00B7}'); // periodcentered
    t[0xB6] = Some('\u{00B6}'); // paragraph
    t[0xB7] = Some('\u{2022}'); // bullet
    t[0xB8] = Some('\u{201A}'); // quotesinglbase
    t[0xB9] = Some('\u{201E}'); // quotedblbase
    t[0xBA] = Some('\u{201D}'); // quotedblright
    t[0xBB] = Some('\u{00BB}'); // guillemotright
    t[0xBC] = Some('\u{2026}'); // ellipsis
    t[0xBD] = Some('\u{2030}'); // perthousand
    t[0xBF] = Some('\u{00BF}'); // questiondown

    // 0xC0–0xCF: spacing accents (0xC0, 0xC9, 0xCC undefined)
    t[0xC1] = Some('\u{0060}'); // grave
    t[0xC2] = Some('\u{00B4}'); // acute
    t[0xC3] = Some('\u{02C6}'); // circumflex
    t[0xC4] = Some('\u{02DC}'); // tilde
    t[0xC5] = Some('\u{00AF}'); // macron
    t[0xC6] = Some('\u{02D8}'); // breve
    t[0xC7] = Some('\u{02D9}'); // dotaccent
    t[0xC8] = Some('\u{00A8}'); // dieresis
    t[0xCA] = Some('\u{02DA}'); // ring
    t[0xCB] = Some('\u{00B8}'); // cedilla
    t[0xCD] = Some('\u{02DD}'); // hungarumlaut
    t[0xCE] = Some('\u{02DB}'); // ogonek
    t[0xCF] = Some('\u{02C7}'); // caron

    // 0xD0–0xDF: only the em dash is defined
    t[0xD0] = Some('\u{2014}'); // emdash

    // 0xE0–0xEF (0xE0, 0xE2, 0xE4–0xE7, 0xEC–0xEF undefined)
    t[0xE1] = Some('\u{00C6}'); // AE
    t[0xE3] = Some('\u{00AA}'); // ordfeminine
    t[0xE8] = Some('\u{0141}'); // Lslash
    t[0xE9] = Some('\u{00D8}'); // Oslash
    t[0xEA] = Some('\u{0152}'); // OE
    t[0xEB] = Some('\u{00BA}'); // ordmasculine

    // 0xF0–0xFF (0xF0, 0xF2–0xF4, 0xF6–0xF7, 0xFC–0xFF undefined)
    t[0xF1] = Some('\u{00E6}'); // ae
    t[0xF5] = Some('\u{0131}'); // dotlessi
    t[0xF8] = Some('\u{0142}'); // lslash
    t[0xF9] = Some('\u{00F8}'); // oslash
    t[0xFA] = Some('\u{0153}'); // oe
    t[0xFB] = Some('\u{00DF}'); // germandbls

    t
};
1107
#[cfg(test)]
mod tests {
    // Unit tests for the four standard encoding tables, for `FontEncoding`
    // (Differences handling) and for `EncodingResolver` (lookup priority).
    use super::*;

    /// Check that `enc` decodes each listed byte code to the paired value.
    ///
    /// The failure message names the encoding and the offending code, so a
    /// broken table entry can be located without counting assertion lines.
    fn assert_decodes(enc: StandardEncoding, expected: &[(u8, Option<char>)]) {
        for &(code, want) in expected {
            assert_eq!(
                enc.decode(code),
                want,
                "{enc:?} decoded 0x{code:02X} incorrectly"
            );
        }
    }

    // =========================================================================
    // WinAnsiEncoding tests
    // =========================================================================

    /// Printable ASCII codes decode to themselves.
    #[test]
    fn win_ansi_ascii_printable() {
        assert_decodes(
            StandardEncoding::WinAnsi,
            &[
                (0x20, Some(' ')),
                (0x41, Some('A')),
                (0x5A, Some('Z')),
                (0x61, Some('a')),
                (0x7A, Some('z')),
                (0x30, Some('0')),
                (0x39, Some('9')),
            ],
        );
    }

    /// Windows-1252 extensions in 0x80–0x9F.
    #[test]
    fn win_ansi_extended_characters() {
        assert_decodes(
            StandardEncoding::WinAnsi,
            &[
                (0x80, Some('\u{20AC}')), // Euro
                (0x85, Some('\u{2026}')), // Ellipsis
                (0x93, Some('\u{201C}')), // Left double quote
                (0x94, Some('\u{201D}')), // Right double quote
                (0x96, Some('\u{2013}')), // En dash
                (0x97, Some('\u{2014}')), // Em dash
                (0x99, Some('\u{2122}')), // Trademark
            ],
        );
    }

    /// Codes with no WinAnsi assignment decode to `None`.
    #[test]
    fn win_ansi_undefined_codes() {
        assert_decodes(
            StandardEncoding::WinAnsi,
            &[
                (0x81, None),
                (0x8D, None),
                (0x8F, None),
                (0x90, None),
                (0x9D, None),
            ],
        );
    }

    /// ISO 8859-1 upper half.
    #[test]
    fn win_ansi_latin_extended() {
        assert_decodes(
            StandardEncoding::WinAnsi,
            &[
                (0xC0, Some('\u{00C0}')), // À
                (0xC9, Some('\u{00C9}')), // É
                (0xE9, Some('\u{00E9}')), // é
                (0xF1, Some('\u{00F1}')), // ñ
                (0xFC, Some('\u{00FC}')), // ü
                (0xFF, Some('\u{00FF}')), // ÿ
            ],
        );
    }

    #[test]
    fn win_ansi_decode_bytes() {
        // "Hello" in ASCII/WinAnsi
        let decoded = StandardEncoding::WinAnsi.decode_bytes(&[0x48, 0x65, 0x6C, 0x6C, 0x6F]);
        assert_eq!(decoded, "Hello");
    }

    #[test]
    fn win_ansi_decode_bytes_with_extended() {
        // "café" — 'é' is 0xE9 in WinAnsi
        let decoded = StandardEncoding::WinAnsi.decode_bytes(&[0x63, 0x61, 0x66, 0xE9]);
        assert_eq!(decoded, "caf\u{00E9}");
    }

    #[test]
    fn win_ansi_decode_bytes_with_undefined() {
        // Byte 0x81 is undefined, should become U+FFFD
        let decoded = StandardEncoding::WinAnsi.decode_bytes(&[0x41, 0x81, 0x42]);
        assert_eq!(decoded, "A\u{FFFD}B");
    }

    // =========================================================================
    // MacRomanEncoding tests
    // =========================================================================

    #[test]
    fn mac_roman_ascii_printable() {
        assert_decodes(
            StandardEncoding::MacRoman,
            &[(0x41, Some('A')), (0x61, Some('a')), (0x20, Some(' '))],
        );
    }

    #[test]
    fn mac_roman_extended_characters() {
        assert_decodes(
            StandardEncoding::MacRoman,
            &[
                (0x80, Some('\u{00C4}')), // Ä
                (0x83, Some('\u{00C9}')), // É
                (0x84, Some('\u{00D1}')), // Ñ
                (0x87, Some('\u{00E1}')), // á
                (0x8E, Some('\u{00E9}')), // é
                (0x96, Some('\u{00F1}')), // ñ
                (0xCA, Some('\u{00A0}')), // non-breaking space
                (0xD2, Some('\u{201C}')), // left double quote
                (0xD3, Some('\u{201D}')), // right double quote
                (0xDB, Some('\u{20AC}')), // €
            ],
        );
    }

    #[test]
    fn mac_roman_special_symbols() {
        assert_decodes(
            StandardEncoding::MacRoman,
            &[
                (0xA5, Some('\u{2022}')), // Bullet
                (0xB0, Some('\u{221E}')), // Infinity
                (0xB9, Some('\u{03C0}')), // Pi
                (0xC5, Some('\u{2248}')), // Almost equal
                (0xDE, Some('\u{FB01}')), // fi ligature
                (0xDF, Some('\u{FB02}')), // fl ligature
            ],
        );
    }

    #[test]
    fn mac_roman_decode_bytes() {
        // "Ñ" is 0x84 in MacRoman
        let decoded = StandardEncoding::MacRoman.decode_bytes(&[0x84]);
        assert_eq!(decoded, "\u{00D1}");
    }

    // =========================================================================
    // MacExpertEncoding tests
    // =========================================================================

    #[test]
    fn mac_expert_fractions() {
        assert_decodes(
            StandardEncoding::MacExpert,
            &[
                (0xF1, Some('\u{00BC}')), // ¼
                (0xF2, Some('\u{00BD}')), // ½
                (0xF3, Some('\u{00BE}')), // ¾
                (0xC1, Some('\u{2153}')), // ⅓
                (0xC2, Some('\u{2154}')), // ⅔
            ],
        );
    }

    #[test]
    fn mac_expert_superscripts_subscripts() {
        assert_decodes(
            StandardEncoding::MacExpert,
            &[
                // Superscripts
                (0x28, Some('\u{207D}')), // parenleftsuperior
                (0x29, Some('\u{207E}')), // parenrightsuperior
                // Subscripts/Inferiors
                (0xD5, Some('\u{2080}')), // zeroinferior
                (0xD6, Some('\u{2081}')), // oneinferior
                (0xDE, Some('\u{2089}')), // nineinferior
            ],
        );
    }

    #[test]
    fn mac_expert_space_and_basic() {
        assert_decodes(
            StandardEncoding::MacExpert,
            &[
                (0x20, Some(' ')),
                (0x2C, Some(',')),
                (0x2E, Some('.')),
                (0x2F, Some('\u{2044}')), // fraction slash
            ],
        );
    }

    #[test]
    fn mac_expert_undefined_codes() {
        // Low codes should be undefined (except 0x20)
        assert_decodes(
            StandardEncoding::MacExpert,
            &[(0x00, None), (0x01, None), (0x10, None)],
        );
    }

    // =========================================================================
    // StandardEncoding tests
    // =========================================================================

    #[test]
    fn standard_ascii_letters() {
        assert_decodes(
            StandardEncoding::Standard,
            &[
                (0x41, Some('A')),
                (0x5A, Some('Z')),
                (0x61, Some('a')),
                (0x7A, Some('z')),
            ],
        );
    }

    /// Key differences from ASCII in the 0x20–0x7E range.
    #[test]
    fn standard_differs_from_ascii() {
        assert_decodes(
            StandardEncoding::Standard,
            &[
                // 0x27 is quoteright (U+2019), not ASCII apostrophe
                (0x27, Some('\u{2019}')),
                // 0x60 is quoteleft (U+2018), not ASCII grave accent
                (0x60, Some('\u{2018}')),
            ],
        );
    }

    #[test]
    fn standard_extended_characters() {
        assert_decodes(
            StandardEncoding::Standard,
            &[
                (0xA1, Some('\u{00A1}')), // ¡
                (0xA4, Some('\u{2044}')), // fraction
                (0xAE, Some('\u{FB01}')), // fi
                (0xAF, Some('\u{FB02}')), // fl
                (0xB1, Some('\u{2013}')), // endash
                (0xD0, Some('\u{2014}')), // emdash
                (0xE1, Some('\u{00C6}')), // AE
                (0xF1, Some('\u{00E6}')), // ae
                (0xFA, Some('\u{0153}')), // oe
                (0xFB, Some('\u{00DF}')), // germandbls
            ],
        );
    }

    #[test]
    fn standard_undefined_ranges() {
        let enc = StandardEncoding::Standard;
        // 0x80–0x9F are all undefined in StandardEncoding
        for code in 0x80..=0x9F {
            assert_eq!(enc.decode(code), None, "code 0x{code:02X} should be None");
        }
    }

    #[test]
    fn standard_diacritics() {
        assert_decodes(
            StandardEncoding::Standard,
            &[
                (0xC1, Some('\u{0060}')), // grave
                (0xC2, Some('\u{00B4}')), // acute
                (0xC3, Some('\u{02C6}')), // circumflex
                (0xC4, Some('\u{02DC}')), // tilde
                (0xCA, Some('\u{02DA}')), // ring
                (0xCF, Some('\u{02C7}')), // caron
            ],
        );
    }

    // =========================================================================
    // FontEncoding tests (Differences array)
    // =========================================================================

    #[test]
    fn font_encoding_from_standard() {
        let enc = FontEncoding::from_standard(StandardEncoding::WinAnsi);
        assert_eq!(enc.decode(0x41), Some('A'));
        assert_eq!(enc.decode(0x80), Some('\u{20AC}'));
    }

    #[test]
    fn font_encoding_differences_override() {
        // Create WinAnsi encoding with differences that override some positions
        let differences = vec![
            (0x41, '\u{0391}'), // Override 'A' (0x41) with Greek Alpha
            (0x42, '\u{0392}'), // Override 'B' (0x42) with Greek Beta
        ];
        let enc =
            FontEncoding::from_standard_with_differences(StandardEncoding::WinAnsi, &differences);

        // Overridden positions
        assert_eq!(enc.decode(0x41), Some('\u{0391}')); // Alpha instead of A
        assert_eq!(enc.decode(0x42), Some('\u{0392}')); // Beta instead of B

        // Non-overridden positions remain unchanged
        assert_eq!(enc.decode(0x43), Some('C'));
        assert_eq!(enc.decode(0x80), Some('\u{20AC}')); // Euro unchanged
    }

    #[test]
    fn font_encoding_differences_fill_undefined() {
        // Fill in a previously undefined code
        let differences = vec![
            (0x81, '\u{2603}'), // Fill undefined 0x81 with snowman
        ];
        let enc =
            FontEncoding::from_standard_with_differences(StandardEncoding::WinAnsi, &differences);

        assert_eq!(enc.decode(0x81), Some('\u{2603}')); // Was None, now snowman
    }

    #[test]
    fn font_encoding_apply_differences_incrementally() {
        let mut enc = FontEncoding::from_standard(StandardEncoding::Standard);
        assert_eq!(enc.decode(0x27), Some('\u{2019}')); // quoteright

        // Apply differences
        enc.apply_differences(&[(0x27, '\'')]); // Override to ASCII apostrophe
        assert_eq!(enc.decode(0x27), Some('\'')); // Now ASCII apostrophe
    }

    #[test]
    fn font_encoding_decode_bytes() {
        let differences = vec![(0xE9, '\u{00E9}')]; // é at 0xE9
        let enc =
            FontEncoding::from_standard_with_differences(StandardEncoding::Standard, &differences);
        let result = enc.decode_bytes(&[0x63, 0x61, 0x66, 0xE9]); // c, a, f, é
        assert_eq!(result, "caf\u{00E9}");
    }

    #[test]
    fn font_encoding_from_custom_table() {
        let mut table = [None; 256];
        table[0x41] = Some('X');
        table[0x42] = Some('Y');
        let enc = FontEncoding::from_table(table);
        assert_eq!(enc.decode(0x41), Some('X'));
        assert_eq!(enc.decode(0x42), Some('Y'));
        assert_eq!(enc.decode(0x43), None);
    }

    // =========================================================================
    // EncodingResolver tests
    // =========================================================================

    #[test]
    fn resolver_default_only() {
        let resolver =
            EncodingResolver::new(FontEncoding::from_standard(StandardEncoding::WinAnsi));

        assert_eq!(resolver.resolve(0x41), Some("A".to_string()));
        assert_eq!(resolver.resolve(0x80), Some("\u{20AC}".to_string()));
        assert_eq!(resolver.resolve(0x81), None); // Undefined
    }

    #[test]
    fn resolver_font_encoding_over_default() {
        let default_enc = FontEncoding::from_standard(StandardEncoding::Standard);
        let font_enc = FontEncoding::from_standard(StandardEncoding::WinAnsi);

        let resolver = EncodingResolver::new(default_enc).with_font_encoding(font_enc);

        // WinAnsi 0x27 = ASCII apostrophe, Standard 0x27 = quoteright
        // Font encoding should win over default
        assert_eq!(resolver.resolve(0x27), Some("'".to_string()));
    }

    #[test]
    fn resolver_to_unicode_highest_priority() {
        let mut to_unicode = HashMap::new();
        to_unicode.insert(0x41, "X".to_string()); // Override 'A' → 'X'

        let resolver =
            EncodingResolver::new(FontEncoding::from_standard(StandardEncoding::WinAnsi))
                .with_to_unicode(to_unicode);

        // ToUnicode overrides everything
        assert_eq!(resolver.resolve(0x41), Some("X".to_string()));
        // But non-overridden codes still fall through
        assert_eq!(resolver.resolve(0x42), Some("B".to_string()));
    }

    /// Exercises all three lookup levels plus the "no mapping anywhere" case.
    #[test]
    fn resolver_full_chain() {
        let mut to_unicode = HashMap::new();
        to_unicode.insert(0x01, "TOUNICODE".to_string());

        let mut font_table = [None; 256];
        font_table[0x02] = Some('F'); // Only in font encoding
        let font_enc = FontEncoding::from_table(font_table);

        let default_enc = FontEncoding::from_standard(StandardEncoding::WinAnsi);

        let resolver = EncodingResolver::new(default_enc)
            .with_font_encoding(font_enc)
            .with_to_unicode(to_unicode);

        // Level 1: ToUnicode
        assert_eq!(resolver.resolve(0x01), Some("TOUNICODE".to_string()));
        // Level 2: Font encoding
        assert_eq!(resolver.resolve(0x02), Some("F".to_string()));
        // Level 3: Default (WinAnsi)
        assert_eq!(resolver.resolve(0x41), Some("A".to_string()));
        // No mapping at any level
        assert_eq!(resolver.resolve(0x81), None); // WinAnsi undefined, no font/tounicode
    }

    #[test]
    fn resolver_to_unicode_multi_char() {
        let mut to_unicode = HashMap::new();
        // ToUnicode can map to multi-character strings (e.g., ligatures)
        to_unicode.insert(0xFB01, "fi".to_string());

        let resolver =
            EncodingResolver::new(FontEncoding::from_standard(StandardEncoding::WinAnsi))
                .with_to_unicode(to_unicode);

        // Multi-byte code
        assert_eq!(resolver.resolve(0xFB01), Some("fi".to_string()));
    }

    #[test]
    fn resolver_decode_bytes() {
        let mut to_unicode = HashMap::new();
        to_unicode.insert(0x41, "X".to_string()); // A → X

        let resolver =
            EncodingResolver::new(FontEncoding::from_standard(StandardEncoding::WinAnsi))
                .with_to_unicode(to_unicode);

        let result = resolver.decode_bytes(&[0x41, 0x42, 0x43]);
        assert_eq!(result, "XBC"); // A overridden to X, B and C from default
    }

    #[test]
    fn resolver_decode_bytes_with_undefined() {
        let resolver =
            EncodingResolver::new(FontEncoding::from_standard(StandardEncoding::WinAnsi));

        let result = resolver.decode_bytes(&[0x41, 0x81, 0x42]);
        assert_eq!(result, "A\u{FFFD}B"); // 0x81 is undefined in WinAnsi
    }

    // =========================================================================
    // Cross-encoding comparison tests
    // =========================================================================

    #[test]
    fn ascii_range_consistent_except_standard() {
        // WinAnsi and MacRoman should agree on 0x20–0x7E (ASCII printable)
        for code in 0x20..=0x7E_u8 {
            let win = StandardEncoding::WinAnsi.decode(code);
            let mac = StandardEncoding::MacRoman.decode(code);
            assert_eq!(win, mac, "WinAnsi and MacRoman disagree at 0x{code:02X}");
        }
    }

    #[test]
    fn standard_encoding_quote_marks_differ() {
        // StandardEncoding uses curly quotes where others use straight
        assert_eq!(StandardEncoding::Standard.decode(0x27), Some('\u{2019}')); // curly
        assert_eq!(StandardEncoding::WinAnsi.decode(0x27), Some('\'')); // straight
        assert_eq!(StandardEncoding::Standard.decode(0x60), Some('\u{2018}')); // curly
        assert_eq!(StandardEncoding::WinAnsi.decode(0x60), Some('`')); // straight
    }

    #[test]
    fn all_encodings_have_space() {
        assert_decodes(StandardEncoding::WinAnsi, &[(0x20, Some(' '))]);
        assert_decodes(StandardEncoding::MacRoman, &[(0x20, Some(' '))]);
        assert_decodes(StandardEncoding::MacExpert, &[(0x20, Some(' '))]);
        assert_decodes(StandardEncoding::Standard, &[(0x20, Some(' '))]);
    }
}
pdfplumber_core/encoding.rs

pdfplumber_core/
encoding.rs