//! Source file: `justpdf_core/font/encoding.rs`.

/// Named text encodings understood by this module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Encoding {
    /// Selected by the name `StandardEncoding`; also the fallback for unrecognised names.
    StandardEncoding,
    /// Selected by the name `MacRomanEncoding`.
    MacRomanEncoding,
    /// Selected by the name `WinAnsiEncoding`.
    WinAnsiEncoding,
    /// Selected by the name `PDFDocEncoding`.
    PDFDocEncoding,
    /// Identity (pass-through, for CID fonts); selected by `Identity-H` or `Identity-V`.
    Identity,
}

impl Encoding {
    /// Resolves an encoding name (raw bytes, compared exactly) to its variant.
    ///
    /// Any name that is not listed in the lookup table below yields
    /// [`Encoding::StandardEncoding`].
    pub fn from_name(name: &[u8]) -> Self {
        // Every recognised spelling paired with the variant it selects.
        const KNOWN: [(&[u8], Encoding); 6] = [
            (b"StandardEncoding" as &[u8], Encoding::StandardEncoding),
            (b"MacRomanEncoding" as &[u8], Encoding::MacRomanEncoding),
            (b"WinAnsiEncoding" as &[u8], Encoding::WinAnsiEncoding),
            (b"PDFDocEncoding" as &[u8], Encoding::PDFDocEncoding),
            (b"Identity-H" as &[u8], Encoding::Identity),
            (b"Identity-V" as &[u8], Encoding::Identity),
        ];

        for &(candidate, variant) in KNOWN.iter() {
            if candidate == name {
                return variant;
            }
        }
        Self::StandardEncoding
    }
}
24
25/// Decode a PDF byte string to a Unicode string using the given encoding.
26pub fn decode_text(bytes: &[u8], encoding: Encoding) -> String {
27    // Check for UTF-16BE BOM
28    if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
29        return decode_utf16be(&bytes[2..]);
30    }
31
32    // Check for UTF-8 BOM
33    if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
34        return String::from_utf8_lossy(&bytes[3..]).into_owned();
35    }
36
37    match encoding {
38        Encoding::WinAnsiEncoding => decode_winansi(bytes),
39        Encoding::MacRomanEncoding => decode_mac_roman(bytes),
40        Encoding::PDFDocEncoding => decode_pdfdoc(bytes),
41        Encoding::StandardEncoding => decode_winansi(bytes), // close enough for display
42        Encoding::Identity => {
43            // Try UTF-8 first
44            String::from_utf8_lossy(bytes).into_owned()
45        }
46    }
47}
48
/// Decode big-endian UTF-16 code units (without a BOM) into a `String`.
///
/// Surrogate pairs are combined into a single scalar value. A surrogate that
/// is not part of a valid pair is replaced with U+FFFD rather than being
/// dropped, so malformed input stays visible in the output (the same policy
/// `String::from_utf8_lossy` applies to invalid UTF-8). A trailing odd byte
/// cannot form a code unit and is ignored.
fn decode_utf16be(bytes: &[u8]) -> String {
    // `chunks_exact` leaves a dangling final byte in its remainder, which we discard.
    let code_units = bytes
        .chunks_exact(2)
        .map(|pair| u16::from_be_bytes([pair[0], pair[1]]));

    char::decode_utf16(code_units)
        .map(|decoded| decoded.unwrap_or(char::REPLACEMENT_CHARACTER))
        .collect()
}
75
76/// WinAnsi (Windows-1252) decoding.
77fn decode_winansi(bytes: &[u8]) -> String {
78    bytes
79        .iter()
80        .map(|&b| WINANSI_TO_UNICODE[b as usize])
81        .collect()
82}
83
/// Mac Roman decoding.
///
/// Bytes `0x00..=0x7F` map to the identical ASCII code point. Bytes
/// `0x80..=0xFF` are looked up in a dedicated Mac Roman table; they do not
/// share the Windows-1252 layout (for example `0x80` is "Ä" in Mac Roman but
/// "€" in Windows-1252).
fn decode_mac_roman(bytes: &[u8]) -> String {
    // Unicode values for Mac Roman codes 0x80..=0xFF, indexed by `code - 0x80`.
    //
    // 0xDB is the currency sign (U+00A4) as in PDF's MacRomanEncoding; later
    // Mac OS Roman revisions reassigned that code to the Euro sign.
    // The mathematical symbols (0xAD, 0xB0, 0xB2, 0xB3, 0xB6..=0xBA, 0xBD,
    // 0xC3, 0xC5, 0xC6, 0xD7) and 0xF0 follow the Mac OS Roman character set.
    // NOTE(review): PDF's MacRomanEncoding reportedly leaves those codes
    // unassigned; confirm that decoding them leniently is desired.
    const MAC_ROMAN_HIGH: [char; 128] = [
        // 0x80
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}',
        // 0x88
        '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}',
        // 0x90
        '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}',
        // 0x98
        '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}',
        // 0xA0
        '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}',
        // 0xA8
        '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}',
        // 0xB0
        '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
        // 0xB8
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}', '\u{00F8}',
        // 0xC0
        '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}',
        // 0xC8
        '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}',
        // 0xD0
        '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}',
        // 0xD8
        '\u{00FF}', '\u{0178}', '\u{2044}', '\u{00A4}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
        // 0xE0
        '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}',
        // 0xE8
        '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
        // 0xF0
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}',
        // 0xF8
        '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}',
    ];

    bytes
        .iter()
        .map(|&b| match b {
            // ASCII half: the byte value is the code point.
            0x00..=0x7F => char::from(b),
            // Upper half: table lookup (index is always within 0..128).
            _ => MAC_ROMAN_HIGH[usize::from(b - 0x80)],
        })
        .collect()
}
98
99/// PDFDocEncoding decoding.
100fn decode_pdfdoc(bytes: &[u8]) -> String {
101    bytes
102        .iter()
103        .map(|&b| PDFDOC_TO_UNICODE[b as usize])
104        .collect()
105}
106
/// Windows-1252 to Unicode mapping table, indexed by byte value.
///
/// Every code defaults to the Unicode code point with the same numeric value
/// (ASCII / Latin-1); the entries listed in `CP1252_OVERRIDES` then replace
/// that default within 0x80..=0x9F. Codes in that range that are not listed
/// (0x81, 0x8D, 0x8F, 0x90, 0x9D) keep the Latin-1 default.
static WINANSI_TO_UNICODE: [char; 256] = {
    // (byte, character) pairs where Windows-1252 departs from Latin-1.
    const CP1252_OVERRIDES: [(usize, char); 27] = [
        (0x80, '\u{20AC}'), // Euro sign
        (0x82, '\u{201A}'), // Single low-9 quotation mark
        (0x83, '\u{0192}'), // Latin small letter f with hook
        (0x84, '\u{201E}'), // Double low-9 quotation mark
        (0x85, '\u{2026}'), // Horizontal ellipsis
        (0x86, '\u{2020}'), // Dagger
        (0x87, '\u{2021}'), // Double dagger
        (0x88, '\u{02C6}'), // Modifier letter circumflex accent
        (0x89, '\u{2030}'), // Per mille sign
        (0x8A, '\u{0160}'), // Latin capital letter S with caron
        (0x8B, '\u{2039}'), // Single left-pointing angle quotation mark
        (0x8C, '\u{0152}'), // Latin capital ligature OE
        (0x8E, '\u{017D}'), // Latin capital letter Z with caron
        (0x91, '\u{2018}'), // Left single quotation mark
        (0x92, '\u{2019}'), // Right single quotation mark
        (0x93, '\u{201C}'), // Left double quotation mark
        (0x94, '\u{201D}'), // Right double quotation mark
        (0x95, '\u{2022}'), // Bullet
        (0x96, '\u{2013}'), // En dash
        (0x97, '\u{2014}'), // Em dash
        (0x98, '\u{02DC}'), // Small tilde
        (0x99, '\u{2122}'), // Trade mark sign
        (0x9A, '\u{0161}'), // Latin small letter s with caron
        (0x9B, '\u{203A}'), // Single right-pointing angle quotation mark
        (0x9C, '\u{0153}'), // Latin small ligature oe
        (0x9E, '\u{017E}'), // Latin small letter z with caron
        (0x9F, '\u{0178}'), // Latin capital letter Y with diaeresis
    ];

    let mut map = ['\0'; 256];

    // Identity fill: ASCII and Latin-1 share their numeric values with Unicode.
    let mut code = 0;
    while code < 256 {
        map[code] = code as u8 as char;
        code += 1;
    }

    // Apply the Windows-1252 specific assignments on top.
    let mut n = 0;
    while n < CP1252_OVERRIDES.len() {
        let (byte, ch) = CP1252_OVERRIDES[n];
        map[byte] = ch;
        n += 1;
    }

    map
};
149
/// PDFDocEncoding to Unicode mapping table, indexed by byte value.
///
/// PDFDocEncoding matches ASCII/Latin-1 for most codes, with these differences:
/// - `0x18..=0x1F` hold spacing accents (breve, caron, …) instead of controls;
/// - `0x80..=0x9E` hold punctuation, ligatures and a few Latin letters in an
///   order that is *not* the Windows-1252 order;
/// - `0xA0` is the Euro sign;
/// - `0x7F` and `0x9F` are undefined and map to U+FFFD.
///
/// `0xAD` is left as U+00AD (soft hyphen), i.e. its Latin-1 value.
static PDFDOC_TO_UNICODE: [char; 256] = {
    // Codes 0x18..=0x1F.
    const ACCENTS: [char; 8] = [
        '\u{02D8}', // 0x18 breve
        '\u{02C7}', // 0x19 caron
        '\u{02C6}', // 0x1A circumflex
        '\u{02D9}', // 0x1B dot accent
        '\u{02DD}', // 0x1C hungarumlaut
        '\u{02DB}', // 0x1D ogonek
        '\u{02DA}', // 0x1E ring
        '\u{02DC}', // 0x1F tilde
    ];
    // Codes 0x80..=0x9E.
    const SPECIALS: [char; 31] = [
        '\u{2022}', // 0x80 bullet
        '\u{2020}', // 0x81 dagger
        '\u{2021}', // 0x82 double dagger
        '\u{2026}', // 0x83 ellipsis
        '\u{2014}', // 0x84 em dash
        '\u{2013}', // 0x85 en dash
        '\u{0192}', // 0x86 florin
        '\u{2044}', // 0x87 fraction slash
        '\u{2039}', // 0x88 single left-pointing angle quotation mark
        '\u{203A}', // 0x89 single right-pointing angle quotation mark
        '\u{2212}', // 0x8A minus sign
        '\u{2030}', // 0x8B per mille sign
        '\u{201E}', // 0x8C double low-9 quotation mark
        '\u{201C}', // 0x8D left double quotation mark
        '\u{201D}', // 0x8E right double quotation mark
        '\u{2018}', // 0x8F left single quotation mark
        '\u{2019}', // 0x90 right single quotation mark
        '\u{201A}', // 0x91 single low-9 quotation mark
        '\u{2122}', // 0x92 trade mark sign
        '\u{FB01}', // 0x93 ligature fi
        '\u{FB02}', // 0x94 ligature fl
        '\u{0141}', // 0x95 Latin capital letter L with stroke
        '\u{0152}', // 0x96 Latin capital ligature OE
        '\u{0160}', // 0x97 Latin capital letter S with caron
        '\u{0178}', // 0x98 Latin capital letter Y with diaeresis
        '\u{017D}', // 0x99 Latin capital letter Z with caron
        '\u{0131}', // 0x9A Latin small letter dotless i
        '\u{0142}', // 0x9B Latin small letter l with stroke
        '\u{0153}', // 0x9C Latin small ligature oe
        '\u{0161}', // 0x9D Latin small letter s with caron
        '\u{017E}', // 0x9E Latin small letter z with caron
    ];

    let mut table = ['\0'; 256];

    // Start from the identity mapping (ASCII / Latin-1).
    let mut i = 0;
    while i < 256 {
        table[i] = i as u8 as char;
        i += 1;
    }

    let mut k = 0;
    while k < ACCENTS.len() {
        table[0x18 + k] = ACCENTS[k];
        k += 1;
    }

    let mut k = 0;
    while k < SPECIALS.len() {
        table[0x80 + k] = SPECIALS[k];
        k += 1;
    }

    table[0x7F] = '\u{FFFD}'; // Undefined
    table[0x9F] = '\u{FFFD}'; // Undefined
    table[0xA0] = '\u{20AC}'; // Euro sign
    table
};
160
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII passes through the WinAnsi decoder unchanged.
    #[test]
    fn test_decode_ascii() {
        assert_eq!(decode_text(b"Hello", Encoding::WinAnsiEncoding), "Hello");
    }

    /// A leading FE FF byte-order mark selects UTF-16BE regardless of `encoding`.
    #[test]
    fn test_decode_utf16be_bom() {
        // BOM, then big-endian 'H' and 'i'.
        let with_bom: [u8; 6] = [0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69];
        let decoded = decode_text(&with_bom, Encoding::WinAnsiEncoding);
        assert_eq!(decoded, "Hi");
    }

    /// Byte 0x80 is the Euro sign in WinAnsi.
    #[test]
    fn test_decode_winansi_special() {
        let euro = decode_text(&[0x80], Encoding::WinAnsiEncoding);
        assert_eq!(euro, "\u{20AC}");
    }

    /// Known encoding names resolve to their matching variants.
    #[test]
    fn test_encoding_from_name() {
        let expectations: [(&[u8], Encoding); 3] = [
            (b"WinAnsiEncoding" as &[u8], Encoding::WinAnsiEncoding),
            (b"MacRomanEncoding" as &[u8], Encoding::MacRomanEncoding),
            (b"Identity-H" as &[u8], Encoding::Identity),
        ];
        for &(name, expected) in expectations.iter() {
            assert_eq!(Encoding::from_name(name), expected);
        }
    }
}