justpdf_core/font/
/// Text encodings that can be selected by name.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Encoding {
    /// Selected by the name `StandardEncoding`; also what unrecognised names resolve to.
    StandardEncoding,
    /// Selected by the name `MacRomanEncoding`.
    MacRomanEncoding,
    /// Selected by the name `WinAnsiEncoding`.
    WinAnsiEncoding,
    /// Selected by the name `PDFDocEncoding`.
    PDFDocEncoding,
    /// Selected by either `Identity-H` or `Identity-V`.
    Identity,
}

impl Encoding {
    /// Resolves an encoding name, given as raw bytes, to its variant.
    ///
    /// Names are compared byte-for-byte (case-sensitively). Any name that is
    /// not listed resolves to [`Encoding::StandardEncoding`].
    pub fn from_name(name: &[u8]) -> Self {
        // Every recognised spelling and the variant it selects.
        const KNOWN: [(&[u8], Encoding); 6] = [
            (b"StandardEncoding", Encoding::StandardEncoding),
            (b"MacRomanEncoding", Encoding::MacRomanEncoding),
            (b"WinAnsiEncoding", Encoding::WinAnsiEncoding),
            (b"PDFDocEncoding", Encoding::PDFDocEncoding),
            (b"Identity-H", Encoding::Identity),
            (b"Identity-V", Encoding::Identity),
        ];

        KNOWN
            .iter()
            .find(|(candidate, _)| *candidate == name)
            .map_or(Self::StandardEncoding, |&(_, encoding)| encoding)
    }
}
24
25pub fn decode_text(bytes: &[u8], encoding: Encoding) -> String {
27 if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
29 return decode_utf16be(&bytes[2..]);
30 }
31
32 if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
34 return String::from_utf8_lossy(&bytes[3..]).into_owned();
35 }
36
37 match encoding {
38 Encoding::WinAnsiEncoding => decode_winansi(bytes),
39 Encoding::MacRomanEncoding => decode_mac_roman(bytes),
40 Encoding::PDFDocEncoding => decode_pdfdoc(bytes),
41 Encoding::StandardEncoding => decode_winansi(bytes), Encoding::Identity => {
43 String::from_utf8_lossy(bytes).into_owned()
45 }
46 }
47}
48
/// Decodes big-endian UTF-16 code units (without a byte-order mark) into a `String`.
///
/// Surrogate pairs are combined into a single scalar value. An unpaired
/// surrogate is decoded as U+FFFD REPLACEMENT CHARACTER instead of being
/// silently dropped, which is the same substitution `String::from_utf8_lossy`
/// performs for malformed UTF-8. A trailing odd byte cannot form a code unit
/// and is ignored.
fn decode_utf16be(bytes: &[u8]) -> String {
    // `chunks_exact` yields only complete two-byte units; a dangling byte is left out.
    let units = bytes
        .chunks_exact(2)
        .map(|pair| u16::from_be_bytes([pair[0], pair[1]]));

    std::char::decode_utf16(units)
        .map(|decoded| decoded.unwrap_or(std::char::REPLACEMENT_CHARACTER))
        .collect()
}
75
76fn decode_winansi(bytes: &[u8]) -> String {
78 bytes
79 .iter()
80 .map(|&b| WINANSI_TO_UNICODE[b as usize])
81 .collect()
82}
83
/// Decodes Mac Roman encoded bytes into a `String`.
///
/// Bytes below 0x80 are ASCII and map to themselves. Bytes from 0x80 upwards
/// are looked up in a dedicated Mac Roman table; they must not go through the
/// WinAnsi table, because the two encodings assign different characters to
/// almost every code in that range (for example 0x80 is `Ä` in Mac Roman but
/// `€` in WinAnsi).
fn decode_mac_roman(bytes: &[u8]) -> String {
    // Upper half (0x80..=0xFF) of Mac Roman, indexed by `byte - 0x80`.
    //
    // 0xDB is the currency sign, as in the PDF `MacRomanEncoding` table (later
    // Mac OS Roman revisions reassigned that code to the euro sign). Codes the
    // PDF table leaves unassigned (mostly mathematical symbols and 0xF0) carry
    // their Mac OS Roman meaning.
    const MAC_ROMAN_HIGH: [char; 128] = [
        // 0x80..=0x8F
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}',
        '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}',
        // 0x90..=0x9F
        '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}',
        '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}',
        // 0xA0..=0xAF
        '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}',
        '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}',
        // 0xB0..=0xBF
        '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}', '\u{00F8}',
        // 0xC0..=0xCF
        '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}',
        '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}',
        // 0xD0..=0xDF
        '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}',
        '\u{00FF}', '\u{0178}', '\u{2044}', '\u{00A4}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
        // 0xE0..=0xEF
        '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}',
        '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
        // 0xF0..=0xFF
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}',
        '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}',
    ];

    bytes
        .iter()
        .map(|&byte| {
            if byte < 0x80 {
                char::from(byte)
            } else {
                MAC_ROMAN_HIGH[usize::from(byte - 0x80)]
            }
        })
        .collect()
}
98
99fn decode_pdfdoc(bytes: &[u8]) -> String {
101 bytes
102 .iter()
103 .map(|&b| PDFDOC_TO_UNICODE[b as usize])
104 .collect()
105}
106
/// Byte-to-character lookup table used for WinAnsi decoding.
///
/// Every byte starts out mapped to the code point with the same numeric value.
/// The bytes listed in `OVERRIDES` (all within `0x80..=0x9F`) are then replaced.
/// Bytes in that range without an override (0x81, 0x8D, 0x8F, 0x90, 0x9D) keep
/// the identity mapping.
static WINANSI_TO_UNICODE: [char; 256] = {
    // (byte, character) pairs that differ from the identity mapping.
    const OVERRIDES: [(u8, char); 27] = [
        (0x80, '\u{20AC}'),
        (0x82, '\u{201A}'),
        (0x83, '\u{0192}'),
        (0x84, '\u{201E}'),
        (0x85, '\u{2026}'),
        (0x86, '\u{2020}'),
        (0x87, '\u{2021}'),
        (0x88, '\u{02C6}'),
        (0x89, '\u{2030}'),
        (0x8A, '\u{0160}'),
        (0x8B, '\u{2039}'),
        (0x8C, '\u{0152}'),
        (0x8E, '\u{017D}'),
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x98, '\u{02DC}'),
        (0x99, '\u{2122}'),
        (0x9A, '\u{0161}'),
        (0x9B, '\u{203A}'),
        (0x9C, '\u{0153}'),
        (0x9E, '\u{017E}'),
        (0x9F, '\u{0178}'),
    ];

    let mut table = ['\0'; 256];

    // `for` loops are unavailable in constant evaluation, hence the manual counters.
    let mut byte = 0;
    while byte < 256 {
        table[byte] = byte as u8 as char;
        byte += 1;
    }

    let mut entry = 0;
    while entry < OVERRIDES.len() {
        table[OVERRIDES[entry].0 as usize] = OVERRIDES[entry].1;
        entry += 1;
    }

    table
};
149
/// Byte-to-character lookup table used for PDFDocEncoding decoding.
///
/// PDFDocEncoding agrees with Latin-1 for printable ASCII and for
/// `0xA1..=0xFF`, but it has its own assignments for `0x18..=0x1F` (spacing
/// diacritics) and `0x80..=0xA0` (punctuation, ligatures, extra Latin letters
/// and the euro sign). Those ranges therefore cannot be borrowed from the
/// WinAnsi table, which places different characters at nearly all of those
/// codes. The unassigned codes 0x7F and 0x9F map to U+FFFD. All remaining
/// bytes, including 0xAD, keep the identity mapping.
static PDFDOC_TO_UNICODE: [char; 256] = {
    // Characters for bytes 0x18..=0x1F: breve, caron, circumflex, dot above,
    // double acute, ogonek, ring above, small tilde.
    const DIACRITICS: [char; 8] = [
        '\u{02D8}', '\u{02C7}', '\u{02C6}', '\u{02D9}', '\u{02DD}', '\u{02DB}', '\u{02DA}', '\u{02DC}',
    ];
    // Characters for bytes 0x80..=0xA0.
    const UPPER: [char; 33] = [
        // 0x80..=0x8F
        '\u{2022}', '\u{2020}', '\u{2021}', '\u{2026}', '\u{2014}', '\u{2013}', '\u{0192}', '\u{2044}',
        '\u{2039}', '\u{203A}', '\u{2212}', '\u{2030}', '\u{201E}', '\u{201C}', '\u{201D}', '\u{2018}',
        // 0x90..=0x9F (0x9F is unassigned)
        '\u{2019}', '\u{201A}', '\u{2122}', '\u{FB01}', '\u{FB02}', '\u{0141}', '\u{0152}', '\u{0160}',
        '\u{0178}', '\u{017D}', '\u{0131}', '\u{0142}', '\u{0153}', '\u{0161}', '\u{017E}', '\u{FFFD}',
        // 0xA0
        '\u{20AC}',
    ];

    let mut table = ['\0'; 256];

    // `for` loops are unavailable in constant evaluation, hence the manual counters.
    let mut byte = 0;
    while byte < 256 {
        table[byte] = byte as u8 as char;
        byte += 1;
    }

    let mut offset = 0;
    while offset < DIACRITICS.len() {
        table[0x18 + offset] = DIACRITICS[offset];
        offset += 1;
    }

    let mut offset = 0;
    while offset < UPPER.len() {
        table[0x80 + offset] = UPPER[offset];
        offset += 1;
    }

    // 0x7F is unassigned in PDFDocEncoding.
    table[0x7F] = '\u{FFFD}';

    table
};
160
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII decodes to itself under WinAnsi.
    #[test]
    fn test_decode_ascii() {
        assert_eq!(decode_text(b"Hello", Encoding::WinAnsiEncoding), "Hello");
    }

    /// A UTF-16BE byte-order mark overrides the requested encoding.
    #[test]
    fn test_decode_utf16be_bom() {
        let with_bom: [u8; 6] = [0xFE, 0xFF, 0x00, 0x48, 0x00, 0x69];
        assert_eq!(decode_text(&with_bom, Encoding::WinAnsiEncoding), "Hi");
    }

    /// Byte 0x80 is the euro sign in WinAnsi.
    #[test]
    fn test_decode_winansi_special() {
        let euro = decode_text(&[0x80], Encoding::WinAnsiEncoding);
        assert_eq!(euro, "\u{20AC}");
    }

    /// Known encoding names resolve to their variants.
    #[test]
    fn test_encoding_from_name() {
        let cases: [(&[u8], Encoding); 3] = [
            (b"WinAnsiEncoding", Encoding::WinAnsiEncoding),
            (b"MacRomanEncoding", Encoding::MacRomanEncoding),
            (b"Identity-H", Encoding::Identity),
        ];
        for &(name, expected) in &cases {
            assert_eq!(Encoding::from_name(name), expected);
        }
    }
}