/// A character set used to convert between raw bytes and Rust strings.
///
/// Values are normally obtained from [`Charset::from_name`]; the default is
/// [`Charset::Utf8`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum Charset {
    /// UTF-8. This is the default variant.
    #[default]
    Utf8,
    /// ISO-8859-1: every byte maps to the code point with the same value.
    Latin1,
    /// Windows-1252, handled by the built-in conversion functions of this file.
    Win1252,
    /// Any encoding provided by `encoding_rs`; only present when the
    /// `charset-full` feature is enabled.
    #[cfg(feature = "charset-full")]
    Encoding(&'static encoding_rs::Encoding),
    /// A single-byte DOS code page. The table holds the characters for bytes
    /// `0x80..=0xFF` (index 0 is byte `0x80`); lower bytes are plain ASCII.
    Dos(&'static [char; 128]),
    /// The name was not recognised. `decode` and `encode` treat this variant
    /// exactly like [`Charset::Utf8`].
    Unknown,
}
35
36impl Charset {
37 pub fn from_name(name: &str) -> Self {
43 let n: String = name
44 .chars()
45 .filter(|c| c.is_ascii_alphanumeric())
46 .map(|c| c.to_ascii_uppercase())
47 .collect();
48 match n.as_str() {
49 "UTF8" | "UNICODEFSS" => Charset::Utf8,
50 "ISO88591" | "LATIN1" => Charset::Latin1,
51 "WIN1252" | "WINDOWS1252" => Charset::Win1252,
52 other => match dos_table(other) {
53 Some(table) => Charset::Dos(table),
54 None => Self::resolve_extra(other),
55 },
56 }
57 }
58
59 #[cfg(feature = "charset-full")]
62 fn resolve_extra(normalized: &str) -> Self {
63 match whatwg_label(normalized) {
64 Some(label) => match encoding_rs::Encoding::for_label(label.as_bytes()) {
65 Some(enc) => Charset::Encoding(enc),
66 None => Charset::Unknown,
67 },
68 None => Charset::Unknown,
69 }
70 }
71
72 #[cfg(not(feature = "charset-full"))]
73 fn resolve_extra(_normalized: &str) -> Self {
74 Charset::Unknown
75 }
76
77 pub fn decode(self, raw: &[u8]) -> String {
79 match self {
80 Charset::Utf8 | Charset::Unknown => String::from_utf8_lossy(raw).into_owned(),
81 Charset::Latin1 => raw.iter().map(|&b| b as char).collect(),
82 Charset::Win1252 => raw.iter().map(|&b| win1252_char(b)).collect(),
83 #[cfg(feature = "charset-full")]
84 Charset::Encoding(enc) => enc.decode(raw).0.into_owned(),
85 Charset::Dos(table) => raw
86 .iter()
87 .map(|&b| {
88 if b < 0x80 {
89 b as char
90 } else {
91 table[(b - 0x80) as usize]
92 }
93 })
94 .collect(),
95 }
96 }
97
98 pub fn encode(self, s: &str) -> Vec<u8> {
104 match self {
105 Charset::Utf8 | Charset::Unknown => s.as_bytes().to_vec(),
106 Charset::Latin1 => s
107 .chars()
108 .map(|c| if (c as u32) <= 0xFF { c as u8 } else { b'?' })
109 .collect(),
110 Charset::Win1252 => s.chars().map(win1252_byte).collect(),
111 #[cfg(feature = "charset-full")]
112 Charset::Encoding(enc) => enc.encode(s).0.into_owned(),
113 Charset::Dos(table) => s
114 .chars()
115 .map(|c| {
116 if (c as u32) < 0x80 {
117 c as u8
118 } else {
119 table
121 .iter()
122 .position(|&t| t == c)
123 .map_or(b'?', |i| (i + 0x80) as u8)
124 }
125 })
126 .collect(),
127 }
128 }
129}
130
131fn dos_table(n: &str) -> Option<&'static [char; 128]> {
134 use crate::dos::*;
135 Some(match n {
136 "DOS437" => &CP437,
137 "DOS737" => &CP737,
138 "DOS775" => &CP775,
139 "DOS850" => &CP850,
140 "DOS852" => &CP852,
141 "DOS855" => &CP855,
142 "DOS857" => &CP857,
143 "DOS858" => &CP858,
144 "DOS860" => &CP860,
145 "DOS861" => &CP861,
146 "DOS862" => &CP862,
147 "DOS863" => &CP863,
148 "DOS864" => &CP864,
149 "DOS865" => &CP865,
150 "DOS866" => &CP866,
151 "DOS869" => &CP869,
152 _ => return None,
153 })
154}
155
156#[cfg(feature = "charset-full")]
/// Translates a normalised (upper-case, alphanumeric-only) charset name into
/// the label string that is handed to `encoding_rs::Encoding::for_label`.
///
/// Names starting with `ISO8859` are resolved by their part number only; a
/// part that is not listed (including part 1) yields `None`. All other names
/// are looked up in a fixed alias table. Returns `None` for unknown names.
fn whatwg_label(n: &str) -> Option<&'static str> {
    let label = match n.strip_prefix("ISO8859") {
        // ISO-8859 family: decided solely by the part number.
        Some(part) => match part {
            "2" => "iso-8859-2",
            "3" => "iso-8859-3",
            "4" => "iso-8859-4",
            "5" => "iso-8859-5",
            "6" => "iso-8859-6",
            "7" => "iso-8859-7",
            "8" => "iso-8859-8",
            "9" => "iso-8859-9",
            "10" => "iso-8859-10",
            "13" => "iso-8859-13",
            "14" => "iso-8859-14",
            "15" => "iso-8859-15",
            "16" => "iso-8859-16",
            _ => return None,
        },
        // Everything else: multi-byte and Windows/KOI8 aliases.
        None => match n {
            "SJIS0208" | "SJIS" | "SHIFTJIS" => "shift_jis",
            "EUCJ0208" | "EUCJP" => "euc-jp",
            "KSC5601" | "EUCKR" => "euc-kr",
            "GB2312" | "GBK" => "gbk",
            "GB18030" => "gb18030",
            "BIG5" => "big5",
            "KOI8R" => "koi8-r",
            "KOI8U" => "koi8-u",
            "TIS620" => "windows-874",
            "WIN1250" => "windows-1250",
            "WIN1251" => "windows-1251",
            "WIN1253" => "windows-1253",
            "WIN1254" => "windows-1254",
            "WIN1255" => "windows-1255",
            "WIN1256" => "windows-1256",
            "WIN1257" => "windows-1257",
            "WIN1258" => "windows-1258",
            _ => return None,
        },
    };
    Some(label)
}
211
/// Decodes a single Windows-1252 byte to its Unicode character.
///
/// Bytes outside `0x80..=0x9F` map to the code point with the same value.
/// Inside that range the Windows-1252 assignments are used; the five bytes
/// without an assignment (`0x81`, `0x8D`, `0x8F`, `0x90`, `0x9D`) map to the
/// control character with the same value.
fn win1252_char(b: u8) -> char {
    // Characters for bytes 0x80..=0x9F, indexed by `b - 0x80`.
    const HIGH: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', // 0x80..=0x83
        '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}', // 0x84..=0x87
        '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', // 0x88..=0x8B
        '\u{0152}', '\u{008D}', '\u{017D}', '\u{008F}', // 0x8C..=0x8F
        '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', // 0x90..=0x93
        '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}', // 0x94..=0x97
        '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}', // 0x98..=0x9B
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}', // 0x9C..=0x9F
    ];
    match b {
        0x80..=0x9F => HIGH[usize::from(b - 0x80)],
        _ => b as char,
    }
}
248
/// Encodes a single character as a Windows-1252 byte.
///
/// Characters that Windows-1252 cannot represent are encoded as `b'?'`.
///
/// The C1 control code points U+0080..=U+009F need care: most of their byte
/// values are assigned to other characters in Windows-1252 (for example byte
/// `0x80` is '€'), so emitting the code point value as a byte would silently
/// change the text. Only the five code points whose byte slot is unassigned
/// (U+0081, U+008D, U+008F, U+0090, U+009D) are passed through; the remaining
/// C1 controls are unrepresentable and become `?`.
fn win1252_byte(c: char) -> u8 {
    match c {
        '\u{20AC}' => 0x80,
        '\u{201A}' => 0x82,
        '\u{0192}' => 0x83,
        '\u{201E}' => 0x84,
        '\u{2026}' => 0x85,
        '\u{2020}' => 0x86,
        '\u{2021}' => 0x87,
        '\u{02C6}' => 0x88,
        '\u{2030}' => 0x89,
        '\u{0160}' => 0x8A,
        '\u{2039}' => 0x8B,
        '\u{0152}' => 0x8C,
        '\u{017D}' => 0x8E,
        '\u{2018}' => 0x91,
        '\u{2019}' => 0x92,
        '\u{201C}' => 0x93,
        '\u{201D}' => 0x94,
        '\u{2022}' => 0x95,
        '\u{2013}' => 0x96,
        '\u{2014}' => 0x97,
        '\u{02DC}' => 0x98,
        '\u{2122}' => 0x99,
        '\u{0161}' => 0x9A,
        '\u{203A}' => 0x9B,
        '\u{0153}' => 0x9C,
        '\u{017E}' => 0x9E,
        '\u{0178}' => 0x9F,
        // ASCII and the upper Latin-1 half share their byte values.
        '\u{0000}'..='\u{007F}' | '\u{00A0}'..='\u{00FF}' => c as u8,
        // Unassigned Windows-1252 slots: the byte decodes back to this control.
        '\u{0081}' | '\u{008D}' | '\u{008F}' | '\u{0090}' | '\u{009D}' => c as u8,
        // Remaining C1 controls and everything above U+00FF.
        _ => b'?',
    }
}
286
#[cfg(test)]
mod tests {
    use super::*;

    /// Names are matched regardless of case and punctuation; unrecognised
    /// names resolve to `Unknown`.
    #[test]
    fn name_resolution() {
        let expectations = [
            ("UTF8", Charset::Utf8),
            ("utf-8", Charset::Utf8),
            ("ISO8859_1", Charset::Latin1),
            ("Latin1", Charset::Latin1),
            ("WIN1252", Charset::Win1252),
            ("NOSUCHCHARSET", Charset::Unknown),
        ];
        for (name, expected) in expectations {
            assert_eq!(Charset::from_name(name), expected);
        }
    }

    #[test]
    fn latin1_decode() {
        let text = Charset::Latin1.decode(&[0x48, 0xE9, 0xF1]);
        assert_eq!(text, "Héñ");
    }

    #[test]
    fn win1252_decode() {
        let cp = Charset::Win1252;
        assert_eq!(cp.decode(&[0x80]), "€");
        assert_eq!(cp.decode(&[0x93, 0x94]), "\u{201C}\u{201D}");
        assert_eq!(cp.decode(&[0xE9]), "é");
    }

    #[test]
    fn utf8_passthrough() {
        let text = "café €";
        assert_eq!(Charset::Utf8.decode(text.as_bytes()), text);
    }

    /// Decoding bytes and encoding the result must reproduce the bytes.
    #[test]
    fn encode_inverts_decode() {
        let samples: [(Charset, &[u8]); 2] = [
            (Charset::Latin1, &[0x48, 0xE9, 0xF1, 0x20, 0xFF]),
            (Charset::Win1252, &[0x80, 0x93, 0x94, 0xE9, 0x97]),
        ];
        for (cs, bytes) in samples {
            let decoded = cs.decode(bytes);
            assert_eq!(cs.encode(&decoded), bytes, "roundtrip falhou para {cs:?}");
        }
    }

    #[test]
    fn encode_unrepresentable_is_question_mark() {
        let samples = [
            (Charset::Latin1, "a€b", b"a?b"),
            (Charset::Win1252, "x\u{4E00}y", b"x?y"),
        ];
        for (cs, input, expected) in samples {
            assert_eq!(cs.encode(input), expected);
        }
    }

    #[test]
    fn dos_code_pages_resolve_and_roundtrip() {
        for name in ["DOS850", "DOS437"] {
            assert!(matches!(Charset::from_name(name), Charset::Dos(_)));
        }

        let cp850 = Charset::from_name("DOS850");
        let cp850_bytes = [0x41u8, 0x82, 0xA5];
        assert_eq!(cp850.decode(&cp850_bytes), "Aé\u{D1}");
        assert_eq!(cp850.encode("Aé\u{D1}"), cp850_bytes);

        let cp860 = Charset::from_name("DOS860");
        let cp860_bytes = [0x84u8, 0x85, 0x94];
        assert_eq!(cp860.decode(&cp860_bytes), "ãàõ");
        assert_eq!(cp860.encode("ãàõ"), cp860_bytes);

        // A character missing from the code page becomes `?`.
        assert_eq!(cp850.encode("€"), b"?");
    }

    /// Without the `charset-full` feature the multi-byte names are unknown.
    #[cfg(not(feature = "charset-full"))]
    #[test]
    fn multibyte_without_feature_is_unknown() {
        for name in ["SJIS_0208", "EUCJ_0208"] {
            assert_eq!(Charset::from_name(name), Charset::Unknown);
        }
    }

    /// Tests that need the `encoding_rs` backed encodings.
    #[cfg(feature = "charset-full")]
    mod full {
        use super::*;

        #[test]
        fn resolves_multibyte_names() {
            let names = [
                "SJIS_0208",
                "EUCJ_0208",
                "GBK",
                "BIG_5",
                "WIN1251",
                "ISO8859_2",
            ];
            for name in names {
                let resolved = Charset::from_name(name);
                assert!(
                    matches!(resolved, Charset::Encoding(_)),
                    "{name} não resolveu para encoding_rs"
                );
            }
        }

        #[test]
        fn shift_jis_roundtrip() {
            let sjis = Charset::from_name("SJIS_0208");
            let encoded = sjis.encode("日本語");
            assert_eq!(encoded, vec![0x93, 0xfa, 0x96, 0x7b, 0x8c, 0xea]);
            assert_eq!(sjis.decode(&encoded), "日本語");
        }

        #[test]
        fn win1251_decode_cyrillic() {
            let decoded = Charset::from_name("WIN1251").decode(&[0xcf]);
            assert_eq!(decoded, "П");
        }

        #[test]
        fn iso8859_15_euro() {
            let latin9 = Charset::from_name("ISO8859_15");
            assert_eq!(latin9.decode(&[0xA4]), "€");
        }
    }
}