1use crate::TermError;
2
/// A character encoding that text can be decoded from or encoded into.
///
/// The default is [`Encoding::Utf8`]. Variant order is significant for anyone relying on
/// the implicit discriminants, so new variants should be appended rather than inserted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Encoding {
    // --- Unicode / basic Latin -------------------------------------------------------
    /// UTF-8 (the default).
    #[default]
    Utf8,
    /// 7-bit US-ASCII.
    Ascii,

    // --- Western European ------------------------------------------------------------
    /// ISO-8859-1 (Latin-1).
    Iso8859_1,
    /// Windows code page 1252.
    Windows1252,
    /// ISO-8859-15 (Latin-9).
    Iso8859_15,
    /// Mac OS Roman.
    Macintosh,

    // --- Central European ------------------------------------------------------------
    /// ISO-8859-2 (Latin-2).
    Iso8859_2,
    /// Windows code page 1250.
    Windows1250,

    // --- South European / Turkish ----------------------------------------------------
    /// ISO-8859-3 (Latin-3).
    Iso8859_3,
    /// ISO-8859-9 (Latin-5).
    Iso8859_9,
    /// Windows code page 1254.
    Windows1254,

    // --- Nordic / Baltic / Celtic / South-Eastern European ---------------------------
    /// ISO-8859-4 (Latin-4).
    Iso8859_4,
    /// ISO-8859-10 (Latin-6).
    Iso8859_10,
    /// ISO-8859-13 (Latin-7).
    Iso8859_13,
    /// Windows code page 1257.
    Windows1257,
    /// ISO-8859-14 (Latin-8).
    Iso8859_14,
    /// ISO-8859-16 (Latin-10).
    Iso8859_16,

    // --- Cyrillic --------------------------------------------------------------------
    /// ISO-8859-5.
    Iso8859_5,
    /// Windows code page 1251.
    Windows1251,
    /// KOI8-R.
    Koi8R,
    /// KOI8-U.
    Koi8U,
    /// IBM code page 866.
    Ibm866,
    /// Mac OS Cyrillic.
    XMacCyrillic,

    // --- Greek -----------------------------------------------------------------------
    /// ISO-8859-7.
    Iso8859_7,
    /// Windows code page 1253.
    Windows1253,

    // --- Hebrew ----------------------------------------------------------------------
    /// ISO-8859-8.
    Iso8859_8,
    /// ISO-8859-8-I.
    Iso8859_8I,
    /// Windows code page 1255.
    Windows1255,

    // --- Arabic ----------------------------------------------------------------------
    /// ISO-8859-6.
    Iso8859_6,
    /// Windows code page 1256.
    Windows1256,

    // --- Vietnamese / Thai -----------------------------------------------------------
    /// Windows code page 1258.
    Windows1258,
    /// Windows code page 874.
    Windows874,

    // --- Japanese --------------------------------------------------------------------
    /// Shift_JIS.
    ShiftJis,
    /// EUC-JP.
    EucJp,
    /// ISO-2022-JP.
    Iso2022Jp,

    // --- Chinese ---------------------------------------------------------------------
    /// GBK.
    Gbk,
    /// GB 18030.
    Gb18030,
    /// Big5.
    Big5,

    // --- Korean ----------------------------------------------------------------------
    /// EUC-KR.
    EucKr,

    // --- UTF-16 ----------------------------------------------------------------------
    /// UTF-16, big-endian.
    Utf16Be,
    /// UTF-16, little-endian.
    Utf16Le,
}
77
78impl Encoding {
79 pub const ALL: &[Encoding] = &[
80 Self::Utf8, Self::Ascii,
81 Self::Iso8859_1, Self::Iso8859_2, Self::Iso8859_3, Self::Iso8859_4,
82 Self::Iso8859_5, Self::Iso8859_6, Self::Iso8859_7, Self::Iso8859_8,
83 Self::Iso8859_8I, Self::Iso8859_9, Self::Iso8859_10, Self::Iso8859_13,
84 Self::Iso8859_14, Self::Iso8859_15, Self::Iso8859_16,
85 Self::Windows874, Self::Windows1250, Self::Windows1251, Self::Windows1252,
86 Self::Windows1253, Self::Windows1254, Self::Windows1255, Self::Windows1256,
87 Self::Windows1257, Self::Windows1258,
88 Self::Koi8R, Self::Koi8U, Self::Ibm866,
89 Self::Macintosh, Self::XMacCyrillic,
90 Self::ShiftJis, Self::EucJp, Self::Iso2022Jp,
91 Self::Gbk, Self::Gb18030, Self::Big5,
92 Self::EucKr,
93 Self::Utf16Be, Self::Utf16Le,
94 ];
95
96 pub fn from_name(name: &str) -> Option<Self> {
97 let lower = name.trim().to_ascii_lowercase();
98 match lower.as_str() {
99 "utf-8" | "utf8" => Some(Self::Utf8),
100 "us-ascii" | "ascii" | "iso-ir-6" => Some(Self::Ascii),
101 "iso-8859-1" | "latin1" | "latin-1" | "iso_8859-1" | "l1" => Some(Self::Iso8859_1),
102 _ => {
103 let enc_rs = encoding_rs::Encoding::for_label(lower.as_bytes())?;
104 Self::from_encoding_rs(enc_rs)
105 }
106 }
107 }
108
109 pub fn name(&self) -> &'static str {
111 match self {
112 Self::Utf8 => "utf-8",
113 Self::Ascii => "us-ascii",
114 Self::Iso8859_1 => "iso-8859-1",
115 Self::Iso8859_2 => "iso-8859-2",
116 Self::Iso8859_3 => "iso-8859-3",
117 Self::Iso8859_4 => "iso-8859-4",
118 Self::Iso8859_5 => "iso-8859-5",
119 Self::Iso8859_6 => "iso-8859-6",
120 Self::Iso8859_7 => "iso-8859-7",
121 Self::Iso8859_8 => "iso-8859-8",
122 Self::Iso8859_8I => "iso-8859-8-i",
123 Self::Iso8859_9 => "iso-8859-9",
124 Self::Iso8859_10 => "iso-8859-10",
125 Self::Iso8859_13 => "iso-8859-13",
126 Self::Iso8859_14 => "iso-8859-14",
127 Self::Iso8859_15 => "iso-8859-15",
128 Self::Iso8859_16 => "iso-8859-16",
129 Self::Windows874 => "windows-874",
130 Self::Windows1250 => "windows-1250",
131 Self::Windows1251 => "windows-1251",
132 Self::Windows1252 => "windows-1252",
133 Self::Windows1253 => "windows-1253",
134 Self::Windows1254 => "windows-1254",
135 Self::Windows1255 => "windows-1255",
136 Self::Windows1256 => "windows-1256",
137 Self::Windows1257 => "windows-1257",
138 Self::Windows1258 => "windows-1258",
139 Self::Koi8R => "koi8-r",
140 Self::Koi8U => "koi8-u",
141 Self::Ibm866 => "ibm866",
142 Self::Macintosh => "macintosh",
143 Self::XMacCyrillic => "x-mac-cyrillic",
144 Self::ShiftJis => "shift_jis",
145 Self::EucJp => "euc-jp",
146 Self::Iso2022Jp => "iso-2022-jp",
147 Self::Gbk => "gbk",
148 Self::Gb18030 => "gb18030",
149 Self::Big5 => "big5",
150 Self::EucKr => "euc-kr",
151 Self::Utf16Be => "utf-16be",
152 Self::Utf16Le => "utf-16le",
153 }
154 }
155
156 fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
158 match self {
159 Self::Utf8 => encoding_rs::UTF_8,
160 Self::Ascii | Self::Iso8859_1 => encoding_rs::WINDOWS_1252,
161 Self::Windows1252 => encoding_rs::WINDOWS_1252,
162 Self::Iso8859_15 => encoding_rs::ISO_8859_15,
163 Self::Macintosh => encoding_rs::MACINTOSH,
164 Self::Iso8859_2 => encoding_rs::ISO_8859_2,
165 Self::Windows1250 => encoding_rs::WINDOWS_1250,
166 Self::Iso8859_3 => encoding_rs::ISO_8859_3,
167 Self::Iso8859_9 => encoding_rs::WINDOWS_1254,
168 Self::Windows1254 => encoding_rs::WINDOWS_1254,
169 Self::Iso8859_4 => encoding_rs::ISO_8859_4,
170 Self::Iso8859_10 => encoding_rs::ISO_8859_10,
171 Self::Iso8859_13 => encoding_rs::ISO_8859_13,
172 Self::Windows1257 => encoding_rs::WINDOWS_1257,
173 Self::Iso8859_14 => encoding_rs::ISO_8859_14,
174 Self::Iso8859_16 => encoding_rs::ISO_8859_16,
175 Self::Iso8859_5 => encoding_rs::ISO_8859_5,
176 Self::Windows1251 => encoding_rs::WINDOWS_1251,
177 Self::Koi8R => encoding_rs::KOI8_R,
178 Self::Koi8U => encoding_rs::KOI8_U,
179 Self::Ibm866 => encoding_rs::IBM866,
180 Self::XMacCyrillic => encoding_rs::X_MAC_CYRILLIC,
181 Self::Iso8859_7 => encoding_rs::ISO_8859_7,
182 Self::Windows1253 => encoding_rs::WINDOWS_1253,
183 Self::Iso8859_8 => encoding_rs::ISO_8859_8,
184 Self::Iso8859_8I => encoding_rs::ISO_8859_8_I,
185 Self::Windows1255 => encoding_rs::WINDOWS_1255,
186 Self::Iso8859_6 => encoding_rs::ISO_8859_6,
187 Self::Windows1256 => encoding_rs::WINDOWS_1256,
188 Self::Windows1258 => encoding_rs::WINDOWS_1258,
189 Self::Windows874 => encoding_rs::WINDOWS_874,
190 Self::ShiftJis => encoding_rs::SHIFT_JIS,
191 Self::EucJp => encoding_rs::EUC_JP,
192 Self::Iso2022Jp => encoding_rs::ISO_2022_JP,
193 Self::Gbk => encoding_rs::GBK,
194 Self::Gb18030 => encoding_rs::GB18030,
195 Self::Big5 => encoding_rs::BIG5,
196 Self::EucKr => encoding_rs::EUC_KR,
197 Self::Utf16Be => encoding_rs::UTF_16BE,
198 Self::Utf16Le => encoding_rs::UTF_16LE,
199 }
200 }
201
202 fn from_encoding_rs(enc: &'static encoding_rs::Encoding) -> Option<Self> {
204 Some(match enc {
205 e if e == encoding_rs::UTF_8 => Self::Utf8,
206 e if e == encoding_rs::WINDOWS_1252 => Self::Windows1252,
207 e if e == encoding_rs::ISO_8859_15 => Self::Iso8859_15,
208 e if e == encoding_rs::MACINTOSH => Self::Macintosh,
209 e if e == encoding_rs::ISO_8859_2 => Self::Iso8859_2,
210 e if e == encoding_rs::WINDOWS_1250 => Self::Windows1250,
211 e if e == encoding_rs::ISO_8859_3 => Self::Iso8859_3,
212 e if e == encoding_rs::WINDOWS_1254 => Self::Windows1254,
213 e if e == encoding_rs::ISO_8859_4 => Self::Iso8859_4,
214 e if e == encoding_rs::ISO_8859_10 => Self::Iso8859_10,
215 e if e == encoding_rs::ISO_8859_13 => Self::Iso8859_13,
216 e if e == encoding_rs::WINDOWS_1257 => Self::Windows1257,
217 e if e == encoding_rs::ISO_8859_14 => Self::Iso8859_14,
218 e if e == encoding_rs::ISO_8859_16 => Self::Iso8859_16,
219 e if e == encoding_rs::ISO_8859_5 => Self::Iso8859_5,
220 e if e == encoding_rs::WINDOWS_1251 => Self::Windows1251,
221 e if e == encoding_rs::KOI8_R => Self::Koi8R,
222 e if e == encoding_rs::KOI8_U => Self::Koi8U,
223 e if e == encoding_rs::IBM866 => Self::Ibm866,
224 e if e == encoding_rs::X_MAC_CYRILLIC => Self::XMacCyrillic,
225 e if e == encoding_rs::ISO_8859_7 => Self::Iso8859_7,
226 e if e == encoding_rs::WINDOWS_1253 => Self::Windows1253,
227 e if e == encoding_rs::ISO_8859_8 => Self::Iso8859_8,
228 e if e == encoding_rs::ISO_8859_8_I => Self::Iso8859_8I,
229 e if e == encoding_rs::WINDOWS_1255 => Self::Windows1255,
230 e if e == encoding_rs::ISO_8859_6 => Self::Iso8859_6,
231 e if e == encoding_rs::WINDOWS_1256 => Self::Windows1256,
232 e if e == encoding_rs::WINDOWS_1258 => Self::Windows1258,
233 e if e == encoding_rs::WINDOWS_874 => Self::Windows874,
234 e if e == encoding_rs::SHIFT_JIS => Self::ShiftJis,
235 e if e == encoding_rs::EUC_JP => Self::EucJp,
236 e if e == encoding_rs::ISO_2022_JP => Self::Iso2022Jp,
237 e if e == encoding_rs::GBK => Self::Gbk,
238 e if e == encoding_rs::GB18030 => Self::Gb18030,
239 e if e == encoding_rs::BIG5 => Self::Big5,
240 e if e == encoding_rs::EUC_KR => Self::EucKr,
241 e if e == encoding_rs::UTF_16BE => Self::Utf16Be,
242 e if e == encoding_rs::UTF_16LE => Self::Utf16Le,
243 _ => return None,
244 })
245 }
246
247 pub fn decode(&self, bytes: &[u8]) -> Result<String, TermError> {
256 match self {
257 Self::Utf8 => {
258 let s = std::str::from_utf8(bytes)
259 .map_err(|e| TermError::Encoding(e.to_string().into()))?;
260 Ok(s.to_owned())
261 }
262 Self::Ascii => {
263 if let Some(pos) = bytes.iter().position(|&b| b > 127) {
264 return Err(TermError::Encoding(
265 format!("non-ASCII byte 0x{:02X} at offset {}", bytes[pos], pos).into(),
266 ));
267 }
268 Ok(unsafe { String::from_utf8_unchecked(bytes.to_vec()) })
269 }
270 Self::Iso8859_1 => {
271 let mut out = String::with_capacity(bytes.len());
272 for &b in bytes {
273 out.push(b as char);
274 }
275 Ok(out)
276 }
277 _ => {
278 let (cow, _, had_errors) = self.to_encoding_rs().decode(bytes);
279 if had_errors {
280 return Err(TermError::Encoding(
281 format!("invalid {} sequence", self.name()).into(),
282 ));
283 }
284 Ok(cow.into_owned())
285 }
286 }
287 }
288
289 pub fn encode(&self, s: &str) -> Result<Vec<u8>, TermError> {
298 match self {
299 Self::Utf8 => Ok(s.as_bytes().to_vec()),
300 Self::Ascii => {
301 if let Some(ch) = s.chars().find(|c| !c.is_ascii()) {
302 return Err(TermError::Encoding(
303 format!("non-ASCII character '{}' (U+{:04X})", ch, ch as u32).into(),
304 ));
305 }
306 Ok(s.as_bytes().to_vec())
307 }
308 Self::Iso8859_1 => {
309 let mut out = Vec::with_capacity(s.len());
310 for ch in s.chars() {
311 let cp = ch as u32;
312 if cp > 0xFF {
313 return Err(TermError::Encoding(
314 format!(
315 "character '{}' (U+{:04X}) not representable in iso-8859-1",
316 ch, cp
317 )
318 .into(),
319 ));
320 }
321 out.push(cp as u8);
322 }
323 Ok(out)
324 }
325 _ => {
326 let (cow, _, had_errors) = self.to_encoding_rs().encode(s);
327 if had_errors {
328 return Err(TermError::Encoding(
329 format!("string not representable in {}", self.name()).into(),
330 ));
331 }
332 Ok(cow.into_owned())
333 }
334 }
335 }
336}
337
338impl std::fmt::Display for Encoding {
339 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
340 f.write_str(self.name())
341 }
342}
343
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes `bytes` with `enc`, panicking if the input is rejected.
    fn decode_ok(enc: Encoding, bytes: &[u8]) -> String {
        enc.decode(bytes).unwrap()
    }

    /// Encodes `text` with `enc`, panicking if the text is not representable.
    fn encode_ok(enc: Encoding, text: &str) -> Vec<u8> {
        enc.encode(text).unwrap()
    }

    /// Asserts that encoding `text` and decoding the result gives `text` back.
    fn assert_text_roundtrip(enc: Encoding, text: &str) {
        let wire = encode_ok(enc, text);
        assert_eq!(decode_ok(enc, &wire), text);
    }

    // ---- label lookup ---------------------------------------------------------------

    #[test]
    fn from_name_case_insensitive() {
        let cases = [
            ("UTF-8", Some(Encoding::Utf8)),
            ("utf8", Some(Encoding::Utf8)),
            ("ISO-8859-1", Some(Encoding::Iso8859_1)),
            ("latin1", Some(Encoding::Iso8859_1)),
            ("Windows-1252", Some(Encoding::Windows1252)),
            ("us-ascii", Some(Encoding::Ascii)),
            ("unknown", None),
        ];
        for (label, expected) in cases {
            assert_eq!(Encoding::from_name(label), expected);
        }
    }

    #[test]
    fn from_name_encoding_rs_labels() {
        let cases = [
            ("shift_jis", Encoding::ShiftJis),
            ("euc-jp", Encoding::EucJp),
            ("gbk", Encoding::Gbk),
            ("big5", Encoding::Big5),
            ("koi8-r", Encoding::Koi8R),
            ("iso-8859-2", Encoding::Iso8859_2),
            ("windows-1251", Encoding::Windows1251),
            ("utf-16le", Encoding::Utf16Le),
            ("utf-16be", Encoding::Utf16Be),
        ];
        for (label, expected) in cases {
            assert_eq!(Encoding::from_name(label), Some(expected));
        }
    }

    // ---- UTF-8 / ASCII / Latin-1 decoding -------------------------------------------

    #[test]
    fn decode_utf8_valid() {
        let text = "café";
        assert_eq!(decode_ok(Encoding::Utf8, text.as_bytes()), text);
    }

    #[test]
    fn decode_utf8_invalid() {
        let outcome = Encoding::Utf8.decode(&[0xFF, 0xFE]);
        assert!(outcome.is_err());
    }

    #[test]
    fn decode_ascii_valid() {
        assert_eq!(decode_ok(Encoding::Ascii, b"hello"), "hello");
    }

    #[test]
    fn decode_ascii_invalid() {
        let outcome = Encoding::Ascii.decode(&[0x80]);
        assert!(outcome.is_err());
    }

    #[test]
    fn decode_latin1() {
        let wire = [0x63, 0x61, 0x66, 0xE9];
        assert_eq!(decode_ok(Encoding::Iso8859_1, &wire), "café");
    }

    #[test]
    fn decode_latin1_full_range() {
        let every_byte: Vec<u8> = (u8::MIN..=u8::MAX).collect();
        let text = decode_ok(Encoding::Iso8859_1, &every_byte);
        assert_eq!(text.chars().count(), 256);
        assert_eq!(text.chars().next_back(), Some('\u{FF}'));
    }

    // ---- single-byte code pages and Shift_JIS ---------------------------------------

    #[test]
    fn decode_windows1252() {
        assert_eq!(decode_ok(Encoding::Windows1252, &[0x93]), "\u{201C}");
    }

    #[test]
    fn decode_windows1251_cyrillic() {
        let wire = [0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
        assert_eq!(decode_ok(Encoding::Windows1251, &wire), "Привет");
    }

    #[test]
    fn decode_shift_jis() {
        let wire = [0x82, 0xB1, 0x82, 0xF1];
        assert_eq!(decode_ok(Encoding::ShiftJis, &wire), "こん");
    }

    // ---- encoding -------------------------------------------------------------------

    #[test]
    fn encode_utf8() {
        let text = "café";
        assert_eq!(encode_ok(Encoding::Utf8, text), text.as_bytes());
    }

    #[test]
    fn encode_ascii_valid() {
        assert_eq!(encode_ok(Encoding::Ascii, "hello"), b"hello");
    }

    #[test]
    fn encode_ascii_invalid() {
        let outcome = Encoding::Ascii.encode("café");
        assert!(outcome.is_err());
    }

    #[test]
    fn encode_latin1() {
        assert_eq!(encode_ok(Encoding::Iso8859_1, "café"), [0x63u8, 0x61, 0x66, 0xE9]);
    }

    #[test]
    fn encode_latin1_out_of_range() {
        let outcome = Encoding::Iso8859_1.encode("Ā");
        assert!(outcome.is_err());
    }

    #[test]
    fn encode_windows1252() {
        assert_eq!(encode_ok(Encoding::Windows1252, "\u{201C}"), [0x93u8]);
    }

    #[test]
    fn encode_windows1251_cyrillic() {
        assert_eq!(
            encode_ok(Encoding::Windows1251, "Привет"),
            [0xCFu8, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2]
        );
    }

    // ---- round trips ----------------------------------------------------------------

    #[test]
    fn decode_encode_roundtrip() {
        let original = b"hello";
        let ascii_compatible = [
            Encoding::Utf8,
            Encoding::Ascii,
            Encoding::Iso8859_1,
            Encoding::Windows1252,
        ];
        for enc in ascii_compatible {
            let text = decode_ok(enc, original);
            let wire = encode_ok(enc, &text);
            assert_eq!(wire.as_slice(), &original[..], "roundtrip failed for {}", enc);
        }
    }

    #[test]
    fn encode_decode_roundtrip_latin1() {
        assert_text_roundtrip(Encoding::Iso8859_1, "café");
    }

    #[test]
    fn encode_decode_roundtrip_windows1251() {
        assert_text_roundtrip(Encoding::Windows1251, "Привет");
    }

    // ---- Cyrillic -------------------------------------------------------------------

    #[test]
    fn decode_koi8r_cyrillic() {
        let wire = [0xF0, 0xD2, 0xC9, 0xD7, 0xC5, 0xD4];
        assert_eq!(decode_ok(Encoding::Koi8R, &wire), "Привет");
    }

    #[test]
    fn encode_koi8r_cyrillic() {
        assert_eq!(
            encode_ok(Encoding::Koi8R, "Привет"),
            [0xF0u8, 0xD2, 0xC9, 0xD7, 0xC5, 0xD4]
        );
    }

    #[test]
    fn decode_koi8u_ukrainian() {
        let wire = [0xEB, 0xC9, 0xA7, 0xD7];
        assert_eq!(decode_ok(Encoding::Koi8U, &wire), "Київ");
    }

    #[test]
    fn decode_iso8859_5_cyrillic() {
        let wire = [0xBC, 0xD8, 0xE0];
        assert_eq!(decode_ok(Encoding::Iso8859_5, &wire), "Мир");
    }

    #[test]
    fn encode_decode_roundtrip_koi8r() {
        assert_text_roundtrip(Encoding::Koi8R, "Здравствуйте");
    }

    // ---- CJK ------------------------------------------------------------------------

    #[test]
    fn decode_gbk_chinese() {
        let wire = [0xC4, 0xE3, 0xBA, 0xC3];
        assert_eq!(decode_ok(Encoding::Gbk, &wire), "你好");
    }

    #[test]
    fn encode_gbk_chinese() {
        assert_eq!(encode_ok(Encoding::Gbk, "你好"), [0xC4u8, 0xE3, 0xBA, 0xC3]);
    }

    #[test]
    fn decode_big5_traditional_chinese() {
        let wire = [0xA5, 0x40, 0xAC, 0xC9];
        assert_eq!(decode_ok(Encoding::Big5, &wire), "世界");
    }

    #[test]
    fn decode_euc_kr_korean() {
        let wire = [0xC7, 0xD1, 0xB1, 0xDB];
        assert_eq!(decode_ok(Encoding::EucKr, &wire), "한글");
    }

    #[test]
    fn decode_euc_jp_japanese() {
        let wire = [0xC6, 0xFC, 0xCB, 0xDC];
        assert_eq!(decode_ok(Encoding::EucJp, &wire), "日本");
    }

    #[test]
    fn encode_decode_roundtrip_shift_jis() {
        assert_text_roundtrip(Encoding::ShiftJis, "東京タワー");
    }

    #[test]
    fn encode_decode_roundtrip_gb18030() {
        assert_text_roundtrip(Encoding::Gb18030, "中文测试");
    }

    #[test]
    fn encode_decode_roundtrip_euc_kr() {
        assert_text_roundtrip(Encoding::EucKr, "서울");
    }

    // ---- name() <-> from_name() -----------------------------------------------------

    #[test]
    fn name_roundtrip() {
        let sample = [
            Encoding::Utf8,
            Encoding::Ascii,
            Encoding::Iso8859_1,
            Encoding::Windows1252,
            Encoding::Windows1251,
            Encoding::Koi8R,
            Encoding::ShiftJis,
            Encoding::EucJp,
            Encoding::Gbk,
            Encoding::Big5,
            Encoding::EucKr,
            Encoding::Utf16Be,
        ];
        for enc in sample {
            let label = enc.name();
            assert_eq!(
                Encoding::from_name(label),
                Some(enc),
                "name roundtrip failed for {:?} (name={})",
                enc,
                label
            );
        }
    }
}