Skip to main content

arena_terms/
encoding.rs

1use crate::TermError;
2
/// Input encoding for term data.
///
/// Determines how raw bytes are transcoded to/from UTF-8 when
/// producing or consuming string values (atoms, variables, strings, etc.).
/// Binary content (`bin{...}`) always uses raw bytes regardless of encoding.
///
/// Encoding names follow the WHATWG Encoding Standard / IANA charset names
/// (the same names accepted by `encoding_rs` and HTTP `Content-Type` headers).
///
/// The default encoding is [`Encoding::Utf8`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Encoding {
    #[default]
    Utf8,
    Ascii,
    /// True ISO-8859-1 (Latin-1): bytes 0x80-0x9F map to Unicode control characters.
    ///
    /// This differs from Windows-1252, where 0x80-0x9F map to printable characters
    /// (€, „, “, etc.). Use [`Encoding::Windows1252`] if you need the Windows/WHATWG
    /// behavior.
    Iso8859_1,

    // Western
    Windows1252,
    Iso8859_15,
    Macintosh,

    // Central European
    Iso8859_2,
    Windows1250,

    // South European / Turkish
    Iso8859_3,
    /// ISO-8859-9 is identical to Windows-1254 (WHATWG merges them).
    ///
    /// Both are kept as separate variants for naming convenience.
    Iso8859_9,
    Windows1254,

    // North European / Baltic
    Iso8859_4,
    Iso8859_10,
    Iso8859_13,
    Windows1257,

    // Celtic
    Iso8859_14,

    // Romanian
    Iso8859_16,

    // Cyrillic
    Iso8859_5,
    Windows1251,
    Koi8R,
    Koi8U,
    Ibm866,
    XMacCyrillic,

    // Greek
    Iso8859_7,
    Windows1253,

    // Hebrew
    Iso8859_8,
    Iso8859_8I,
    Windows1255,

    // Arabic
    Iso8859_6,
    Windows1256,

    // Vietnamese
    Windows1258,

    // Thai
    Windows874,

    // Japanese
    ShiftJis,
    EucJp,
    Iso2022Jp,

    // Chinese
    Gbk,
    Gb18030,
    Big5,

    // Korean
    EucKr,

    // Unicode
    Utf16Be,
    Utf16Le,
}
77
78impl Encoding {
79    pub const ALL: &[Encoding] = &[
80        Self::Utf8, Self::Ascii,
81        Self::Iso8859_1, Self::Iso8859_2, Self::Iso8859_3, Self::Iso8859_4,
82        Self::Iso8859_5, Self::Iso8859_6, Self::Iso8859_7, Self::Iso8859_8,
83        Self::Iso8859_8I, Self::Iso8859_9, Self::Iso8859_10, Self::Iso8859_13,
84        Self::Iso8859_14, Self::Iso8859_15, Self::Iso8859_16,
85        Self::Windows874, Self::Windows1250, Self::Windows1251, Self::Windows1252,
86        Self::Windows1253, Self::Windows1254, Self::Windows1255, Self::Windows1256,
87        Self::Windows1257, Self::Windows1258,
88        Self::Koi8R, Self::Koi8U, Self::Ibm866,
89        Self::Macintosh, Self::XMacCyrillic,
90        Self::ShiftJis, Self::EucJp, Self::Iso2022Jp,
91        Self::Gbk, Self::Gb18030, Self::Big5,
92        Self::EucKr,
93        Self::Utf16Be, Self::Utf16Le,
94    ];
95
96    pub fn from_name(name: &str) -> Option<Self> {
97        let lower = name.trim().to_ascii_lowercase();
98        match lower.as_str() {
99            "utf-8" | "utf8" => Some(Self::Utf8),
100            "us-ascii" | "ascii" | "iso-ir-6" => Some(Self::Ascii),
101            "iso-8859-1" | "latin1" | "latin-1" | "iso_8859-1" | "l1" => Some(Self::Iso8859_1),
102            _ => {
103                let enc_rs = encoding_rs::Encoding::for_label(lower.as_bytes())?;
104                Self::from_encoding_rs(enc_rs)
105            }
106        }
107    }
108
109    /// Returns the canonical name for this encoding (lowercase).
110    pub fn name(&self) -> &'static str {
111        match self {
112            Self::Utf8 => "utf-8",
113            Self::Ascii => "us-ascii",
114            Self::Iso8859_1 => "iso-8859-1",
115            Self::Iso8859_2 => "iso-8859-2",
116            Self::Iso8859_3 => "iso-8859-3",
117            Self::Iso8859_4 => "iso-8859-4",
118            Self::Iso8859_5 => "iso-8859-5",
119            Self::Iso8859_6 => "iso-8859-6",
120            Self::Iso8859_7 => "iso-8859-7",
121            Self::Iso8859_8 => "iso-8859-8",
122            Self::Iso8859_8I => "iso-8859-8-i",
123            Self::Iso8859_9 => "iso-8859-9",
124            Self::Iso8859_10 => "iso-8859-10",
125            Self::Iso8859_13 => "iso-8859-13",
126            Self::Iso8859_14 => "iso-8859-14",
127            Self::Iso8859_15 => "iso-8859-15",
128            Self::Iso8859_16 => "iso-8859-16",
129            Self::Windows874 => "windows-874",
130            Self::Windows1250 => "windows-1250",
131            Self::Windows1251 => "windows-1251",
132            Self::Windows1252 => "windows-1252",
133            Self::Windows1253 => "windows-1253",
134            Self::Windows1254 => "windows-1254",
135            Self::Windows1255 => "windows-1255",
136            Self::Windows1256 => "windows-1256",
137            Self::Windows1257 => "windows-1257",
138            Self::Windows1258 => "windows-1258",
139            Self::Koi8R => "koi8-r",
140            Self::Koi8U => "koi8-u",
141            Self::Ibm866 => "ibm866",
142            Self::Macintosh => "macintosh",
143            Self::XMacCyrillic => "x-mac-cyrillic",
144            Self::ShiftJis => "shift_jis",
145            Self::EucJp => "euc-jp",
146            Self::Iso2022Jp => "iso-2022-jp",
147            Self::Gbk => "gbk",
148            Self::Gb18030 => "gb18030",
149            Self::Big5 => "big5",
150            Self::EucKr => "euc-kr",
151            Self::Utf16Be => "utf-16be",
152            Self::Utf16Le => "utf-16le",
153        }
154    }
155
156    /// Returns the `encoding_rs::Encoding` for variants that delegate to it.
157    fn to_encoding_rs(&self) -> &'static encoding_rs::Encoding {
158        match self {
159            Self::Utf8 => encoding_rs::UTF_8,
160            Self::Ascii | Self::Iso8859_1 => encoding_rs::WINDOWS_1252,
161            Self::Windows1252 => encoding_rs::WINDOWS_1252,
162            Self::Iso8859_15 => encoding_rs::ISO_8859_15,
163            Self::Macintosh => encoding_rs::MACINTOSH,
164            Self::Iso8859_2 => encoding_rs::ISO_8859_2,
165            Self::Windows1250 => encoding_rs::WINDOWS_1250,
166            Self::Iso8859_3 => encoding_rs::ISO_8859_3,
167            Self::Iso8859_9 => encoding_rs::WINDOWS_1254,
168            Self::Windows1254 => encoding_rs::WINDOWS_1254,
169            Self::Iso8859_4 => encoding_rs::ISO_8859_4,
170            Self::Iso8859_10 => encoding_rs::ISO_8859_10,
171            Self::Iso8859_13 => encoding_rs::ISO_8859_13,
172            Self::Windows1257 => encoding_rs::WINDOWS_1257,
173            Self::Iso8859_14 => encoding_rs::ISO_8859_14,
174            Self::Iso8859_16 => encoding_rs::ISO_8859_16,
175            Self::Iso8859_5 => encoding_rs::ISO_8859_5,
176            Self::Windows1251 => encoding_rs::WINDOWS_1251,
177            Self::Koi8R => encoding_rs::KOI8_R,
178            Self::Koi8U => encoding_rs::KOI8_U,
179            Self::Ibm866 => encoding_rs::IBM866,
180            Self::XMacCyrillic => encoding_rs::X_MAC_CYRILLIC,
181            Self::Iso8859_7 => encoding_rs::ISO_8859_7,
182            Self::Windows1253 => encoding_rs::WINDOWS_1253,
183            Self::Iso8859_8 => encoding_rs::ISO_8859_8,
184            Self::Iso8859_8I => encoding_rs::ISO_8859_8_I,
185            Self::Windows1255 => encoding_rs::WINDOWS_1255,
186            Self::Iso8859_6 => encoding_rs::ISO_8859_6,
187            Self::Windows1256 => encoding_rs::WINDOWS_1256,
188            Self::Windows1258 => encoding_rs::WINDOWS_1258,
189            Self::Windows874 => encoding_rs::WINDOWS_874,
190            Self::ShiftJis => encoding_rs::SHIFT_JIS,
191            Self::EucJp => encoding_rs::EUC_JP,
192            Self::Iso2022Jp => encoding_rs::ISO_2022_JP,
193            Self::Gbk => encoding_rs::GBK,
194            Self::Gb18030 => encoding_rs::GB18030,
195            Self::Big5 => encoding_rs::BIG5,
196            Self::EucKr => encoding_rs::EUC_KR,
197            Self::Utf16Be => encoding_rs::UTF_16BE,
198            Self::Utf16Le => encoding_rs::UTF_16LE,
199        }
200    }
201
202    /// Maps an `encoding_rs::Encoding` to our enum.
203    fn from_encoding_rs(enc: &'static encoding_rs::Encoding) -> Option<Self> {
204        Some(match enc {
205            e if e == encoding_rs::UTF_8 => Self::Utf8,
206            e if e == encoding_rs::WINDOWS_1252 => Self::Windows1252,
207            e if e == encoding_rs::ISO_8859_15 => Self::Iso8859_15,
208            e if e == encoding_rs::MACINTOSH => Self::Macintosh,
209            e if e == encoding_rs::ISO_8859_2 => Self::Iso8859_2,
210            e if e == encoding_rs::WINDOWS_1250 => Self::Windows1250,
211            e if e == encoding_rs::ISO_8859_3 => Self::Iso8859_3,
212            e if e == encoding_rs::WINDOWS_1254 => Self::Windows1254,
213            e if e == encoding_rs::ISO_8859_4 => Self::Iso8859_4,
214            e if e == encoding_rs::ISO_8859_10 => Self::Iso8859_10,
215            e if e == encoding_rs::ISO_8859_13 => Self::Iso8859_13,
216            e if e == encoding_rs::WINDOWS_1257 => Self::Windows1257,
217            e if e == encoding_rs::ISO_8859_14 => Self::Iso8859_14,
218            e if e == encoding_rs::ISO_8859_16 => Self::Iso8859_16,
219            e if e == encoding_rs::ISO_8859_5 => Self::Iso8859_5,
220            e if e == encoding_rs::WINDOWS_1251 => Self::Windows1251,
221            e if e == encoding_rs::KOI8_R => Self::Koi8R,
222            e if e == encoding_rs::KOI8_U => Self::Koi8U,
223            e if e == encoding_rs::IBM866 => Self::Ibm866,
224            e if e == encoding_rs::X_MAC_CYRILLIC => Self::XMacCyrillic,
225            e if e == encoding_rs::ISO_8859_7 => Self::Iso8859_7,
226            e if e == encoding_rs::WINDOWS_1253 => Self::Windows1253,
227            e if e == encoding_rs::ISO_8859_8 => Self::Iso8859_8,
228            e if e == encoding_rs::ISO_8859_8_I => Self::Iso8859_8I,
229            e if e == encoding_rs::WINDOWS_1255 => Self::Windows1255,
230            e if e == encoding_rs::ISO_8859_6 => Self::Iso8859_6,
231            e if e == encoding_rs::WINDOWS_1256 => Self::Windows1256,
232            e if e == encoding_rs::WINDOWS_1258 => Self::Windows1258,
233            e if e == encoding_rs::WINDOWS_874 => Self::Windows874,
234            e if e == encoding_rs::SHIFT_JIS => Self::ShiftJis,
235            e if e == encoding_rs::EUC_JP => Self::EucJp,
236            e if e == encoding_rs::ISO_2022_JP => Self::Iso2022Jp,
237            e if e == encoding_rs::GBK => Self::Gbk,
238            e if e == encoding_rs::GB18030 => Self::Gb18030,
239            e if e == encoding_rs::BIG5 => Self::Big5,
240            e if e == encoding_rs::EUC_KR => Self::EucKr,
241            e if e == encoding_rs::UTF_16BE => Self::Utf16Be,
242            e if e == encoding_rs::UTF_16LE => Self::Utf16Le,
243            _ => return None,
244        })
245    }
246
247    /// Decodes a byte slice from this encoding into a UTF-8 string.
248    ///
249    /// # Examples
250    /// ```
251    /// # use arena_terms::Encoding;
252    /// let s = Encoding::Iso8859_1.decode(&[0x63, 0x61, 0x66, 0xE9]).unwrap();
253    /// assert_eq!(s, "café");
254    /// ```
255    pub fn decode(&self, bytes: &[u8]) -> Result<String, TermError> {
256        match self {
257            Self::Utf8 => {
258                let s = std::str::from_utf8(bytes)
259                    .map_err(|e| TermError::Encoding(e.to_string().into()))?;
260                Ok(s.to_owned())
261            }
262            Self::Ascii => {
263                if let Some(pos) = bytes.iter().position(|&b| b > 127) {
264                    return Err(TermError::Encoding(
265                        format!("non-ASCII byte 0x{:02X} at offset {}", bytes[pos], pos).into(),
266                    ));
267                }
268                Ok(unsafe { String::from_utf8_unchecked(bytes.to_vec()) })
269            }
270            Self::Iso8859_1 => {
271                let mut out = String::with_capacity(bytes.len());
272                for &b in bytes {
273                    out.push(b as char);
274                }
275                Ok(out)
276            }
277            _ => {
278                let (cow, _, had_errors) = self.to_encoding_rs().decode(bytes);
279                if had_errors {
280                    return Err(TermError::Encoding(
281                        format!("invalid {} sequence", self.name()).into(),
282                    ));
283                }
284                Ok(cow.into_owned())
285            }
286        }
287    }
288
289    /// Encodes a UTF-8 string into bytes in this encoding.
290    ///
291    /// # Examples
292    /// ```
293    /// # use arena_terms::Encoding;
294    /// let bytes = Encoding::Iso8859_1.encode("café").unwrap();
295    /// assert_eq!(bytes, vec![0x63, 0x61, 0x66, 0xE9]);
296    /// ```
297    pub fn encode(&self, s: &str) -> Result<Vec<u8>, TermError> {
298        match self {
299            Self::Utf8 => Ok(s.as_bytes().to_vec()),
300            Self::Ascii => {
301                if let Some(ch) = s.chars().find(|c| !c.is_ascii()) {
302                    return Err(TermError::Encoding(
303                        format!("non-ASCII character '{}' (U+{:04X})", ch, ch as u32).into(),
304                    ));
305                }
306                Ok(s.as_bytes().to_vec())
307            }
308            Self::Iso8859_1 => {
309                let mut out = Vec::with_capacity(s.len());
310                for ch in s.chars() {
311                    let cp = ch as u32;
312                    if cp > 0xFF {
313                        return Err(TermError::Encoding(
314                            format!(
315                                "character '{}' (U+{:04X}) not representable in iso-8859-1",
316                                ch, cp
317                            )
318                            .into(),
319                        ));
320                    }
321                    out.push(cp as u8);
322                }
323                Ok(out)
324            }
325            _ => {
326                let (cow, _, had_errors) = self.to_encoding_rs().encode(s);
327                if had_errors {
328                    return Err(TermError::Encoding(
329                        format!("string not representable in {}", self.name()).into(),
330                    ));
331                }
332                Ok(cow.into_owned())
333            }
334        }
335    }
336}
337
338impl std::fmt::Display for Encoding {
339    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
340        f.write_str(self.name())
341    }
342}
343
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes `bytes` with `enc`, panicking if decoding fails.
    fn decoded(enc: Encoding, bytes: &[u8]) -> String {
        enc.decode(bytes).unwrap()
    }

    /// Encodes `text` with `enc`, panicking if encoding fails.
    fn encoded(enc: Encoding, text: &str) -> Vec<u8> {
        enc.encode(text).unwrap()
    }

    /// Asserts that `text` survives an encode-then-decode cycle in `enc`.
    fn assert_text_roundtrip(enc: Encoding, text: &str) {
        let bytes = encoded(enc, text);
        assert_eq!(decoded(enc, &bytes), text);
    }

    /// Asserts that each label resolves to the expected encoding.
    fn assert_labels(cases: &[(&str, Option<Encoding>)]) {
        for &(label, expected) in cases {
            assert_eq!(Encoding::from_name(label), expected, "label {:?}", label);
        }
    }

    // -- Name lookup --

    #[test]
    fn from_name_case_insensitive() {
        assert_labels(&[
            ("UTF-8", Some(Encoding::Utf8)),
            ("utf8", Some(Encoding::Utf8)),
            ("ISO-8859-1", Some(Encoding::Iso8859_1)),
            ("latin1", Some(Encoding::Iso8859_1)),
            ("Windows-1252", Some(Encoding::Windows1252)),
            ("us-ascii", Some(Encoding::Ascii)),
            ("unknown", None),
        ]);
    }

    #[test]
    fn from_name_encoding_rs_labels() {
        assert_labels(&[
            ("shift_jis", Some(Encoding::ShiftJis)),
            ("euc-jp", Some(Encoding::EucJp)),
            ("gbk", Some(Encoding::Gbk)),
            ("big5", Some(Encoding::Big5)),
            ("koi8-r", Some(Encoding::Koi8R)),
            ("iso-8859-2", Some(Encoding::Iso8859_2)),
            ("windows-1251", Some(Encoding::Windows1251)),
            ("utf-16le", Some(Encoding::Utf16Le)),
            ("utf-16be", Some(Encoding::Utf16Be)),
        ]);
    }

    // -- Decoding: UTF-8 / ASCII / Latin-1 / Windows --

    #[test]
    fn decode_utf8_valid() {
        assert_eq!(decoded(Encoding::Utf8, "café".as_bytes()), "café");
    }

    #[test]
    fn decode_utf8_invalid() {
        assert!(Encoding::Utf8.decode(&[0xFF, 0xFE]).is_err());
    }

    #[test]
    fn decode_ascii_valid() {
        assert_eq!(decoded(Encoding::Ascii, b"hello"), "hello");
    }

    #[test]
    fn decode_ascii_invalid() {
        assert!(Encoding::Ascii.decode(&[0x80]).is_err());
    }

    #[test]
    fn decode_latin1() {
        assert_eq!(decoded(Encoding::Iso8859_1, &[0x63, 0x61, 0x66, 0xE9]), "café");
    }

    #[test]
    fn decode_latin1_full_range() {
        let every_byte: Vec<u8> = (0u8..=255).collect();
        let text = decoded(Encoding::Iso8859_1, &every_byte);
        assert_eq!(text.chars().count(), 256);
        assert_eq!(text.chars().last(), Some('\u{FF}'));
    }

    #[test]
    fn decode_windows1252() {
        assert_eq!(decoded(Encoding::Windows1252, &[0x93]), "\u{201C}");
    }

    #[test]
    fn decode_windows1251_cyrillic() {
        // "Привет" in Windows-1251
        let hello = [0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2];
        assert_eq!(decoded(Encoding::Windows1251, &hello), "Привет");
    }

    #[test]
    fn decode_shift_jis() {
        // "こん" in Shift_JIS
        let kon = [0x82, 0xB1, 0x82, 0xF1];
        assert_eq!(decoded(Encoding::ShiftJis, &kon), "こん");
    }

    // -- Encoding: UTF-8 / ASCII / Latin-1 / Windows --

    #[test]
    fn encode_utf8() {
        assert_eq!(encoded(Encoding::Utf8, "café"), "café".as_bytes());
    }

    #[test]
    fn encode_ascii_valid() {
        assert_eq!(encoded(Encoding::Ascii, "hello"), b"hello");
    }

    #[test]
    fn encode_ascii_invalid() {
        assert!(Encoding::Ascii.encode("café").is_err());
    }

    #[test]
    fn encode_latin1() {
        assert_eq!(encoded(Encoding::Iso8859_1, "café"), vec![0x63, 0x61, 0x66, 0xE9]);
    }

    #[test]
    fn encode_latin1_out_of_range() {
        assert!(Encoding::Iso8859_1.encode("Ā").is_err());
    }

    #[test]
    fn encode_windows1252() {
        assert_eq!(encoded(Encoding::Windows1252, "\u{201C}"), vec![0x93]);
    }

    #[test]
    fn encode_windows1251_cyrillic() {
        assert_eq!(
            encoded(Encoding::Windows1251, "Привет"),
            vec![0xCF, 0xF0, 0xE8, 0xE2, 0xE5, 0xF2]
        );
    }

    // -- Round trips --

    #[test]
    fn decode_encode_roundtrip() {
        let original = b"hello";
        for enc in [Encoding::Utf8, Encoding::Ascii, Encoding::Iso8859_1, Encoding::Windows1252] {
            let text = decoded(enc, original);
            let bytes = encoded(enc, &text);
            assert_eq!(&bytes, original, "roundtrip failed for {}", enc);
        }
    }

    #[test]
    fn encode_decode_roundtrip_latin1() {
        assert_text_roundtrip(Encoding::Iso8859_1, "café");
    }

    #[test]
    fn encode_decode_roundtrip_windows1251() {
        assert_text_roundtrip(Encoding::Windows1251, "Привет");
    }

    // -- Cyrillic encodings --

    #[test]
    fn decode_koi8r_cyrillic() {
        // "Привет" in KOI8-R
        let hello = [0xF0, 0xD2, 0xC9, 0xD7, 0xC5, 0xD4];
        assert_eq!(decoded(Encoding::Koi8R, &hello), "Привет");
    }

    #[test]
    fn encode_koi8r_cyrillic() {
        assert_eq!(
            encoded(Encoding::Koi8R, "Привет"),
            vec![0xF0, 0xD2, 0xC9, 0xD7, 0xC5, 0xD4]
        );
    }

    #[test]
    fn decode_koi8u_ukrainian() {
        // "Київ" in KOI8-U
        let kyiv = [0xEB, 0xC9, 0xA7, 0xD7];
        assert_eq!(decoded(Encoding::Koi8U, &kyiv), "Київ");
    }

    #[test]
    fn decode_iso8859_5_cyrillic() {
        // "Мир" in ISO-8859-5: М=0xBC, и=0xD8, р=0xE0
        let mir = [0xBC, 0xD8, 0xE0];
        assert_eq!(decoded(Encoding::Iso8859_5, &mir), "Мир");
    }

    #[test]
    fn encode_decode_roundtrip_koi8r() {
        assert_text_roundtrip(Encoding::Koi8R, "Здравствуйте");
    }

    // -- CJK encodings --

    #[test]
    fn decode_gbk_chinese() {
        // "你好" in GBK: 0xC4E3 0xBAC3
        let ni_hao = [0xC4, 0xE3, 0xBA, 0xC3];
        assert_eq!(decoded(Encoding::Gbk, &ni_hao), "你好");
    }

    #[test]
    fn encode_gbk_chinese() {
        assert_eq!(encoded(Encoding::Gbk, "你好"), vec![0xC4, 0xE3, 0xBA, 0xC3]);
    }

    #[test]
    fn decode_big5_traditional_chinese() {
        // "世界" in Big5: 0xA5 0x40 0xAC 0xC9
        let world = [0xA5, 0x40, 0xAC, 0xC9];
        assert_eq!(decoded(Encoding::Big5, &world), "世界");
    }

    #[test]
    fn decode_euc_kr_korean() {
        // "한글" in EUC-KR: 0xC7 0xD1 0xB1 0xDB
        let hangul = [0xC7, 0xD1, 0xB1, 0xDB];
        assert_eq!(decoded(Encoding::EucKr, &hangul), "한글");
    }

    #[test]
    fn decode_euc_jp_japanese() {
        // "日本" in EUC-JP: 0xC6 0xFC 0xCB 0xDC
        let nihon = [0xC6, 0xFC, 0xCB, 0xDC];
        assert_eq!(decoded(Encoding::EucJp, &nihon), "日本");
    }

    #[test]
    fn encode_decode_roundtrip_shift_jis() {
        assert_text_roundtrip(Encoding::ShiftJis, "東京タワー");
    }

    #[test]
    fn encode_decode_roundtrip_gb18030() {
        assert_text_roundtrip(Encoding::Gb18030, "中文测试");
    }

    #[test]
    fn encode_decode_roundtrip_euc_kr() {
        assert_text_roundtrip(Encoding::EucKr, "서울");
    }

    // -- Canonical names --

    #[test]
    fn name_roundtrip() {
        let sample = [
            Encoding::Utf8,
            Encoding::Ascii,
            Encoding::Iso8859_1,
            Encoding::Windows1252,
            Encoding::Windows1251,
            Encoding::Koi8R,
            Encoding::ShiftJis,
            Encoding::EucJp,
            Encoding::Gbk,
            Encoding::Big5,
            Encoding::EucKr,
            Encoding::Utf16Be,
        ];
        for enc in sample {
            let label = enc.name();
            assert_eq!(
                Encoding::from_name(label),
                Some(enc),
                "name roundtrip failed for {:?} (name={})",
                enc,
                label
            );
        }
    }
}