1pub mod codepages;
25
26use crate::error::{Error, Result};
27
/// Text encodings this module can name and decode.
///
/// The enum is `#[repr(u32)]` with every discriminant pinned explicitly, so
/// `encoding as u32` is a stable numeric identifier. Do not renumber or
/// reorder existing variants.
#[repr(u32)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Encoding {
    /// No recognised encoding. Returned by [`Encoding::from_name`] when the
    /// name matches no alias.
    Unknown = 0,
    /// 7-bit US-ASCII.
    UsAscii = 1,
    /// ISO-8859-1 (alias `LATIN1`).
    Iso8859_1 = 2,
    /// ISO-8859-2 (alias `LATIN2`).
    Iso8859_2 = 3,
    /// ISO-8859-3 (alias `LATIN3`).
    Iso8859_3 = 4,
    /// ISO-8859-4 (alias `LATIN4`).
    Iso8859_4 = 5,
    /// ISO-8859-5 (alias `CYRILLIC`).
    Iso8859_5 = 6,
    /// ISO-8859-6 (alias `ARABIC`).
    Iso8859_6 = 7,
    /// ISO-8859-7 (alias `GREEK`).
    Iso8859_7 = 8,
    /// ISO-8859-8 (alias `HEBREW`).
    Iso8859_8 = 9,
    /// ISO-8859-9 (alias `LATIN5`).
    Iso8859_9 = 10,
    /// ISO-8859-10 (alias `LATIN6`).
    Iso8859_10 = 11,
    /// ISO-8859-11 (alias `TIS-620`).
    Iso8859_11 = 12,
    /// ISO-8859-13 (alias `LATIN7`). Note there is no ISO-8859-12 variant, so
    /// from here on the discriminant is no longer "part number + 1".
    Iso8859_13 = 13,
    /// ISO-8859-14 (aliases `LATIN8`, `ISO-CELTIC`).
    Iso8859_14 = 14,
    /// ISO-8859-15 (alias `LATIN9`).
    Iso8859_15 = 15,
    /// ISO-8859-16 (alias `LATIN10`).
    Iso8859_16 = 16,
    /// KOI8-R.
    Koi8R = 17,
    /// ISCII.
    Iscii = 18,
    /// UTF-8.
    Utf8 = 19,
    /// ISO-10646 UCS-2 (16-bit code units).
    Ucs2 = 20,
}
86
87impl Encoding {
88 pub fn from_name(name: &str) -> Self {
93 match name.to_ascii_uppercase().as_str() {
97 "ANSI_X3.4-1968" | "ANSI_X3.4-1986" | "ASCII" | "US-ASCII"
99 | "ISO646-US" | "IBM367" | "US" | "ISO_646.IRV:1991"
100 | "ISO-IR-6" | "CP367" | "CSASCII" => Encoding::UsAscii,
101
102 "ISO_8859-1" | "ISO_8859-1:1987" | "ISO-8859-1" | "ISO-IR-100"
104 | "LATIN1" | "L1" | "IBM819" | "CSISOLATIN1" => Encoding::Iso8859_1,
105
106 "ISO_8859-2" | "ISO_8859-2:1987" | "ISO-8859-2" | "ISO-IR-101"
108 | "LATIN2" | "L2" | "CSISOLATIN2" => Encoding::Iso8859_2,
109
110 "ISO_8859-3" | "ISO_8859-3:1988" | "ISO-8859-3" | "ISO-IR-109"
112 | "LATIN3" | "L3" | "CSISOLATIN3" => Encoding::Iso8859_3,
113
114 "ISO_8859-4" | "ISO_8859-4:1988" | "ISO-8859-4" | "ISO-IR-110"
116 | "LATIN4" | "L4" | "CSISOLATIN4" => Encoding::Iso8859_4,
117
118 "ISO_8859-5" | "ISO_8859-5:1988" | "ISO-8859-5" | "ISO-IR-144"
120 | "CYRILLIC" | "CSISOLATINCYRILLIC" => Encoding::Iso8859_5,
121
122 "ISO_8859-6" | "ISO_8859-6:1987" | "ISO-8859-6" | "ISO-IR-127"
124 | "ECMA-114" | "ASMO-708" | "ARABIC" | "CSISOLATINARABIC"
125 => Encoding::Iso8859_6,
126
127 "ISO_8859-7" | "ISO_8859-7:1987" | "ISO-8859-7" | "ISO-IR-126"
129 | "ECMA-118" | "ELOT_928" | "GREEK" | "GREEK8"
130 | "CSISOLATINGREEK" => Encoding::Iso8859_7,
131
132 "ISO_8859-8" | "ISO_8859-8:1988" | "ISO-8859-8" | "ISO-IR-138"
134 | "HEBREW" | "CSISOLATINHEBREW" => Encoding::Iso8859_8,
135
136 "ISO_8859-9" | "ISO_8859-9:1989" | "ISO-8859-9" | "ISO-IR-148"
138 | "LATIN5" | "L5" | "CSISOLATIN5" => Encoding::Iso8859_9,
139
140 "ISO_8859-10" | "ISO-8859-10" | "ISO-IR-157" | "LATIN6" | "L6"
142 | "CSISOLATIN6" => Encoding::Iso8859_10,
143
144 "ISO_8859-11" | "ISO-8859-11" | "TIS-620" => Encoding::Iso8859_11,
146
147 "ISO_8859-13" | "ISO-8859-13" | "LATIN7" | "L7" => Encoding::Iso8859_13,
149
150 "ISO_8859-14" | "ISO-8859-14" | "ISO-IR-199" | "LATIN8" | "L8"
152 | "ISO-CELTIC" => Encoding::Iso8859_14,
153
154 "ISO_8859-15" | "ISO-8859-15" | "LATIN9" | "LATIN-9" | "LATIN0"
156 => Encoding::Iso8859_15,
157
158 "ISO_8859-16" | "ISO-8859-16" | "ISO-IR-226" | "LATIN10" | "L10"
160 => Encoding::Iso8859_16,
161
162 "KOI8-R" | "CSKOI8R" => Encoding::Koi8R,
164
165 "ISCII" => Encoding::Iscii,
167
168 "UTF-8" | "UTF8" => Encoding::Utf8,
170
171 "ISO-10646-UCS-2" | "UCS-2" | "CSUNICODE" => Encoding::Ucs2,
173
174 _ => Encoding::Unknown,
175 }
176 }
177
178 pub fn is_single_byte(self) -> bool {
180 matches!(
181 self,
182 Encoding::UsAscii
183 | Encoding::Iso8859_1
184 | Encoding::Iso8859_2
185 | Encoding::Iso8859_3
186 | Encoding::Iso8859_4
187 | Encoding::Iso8859_5
188 | Encoding::Iso8859_6
189 | Encoding::Iso8859_7
190 | Encoding::Iso8859_8
191 | Encoding::Iso8859_9
192 | Encoding::Iso8859_10
193 | Encoding::Iso8859_11
194 | Encoding::Iso8859_13
195 | Encoding::Iso8859_14
196 | Encoding::Iso8859_15
197 | Encoding::Iso8859_16
198 | Encoding::Koi8R
199 | Encoding::Iscii
200 )
201 }
202
203 pub fn codepage(self) -> Option<&'static [u16; 128]> {
206 use codepages::*;
207 match self {
208 Encoding::Iso8859_1 => Some(&ISO_8859_1),
209 Encoding::Iso8859_2 => Some(&ISO_8859_2),
210 Encoding::Iso8859_3 => Some(&ISO_8859_3),
211 Encoding::Iso8859_4 => Some(&ISO_8859_4),
212 Encoding::Iso8859_5 => Some(&ISO_8859_5),
213 Encoding::Iso8859_6 => Some(&ISO_8859_6),
214 Encoding::Iso8859_7 => Some(&ISO_8859_7),
215 Encoding::Iso8859_8 => Some(&ISO_8859_8),
216 Encoding::Iso8859_9 => Some(&ISO_8859_9),
217 Encoding::Iso8859_10 => Some(&ISO_8859_10),
218 Encoding::Iso8859_11 => Some(&ISO_8859_11),
219 Encoding::Iso8859_13 => Some(&ISO_8859_13),
220 Encoding::Iso8859_14 => Some(&ISO_8859_14),
221 Encoding::Iso8859_15 => Some(&ISO_8859_15),
222 Encoding::Iso8859_16 => Some(&ISO_8859_16),
223 Encoding::Koi8R => Some(&KOI8_R),
224 Encoding::Iscii => Some(&ISCII),
225 _ => None,
226 }
227 }
228}
229
/// Code point reported for input that cannot be decoded
/// (U+FFFD REPLACEMENT CHARACTER).
pub const REPLACEMENT_CHAR: u32 = 0xFFFD;

/// Decodes one UTF-8 sequence from the front of `buf`.
///
/// Returns `(code_point, bytes_consumed)`.
///
/// # Invalid input
///
/// Each of the following yields `(REPLACEMENT_CHAR, 1)`, i.e. only the
/// offending first byte is skipped so the caller can resynchronise:
///
/// * a continuation byte (`0x80..=0xBF`) where a lead byte is expected;
/// * a lead byte that can never occur in UTF-8 (`0xC0`, `0xC1`, `0xF8..=0xFF`);
/// * a sequence that is truncated or whose continuation bytes are malformed;
/// * an overlong encoding (a 3-byte form of a value below U+0800 or a 4-byte
///   form of a value below U+10000). Previously these decoded silently, so
///   e.g. `C0 80` or `F8 80 80 80` produced NUL.
///
/// A 4-byte sequence that encodes a value above U+10FFFF yields
/// `(REPLACEMENT_CHAR, 4)`.
///
/// # Special cases kept for compatibility
///
/// * A well-formed encoding of U+FFFD itself (`EF BF BD`) is reported as
///   `0x001A`, so a returned [`REPLACEMENT_CHAR`] always means "invalid
///   input" and never "the input literally contained U+FFFD".
/// * Surrogate code points (`0xD800..=0xDFFF`) encoded as three bytes are
///   passed through unchanged; callers that need `char`s must filter them.
///
/// # Panics
///
/// Panics if `buf` is empty.
pub fn utf8_decode_one(buf: &[u8]) -> (u32, usize) {
    // Result for a byte that cannot start or complete a sequence.
    const INVALID: (u32, usize) = (REPLACEMENT_CHAR, 1);

    // True when `b` has the `10xxxxxx` continuation-byte shape.
    fn is_tail(b: u8) -> bool {
        b & 0xC0 == 0x80
    }

    // Low six payload bits of a continuation byte.
    fn tail_bits(b: u8) -> u32 {
        u32::from(b & 0x3F)
    }

    debug_assert!(!buf.is_empty(), "utf8_decode_one called on empty buffer");

    let c0 = buf[0];
    match c0 {
        // 1-byte sequence (ASCII).
        0x00..=0x7F => (u32::from(c0), 1),

        // Continuation byte in lead position.
        0x80..=0xBF => INVALID,

        // 0xC0/0xC1 could only encode values below 0x80, i.e. overlong forms.
        0xC0 | 0xC1 => INVALID,

        // 2-byte sequence.
        0xC2..=0xDF => match buf.get(1) {
            Some(&c1) if is_tail(c1) => ((u32::from(c0 & 0x1F) << 6) | tail_bits(c1), 2),
            _ => INVALID,
        },

        // 3-byte sequence.
        0xE0..=0xEF => {
            if buf.len() < 3 || !is_tail(buf[1]) || !is_tail(buf[2]) {
                return INVALID;
            }
            let cp = (u32::from(c0 & 0x0F) << 12) | (tail_bits(buf[1]) << 6) | tail_bits(buf[2]);
            if cp < 0x800 {
                // Overlong: the value fits in a shorter sequence.
                return INVALID;
            }
            // Keep REPLACEMENT_CHAR unambiguous as the error marker.
            if cp == REPLACEMENT_CHAR {
                (0x001A, 3)
            } else {
                (cp, 3)
            }
        }

        // 4-byte sequence.
        0xF0..=0xF7 => {
            if buf.len() < 4 || !is_tail(buf[1]) || !is_tail(buf[2]) || !is_tail(buf[3]) {
                return INVALID;
            }
            let cp = (u32::from(c0 & 0x07) << 18)
                | (tail_bits(buf[1]) << 12)
                | (tail_bits(buf[2]) << 6)
                | tail_bits(buf[3]);
            if cp < 0x1_0000 {
                // Overlong: the value fits in a shorter sequence.
                INVALID
            } else if cp > 0x10_FFFF {
                // Beyond the Unicode range; the whole sequence is consumed.
                (REPLACEMENT_CHAR, 4)
            } else {
                (cp, 4)
            }
        }

        // Never valid in UTF-8; must not be masked into a 4-byte lead.
        0xF8..=0xFF => INVALID,
    }
}
317
/// Writes the UTF-8 encoding of `cp` to the start of `buf` and returns the
/// number of bytes written (1 to 4).
///
/// The length is chosen purely from the magnitude of `cp`; the value is not
/// validated, so surrogates and values above U+10FFFF are encoded with the
/// same bit layout and do not produce well-formed UTF-8.
///
/// # Panics
///
/// Panics if `buf` is shorter than the encoded length.
pub fn utf8_encode_one(cp: u32, buf: &mut [u8]) -> usize {
    // Continuation byte carrying the six payload bits located at `shift`.
    let tail = |shift: u32| 0x80 | ((cp >> shift) & 0x3F) as u8;

    match cp {
        0x0000..=0x007F => {
            debug_assert!(buf.len() >= 1);
            buf[0] = cp as u8;
            1
        }
        0x0080..=0x07FF => {
            debug_assert!(buf.len() >= 2);
            buf[0] = 0xC0 | (cp >> 6) as u8;
            buf[1] = tail(0);
            2
        }
        0x0800..=0xFFFF => {
            debug_assert!(buf.len() >= 3);
            buf[0] = 0xE0 | (cp >> 12) as u8;
            buf[1] = tail(6);
            buf[2] = tail(0);
            3
        }
        _ => {
            debug_assert!(buf.len() >= 4);
            buf[0] = 0xF0 | (cp >> 18) as u8;
            buf[1] = tail(12);
            buf[2] = tail(6);
            buf[3] = tail(0);
            4
        }
    }
}
351
/// Selects how a [`TextDecoder`] interprets bytes for codepage encodings.
///
/// The mode only affects single-byte codepage encodings; US-ASCII, UTF-8 and
/// UCS-2 are always decoded with their own decoder.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeMode {
    /// Always decode with the configured encoding.
    Strict,
    /// Try each character as UTF-8 first. On the first character that fails
    /// to decode as UTF-8, switch to the configured codepage for that
    /// character and for the rest of the input.
    Auto,
}
365
/// Incremental decoder that turns a borrowed byte buffer into Unicode code
/// points (`u32`) according to an [`Encoding`] and a [`DecodeMode`].
pub struct TextDecoder<'a> {
    // Complete input; never modified.
    buf: &'a [u8],
    // Index into `buf` of the next byte to decode; `pos >= buf.len()` is EOF.
    pos: usize,
    // Encoding used to interpret `buf`.
    encoding: Encoding,
    // Whether codepage encodings are first attempted as UTF-8.
    mode: DecodeMode,
    // Set once `DecodeMode::Auto` has met input that is not valid UTF-8;
    // from then on the codepage decoder is used directly.
    fell_back: bool,
}
382
383impl<'a> TextDecoder<'a> {
384 pub fn new(buf: &'a [u8], encoding: Encoding, mode: DecodeMode) -> Result<Self> {
386 if encoding == Encoding::Unknown {
387 return Err(Error::UnknownTextEncoding(
388 "cannot decode with Encoding::Unknown".to_string(),
389 ));
390 }
391 Ok(TextDecoder {
392 buf,
393 pos: 0,
394 encoding,
395 mode,
396 fell_back: false,
397 })
398 }
399
400 pub fn utf8(buf: &'a [u8]) -> Self {
402 TextDecoder {
403 buf,
404 pos: 0,
405 encoding: Encoding::Utf8,
406 mode: DecodeMode::Strict,
407 fell_back: false,
408 }
409 }
410
411 pub fn is_eof(&self) -> bool {
413 self.pos >= self.buf.len()
414 }
415
416 pub fn remaining(&self) -> &[u8] {
418 &self.buf[self.pos..]
419 }
420
421 pub fn peek(&self) -> Option<u32> {
423 if self.is_eof() {
424 return None;
425 }
426 let mut clone = TextDecoder {
428 buf: self.buf,
429 pos: self.pos,
430 encoding: self.encoding,
431 mode: self.mode,
432 fell_back: self.fell_back,
433 };
434 clone.next_codepoint()
435 }
436
437 pub fn next_codepoint(&mut self) -> Option<u32> {
439 if self.is_eof() {
440 return None;
441 }
442 let cp = self.decode_one();
443 Some(cp)
444 }
445
446 pub fn collect_codepoints(&mut self) -> Vec<u32> {
448 let mut out = Vec::with_capacity(self.buf.len() - self.pos);
449 while let Some(cp) = self.next_codepoint() {
450 if cp == 0 {
451 break; }
453 out.push(cp);
454 }
455 out
456 }
457
458 pub fn decode_to_string(&mut self) -> String {
461 let codepoints = self.collect_codepoints();
462 codepoints
463 .into_iter()
464 .map(|cp| char::from_u32(cp).unwrap_or('\u{FFFD}'))
465 .collect()
466 }
467
468 fn decode_one(&mut self) -> u32 {
471 match self.encoding {
472 Encoding::UsAscii => self.decode_ascii(),
473 Encoding::Utf8 => self.decode_utf8(),
474 Encoding::Ucs2 => self.decode_ucs2(),
475 enc if enc.is_single_byte() => {
476 if self.mode == DecodeMode::Auto && !self.fell_back {
477 self.decode_auto()
478 } else {
479 self.decode_codepage()
480 }
481 }
482 _ => {
483 self.pos += 1;
484 REPLACEMENT_CHAR
485 }
486 }
487 }
488
489 fn decode_ascii(&mut self) -> u32 {
490 let b = self.buf[self.pos];
491 self.pos += 1;
492 if b < 0x80 { b as u32 } else { REPLACEMENT_CHAR }
493 }
494
495 fn decode_utf8(&mut self) -> u32 {
496 let (cp, consumed) = utf8_decode_one(&self.buf[self.pos..]);
497 self.pos += consumed;
498 cp
499 }
500
501 fn decode_codepage(&mut self) -> u32 {
502 let b = self.buf[self.pos];
503 self.pos += 1;
504 if b < 0x80 {
505 b as u32
506 } else if let Some(table) = self.encoding.codepage() {
507 table[(b - 0x80) as usize] as u32
508 } else {
509 REPLACEMENT_CHAR
510 }
511 }
512
513 fn decode_auto(&mut self) -> u32 {
516 let saved_pos = self.pos;
517 let (cp, consumed) = utf8_decode_one(&self.buf[self.pos..]);
518 if cp == REPLACEMENT_CHAR {
519 self.fell_back = true;
521 self.pos = saved_pos;
522 self.decode_codepage()
523 } else {
524 self.pos += consumed;
525 cp
526 }
527 }
528
529 fn decode_ucs2(&mut self) -> u32 {
530 if self.pos + 1 >= self.buf.len() {
531 self.pos = self.buf.len();
532 return REPLACEMENT_CHAR;
533 }
534 let lo = self.buf[self.pos] as u32;
535 let hi = self.buf[self.pos + 1] as u32;
536 self.pos += 2;
537 lo | (hi << 8)
538 }
539}
540
541impl<'a> Iterator for TextDecoder<'a> {
546 type Item = u32;
547
548 fn next(&mut self) -> Option<u32> {
549 if self.is_eof() {
550 return None;
551 }
552 let cp = self.decode_one();
553 if cp == 0 {
554 self.pos = self.buf.len();
556 return None;
557 }
558 Some(cp)
559 }
560}
561
562pub fn decode_utf8_to_string(bytes: &[u8]) -> String {
570 let mut dec = TextDecoder::utf8(bytes);
571 dec.decode_to_string()
572}
573
574pub fn decode_to_string(bytes: &[u8], encoding: Encoding) -> Result<String> {
576 let mut dec = TextDecoder::new(bytes, encoding, DecodeMode::Strict)?;
577 Ok(dec.decode_to_string())
578}
579
// Unit tests for the UTF-8 primitives, encoding-name lookup, codepage tables
// and `TextDecoder`.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- utf8_decode_one ---------------------------------------------------

    // Every byte below 0x80 decodes to itself and consumes one byte.
    #[test]
    fn utf8_decode_ascii_range() {
        for b in 0u8..0x80 {
            let (cp, n) = utf8_decode_one(&[b]);
            assert_eq!(cp, b as u32, "ascii byte 0x{b:02x}");
            assert_eq!(n, 1);
        }
    }

    // C3 A9 is U+00E9.
    #[test]
    fn utf8_decode_two_byte() {
        let (cp, n) = utf8_decode_one(&[0xC3, 0xA9]);
        assert_eq!(cp, 0x00E9);
        assert_eq!(n, 2);
    }

    // E2 82 AC is U+20AC.
    #[test]
    fn utf8_decode_three_byte() {
        let (cp, n) = utf8_decode_one(&[0xE2, 0x82, 0xAC]);
        assert_eq!(cp, 0x20AC);
        assert_eq!(n, 3);
    }

    // F0 9F 98 80 is U+1F600.
    #[test]
    fn utf8_decode_four_byte() {
        let (cp, n) = utf8_decode_one(&[0xF0, 0x9F, 0x98, 0x80]);
        assert_eq!(cp, 0x1F600);
        assert_eq!(n, 4);
    }

    // Despite the name, the input here is a lone continuation byte (0x80),
    // not an overlong encoding; it must be replaced and consume one byte.
    #[test]
    fn utf8_decode_overlong_replacement() {
        let (cp, n) = utf8_decode_one(&[0x80]);
        assert_eq!(cp, REPLACEMENT_CHAR);
        assert_eq!(n, 1);
    }

    // A 2-byte lead followed by a non-continuation byte consumes only the lead.
    #[test]
    fn utf8_decode_bad_continuation() {
        let (cp, n) = utf8_decode_one(&[0xC3, 0x20]);
        assert_eq!(cp, REPLACEMENT_CHAR);
        assert_eq!(n, 1);
    }

    // F4 8F BF BF is the largest code point, U+10FFFF.
    #[test]
    fn utf8_decode_codepoint_max() {
        let (cp, n) = utf8_decode_one(&[0xF4, 0x8F, 0xBF, 0xBF]);
        assert_eq!(cp, 0x10FFFF);
        assert_eq!(n, 4);
    }

    // F4 90 80 80 would be 0x110000, which is out of range. The consumed
    // length is deliberately not checked here.
    #[test]
    fn utf8_decode_above_max_is_replacement() {
        let (cp, _) = utf8_decode_one(&[0xF4, 0x90, 0x80, 0x80]);
        assert_eq!(cp, REPLACEMENT_CHAR);
    }

    // A well-formed encoding of U+FFFD (EF BF BD) is reported as 0x001A.
    #[test]
    fn utf8_decode_iumlaut_half_bug_workaround() {
        let (cp, n) = utf8_decode_one(&[0xEF, 0xBF, 0xBD]);
        assert_eq!(cp, 0x001A, "expected the C workaround U+001A, got 0x{cp:04x}");
        assert_eq!(n, 3);
    }

    // Encodes sample BMP code points with the standard library and checks
    // that `utf8_decode_one` recovers them. 0xFFFD is not in the sample list,
    // so the 0x001A substitution branch below is not exercised by this test.
    #[test]
    fn utf8_roundtrip_bmp() {
        let mut buf = [0u8; 4];
        for cp in [0u32, 0x41, 0xFF, 0x100, 0x7FF, 0x800, 0xFFFE, 0xFFFF] {
            if let Some(ch) = char::from_u32(cp) {
                let s = ch.encode_utf8(&mut buf);
                let (decoded, _) = utf8_decode_one(s.as_bytes());
                let expected = if cp == 0xFFFD { 0x001A } else { cp };
                assert_eq!(decoded, expected, "cp=U+{cp:04X}");
            }
        }
    }

    // ---- utf8_encode_one ---------------------------------------------------

    #[test]
    fn utf8_encode_ascii() {
        let mut buf = [0u8; 4];
        assert_eq!(utf8_encode_one(b'A' as u32, &mut buf), 1);
        assert_eq!(buf[0], b'A');
    }

    // U+00E9 encodes to C3 A9.
    #[test]
    fn utf8_encode_two_byte() {
        let mut buf = [0u8; 4];
        let n = utf8_encode_one(0x00E9, &mut buf);
        assert_eq!(n, 2);
        assert_eq!(&buf[..2], &[0xC3, 0xA9]);
    }

    // U+20AC encodes to E2 82 AC.
    #[test]
    fn utf8_encode_three_byte() {
        let mut buf = [0u8; 4];
        let n = utf8_encode_one(0x20AC, &mut buf);
        assert_eq!(n, 3);
        assert_eq!(&buf[..3], &[0xE2, 0x82, 0xAC]);
    }

    // U+1F600 encodes to F0 9F 98 80.
    #[test]
    fn utf8_encode_four_byte() {
        let mut buf = [0u8; 4];
        let n = utf8_encode_one(0x1F600, &mut buf);
        assert_eq!(n, 4);
        assert_eq!(&buf[..4], &[0xF0, 0x9F, 0x98, 0x80]);
    }

    // ---- Encoding::from_name -----------------------------------------------

    #[test]
    fn encoding_from_name_utf8() {
        assert_eq!(Encoding::from_name("UTF-8"), Encoding::Utf8);
        assert_eq!(Encoding::from_name("UTF8"), Encoding::Utf8);
        // Lookup ignores ASCII case.
        assert_eq!(Encoding::from_name("utf-8"), Encoding::Utf8);
    }

    #[test]
    fn encoding_from_name_ascii_aliases() {
        for alias in &["ASCII", "US-ASCII", "ANSI_X3.4-1968", "IBM367"] {
            assert_eq!(
                Encoding::from_name(alias),
                Encoding::UsAscii,
                "alias: {alias}"
            );
        }
    }

    #[test]
    fn encoding_from_name_latin1_aliases() {
        for alias in &["ISO-8859-1", "ISO_8859-1", "LATIN1", "L1", "IBM819"] {
            assert_eq!(
                Encoding::from_name(alias),
                Encoding::Iso8859_1,
                "alias: {alias}"
            );
        }
    }

    #[test]
    fn encoding_from_name_koi8r() {
        assert_eq!(Encoding::from_name("KOI8-R"), Encoding::Koi8R);
        assert_eq!(Encoding::from_name("CSKOI8R"), Encoding::Koi8R);
    }

    // Unlisted names, including the empty string, map to `Unknown`.
    #[test]
    fn encoding_from_name_unknown() {
        assert_eq!(Encoding::from_name("bogus"), Encoding::Unknown);
        assert_eq!(Encoding::from_name(""), Encoding::Unknown);
        assert_eq!(Encoding::from_name("SHIFT_JIS"),Encoding::Unknown);
    }

    // ---- codepage tables ---------------------------------------------------

    // The ISO-8859-1 table maps every high byte to the same code point.
    #[test]
    fn iso8859_1_is_identity() {
        let table = Encoding::Iso8859_1.codepage().unwrap();
        for i in 0usize..128 {
            assert_eq!(table[i] as usize, i + 0x80, "byte 0x{:02X}", i + 0x80);
        }
    }

    // Byte 0xA4 in ISO-8859-15 is expected to be U+20AC.
    #[test]
    fn iso8859_15_euro_sign() {
        let table = Encoding::Iso8859_15.codepage().unwrap();
        let idx = 0xA4usize - 0x80;
        assert_eq!(table[idx], 0x20AC);
    }

    // NOTE(review): standard KOI8-R (RFC 1489) maps byte 0xC1 to U+0430; this
    // test instead expects U+00C1, which per the assertion message mirrors
    // the table in the C source. Confirm against `codepages::KOI8_R`.
    #[test]
    fn koi8r_sample() {
        let table = Encoding::Koi8R.codepage().unwrap();
        let idx = 0xC1usize - 0x80;
        assert_eq!(table[idx], 0x00C1,
            "espeak-ng KOI8-R table at 0xC1 should be U+00C1 (mirrors C source)");
    }

    // ---- TextDecoder -------------------------------------------------------

    #[test]
    fn text_decoder_utf8_hello() {
        let input = b"hello";
        let codepoints: Vec<u32> = TextDecoder::utf8(input).collect();
        assert_eq!(codepoints, vec![b'h' as u32, b'e' as u32, b'l' as u32,
                                    b'l' as u32, b'o' as u32]);
    }

    #[test]
    fn text_decoder_utf8_multibyte() {
        let input = "café".as_bytes();
        let codepoints: Vec<u32> = TextDecoder::utf8(input).collect();
        assert_eq!(codepoints, vec![b'c' as u32, b'a' as u32, b'f' as u32, 0x00E9]);
    }

    // The iterator treats NUL as end of text; "world" is never yielded.
    #[test]
    fn text_decoder_null_terminates() {
        let input = b"hi\x00world";
        let codepoints: Vec<u32> = TextDecoder::utf8(input).collect();
        assert_eq!(codepoints, vec![b'h' as u32, b'i' as u32]);
    }

    #[test]
    fn text_decoder_iso8859_1() {
        let input = &[0xE9u8];
        let mut dec = TextDecoder::new(input, Encoding::Iso8859_1, DecodeMode::Strict).unwrap();
        let cp = dec.next_codepoint().unwrap();
        assert_eq!(cp, 0x00E9);
    }

    #[test]
    fn text_decoder_iso8859_15_euro() {
        let input = &[0xA4u8];
        let mut dec = TextDecoder::new(input, Encoding::Iso8859_15, DecodeMode::Strict).unwrap();
        let cp = dec.next_codepoint().unwrap();
        assert_eq!(cp, 0x20AC);
    }

    #[test]
    fn text_decoder_ascii_rejects_high_bytes() {
        let input = &[0x80u8];
        let mut dec = TextDecoder::new(input, Encoding::UsAscii, DecodeMode::Strict).unwrap();
        let cp = dec.next_codepoint().unwrap();
        assert_eq!(cp, REPLACEMENT_CHAR);
    }

    // Pure-ASCII input is valid UTF-8, so auto mode never falls back.
    #[test]
    fn text_decoder_auto_mode_utf8_first() {
        let mut dec = TextDecoder::new(
            b"hi",
            Encoding::Iso8859_1,
            DecodeMode::Auto,
        ).unwrap();
        assert_eq!(dec.next_codepoint(), Some(b'h' as u32));
        assert_eq!(dec.next_codepoint(), Some(b'i' as u32));
        assert!(!dec.fell_back, "should not have fallen back");
    }

    // A lone 0xA4 is not valid UTF-8, so auto mode falls back to the
    // ISO-8859-15 table for it.
    #[test]
    fn text_decoder_auto_mode_fallback_on_bad_utf8() {
        let mut dec = TextDecoder::new(
            &[0xA4u8],
            Encoding::Iso8859_15,
            DecodeMode::Auto,
        ).unwrap();
        let cp = dec.next_codepoint().unwrap();
        assert_eq!(cp, 0x20AC, "expected euro sign U+20AC");
        assert!(dec.fell_back, "should have fallen back to codepage");
    }

    // UCS-2 input is read as little-endian 16-bit units.
    #[test]
    fn text_decoder_ucs2_hello() {
        let input = &[0x48u8, 0x00, 0x69, 0x00];
        let codepoints: Vec<u32> = TextDecoder::new(input, Encoding::Ucs2, DecodeMode::Strict)
            .unwrap()
            .collect();
        assert_eq!(codepoints, vec![b'H' as u32, b'i' as u32]);
    }

    #[test]
    fn text_decoder_eof_flag() {
        let mut dec = TextDecoder::utf8(b"x");
        assert!(!dec.is_eof());
        dec.next_codepoint();
        assert!(dec.is_eof());
    }

    // ---- string helpers ----------------------------------------------------

    #[test]
    fn decode_utf8_to_string_emoji() {
        let s = "😀 world";
        let decoded = decode_utf8_to_string(s.as_bytes());
        assert_eq!(decoded, s);
    }

    #[test]
    fn decode_to_string_iso8859_1_cafe() {
        let input = b"caf\xE9";
        let s = decode_to_string(input, Encoding::Iso8859_1).unwrap();
        assert_eq!(s, "café");
    }

    #[test]
    fn decoder_error_on_unknown_encoding() {
        let result = TextDecoder::new(b"x", Encoding::Unknown, DecodeMode::Strict);
        assert!(result.is_err());
    }
}