espeak_ng/encoding/
mod.rs

1//! Text encoding detection and decoding.
2//!
3//! Rust port of `encoding.c` and `encoding.h`.
4//!
5//! The C original uses a vtable of function pointers (`espeak_ng_TEXT_DECODER`).
6//! Here we use an enum + method dispatch instead, which is idiomatic Rust and
7//! avoids `unsafe`.
8//!
//! # Supported encodings
//! UTF-8, US-ASCII, ISO-8859-1 through ISO-8859-16 (there is no ISO-8859-12),
//! KOI8-R, ISCII and little-endian UCS-2.
11//!
12//! # Example
13//! ```rust
14//! use espeak_ng::encoding::{Encoding, TextDecoder, DecodeMode};
15//!
16//! let enc = Encoding::from_name("UTF-8");
17//! assert_eq!(enc, Encoding::Utf8);
18//!
19//! let mut dec = TextDecoder::utf8("héllo".as_bytes());
20//! let codepoints = dec.collect_codepoints();
21//! assert_eq!(codepoints[0], 'h' as u32);
22//! ```
23
24pub mod codepages;
25
26use crate::error::{Error, Result};
27
28// ---------------------------------------------------------------------------
29// Encoding enum
30// ---------------------------------------------------------------------------
31
/// Text encodings understood by this module — the Rust counterpart of
/// `espeak_ng_ENCODING` in `encoding.h`.
///
/// The enum is `#[repr(u32)]` and every variant has an explicit discriminant,
/// kept in the same order as the C enum, so `encoding as u32` produces the
/// value used on the C side (e.g. in a binary data file).  Rust has no
/// integer-to-enum `as` cast, so turning a raw integer back into an
/// `Encoding` needs an explicit, checked mapping.
///
/// Resolve an IANA/MIME charset name with [`Encoding::from_name`] and decode
/// byte streams with [`TextDecoder`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
#[allow(missing_docs)] // variant names match standard ISO/IANA charset names
pub enum Encoding {
    /// The encoding could not be identified.
    Unknown = 0,
    /// US-ASCII (7-bit).
    UsAscii = 1,
    /// ISO-8859-1, a.k.a. Latin-1 (Western European).
    Iso8859_1 = 2,
    /// ISO-8859-2, a.k.a. Latin-2 (Central European).
    Iso8859_2 = 3,
    /// ISO-8859-3, a.k.a. Latin-3 (South European).
    Iso8859_3 = 4,
    /// ISO-8859-4, a.k.a. Latin-4 (North European).
    Iso8859_4 = 5,
    /// ISO-8859-5 (Cyrillic).
    Iso8859_5 = 6,
    /// ISO-8859-6 (Arabic).
    Iso8859_6 = 7,
    /// ISO-8859-7 (Greek).
    Iso8859_7 = 8,
    /// ISO-8859-8 (Hebrew).
    Iso8859_8 = 9,
    /// ISO-8859-9, a.k.a. Latin-5 (Turkish).
    Iso8859_9 = 10,
    /// ISO-8859-10, a.k.a. Latin-6 (Nordic).
    Iso8859_10 = 11,
    /// ISO-8859-11 (Thai).
    Iso8859_11 = 12,
    /// ISO-8859-13, a.k.a. Latin-7 (Baltic Rim).  There is no ISO-8859-12, so
    /// this variant directly follows ISO-8859-11.
    Iso8859_13 = 13,
    /// ISO-8859-14, a.k.a. Latin-8 (Celtic).
    Iso8859_14 = 14,
    /// ISO-8859-15, a.k.a. Latin-9 (Western European with €).
    Iso8859_15 = 15,
    /// ISO-8859-16, a.k.a. Latin-10 (South-Eastern European).
    Iso8859_16 = 16,
    /// KOI8-R (Russian Cyrillic).
    Koi8R = 17,
    /// ISCII (Indian scripts).
    Iscii = 18,
    /// UTF-8.
    Utf8 = 19,
    /// Little-endian UCS-2 (16-bit Unicode without surrogate handling).
    Ucs2 = 20,
}
86
87impl Encoding {
88    /// Resolve an encoding from its IANA/MIME name.
89    ///
90    /// Mirrors `espeak_ng_EncodingFromName()` in encoding.c, which uses the
91    /// `mnem_encoding[]` table.  Comparison is case-insensitive.
92    pub fn from_name(name: &str) -> Self {
93        // The C table is huge; we implement the same look-up as a match on
94        // normalised names.  The canonical names come from:
95        //   http://www.iana.org/assignments/character-sets/character-sets.xhtml
96        match name.to_ascii_uppercase().as_str() {
97            // US-ASCII aliases
98            "ANSI_X3.4-1968" | "ANSI_X3.4-1986" | "ASCII" | "US-ASCII"
99            | "ISO646-US" | "IBM367" | "US" | "ISO_646.IRV:1991"
100            | "ISO-IR-6" | "CP367" | "CSASCII" => Encoding::UsAscii,
101
102            // ISO-8859-1
103            "ISO_8859-1" | "ISO_8859-1:1987" | "ISO-8859-1" | "ISO-IR-100"
104            | "LATIN1" | "L1" | "IBM819" | "CSISOLATIN1" => Encoding::Iso8859_1,
105
106            // ISO-8859-2
107            "ISO_8859-2" | "ISO_8859-2:1987" | "ISO-8859-2" | "ISO-IR-101"
108            | "LATIN2" | "L2" | "CSISOLATIN2" => Encoding::Iso8859_2,
109
110            // ISO-8859-3
111            "ISO_8859-3" | "ISO_8859-3:1988" | "ISO-8859-3" | "ISO-IR-109"
112            | "LATIN3" | "L3" | "CSISOLATIN3" => Encoding::Iso8859_3,
113
114            // ISO-8859-4
115            "ISO_8859-4" | "ISO_8859-4:1988" | "ISO-8859-4" | "ISO-IR-110"
116            | "LATIN4" | "L4" | "CSISOLATIN4" => Encoding::Iso8859_4,
117
118            // ISO-8859-5
119            "ISO_8859-5" | "ISO_8859-5:1988" | "ISO-8859-5" | "ISO-IR-144"
120            | "CYRILLIC" | "CSISOLATINCYRILLIC" => Encoding::Iso8859_5,
121
122            // ISO-8859-6
123            "ISO_8859-6" | "ISO_8859-6:1987" | "ISO-8859-6" | "ISO-IR-127"
124            | "ECMA-114" | "ASMO-708" | "ARABIC" | "CSISOLATINARABIC"
125            => Encoding::Iso8859_6,
126
127            // ISO-8859-7
128            "ISO_8859-7" | "ISO_8859-7:1987" | "ISO-8859-7" | "ISO-IR-126"
129            | "ECMA-118" | "ELOT_928" | "GREEK" | "GREEK8"
130            | "CSISOLATINGREEK" => Encoding::Iso8859_7,
131
132            // ISO-8859-8
133            "ISO_8859-8" | "ISO_8859-8:1988" | "ISO-8859-8" | "ISO-IR-138"
134            | "HEBREW" | "CSISOLATINHEBREW" => Encoding::Iso8859_8,
135
136            // ISO-8859-9
137            "ISO_8859-9" | "ISO_8859-9:1989" | "ISO-8859-9" | "ISO-IR-148"
138            | "LATIN5" | "L5" | "CSISOLATIN5" => Encoding::Iso8859_9,
139
140            // ISO-8859-10
141            "ISO_8859-10" | "ISO-8859-10" | "ISO-IR-157" | "LATIN6" | "L6"
142            | "CSISOLATIN6" => Encoding::Iso8859_10,
143
144            // ISO-8859-11 / TIS-620
145            "ISO_8859-11" | "ISO-8859-11" | "TIS-620" => Encoding::Iso8859_11,
146
147            // ISO-8859-13
148            "ISO_8859-13" | "ISO-8859-13" | "LATIN7" | "L7" => Encoding::Iso8859_13,
149
150            // ISO-8859-14
151            "ISO_8859-14" | "ISO-8859-14" | "ISO-IR-199" | "LATIN8" | "L8"
152            | "ISO-CELTIC" => Encoding::Iso8859_14,
153
154            // ISO-8859-15
155            "ISO_8859-15" | "ISO-8859-15" | "LATIN9" | "LATIN-9" | "LATIN0"
156            => Encoding::Iso8859_15,
157
158            // ISO-8859-16
159            "ISO_8859-16" | "ISO-8859-16" | "ISO-IR-226" | "LATIN10" | "L10"
160            => Encoding::Iso8859_16,
161
162            // KOI8-R
163            "KOI8-R" | "CSKOI8R" => Encoding::Koi8R,
164
165            // ISCII
166            "ISCII" => Encoding::Iscii,
167
168            // UTF-8
169            "UTF-8" | "UTF8" => Encoding::Utf8,
170
171            // UCS-2 / ISO 10646
172            "ISO-10646-UCS-2" | "UCS-2" | "CSUNICODE" => Encoding::Ucs2,
173
174            _ => Encoding::Unknown,
175        }
176    }
177
178    /// Returns `true` if this is a single-byte encoding (includes ASCII).
179    pub fn is_single_byte(self) -> bool {
180        matches!(
181            self,
182            Encoding::UsAscii
183                | Encoding::Iso8859_1
184                | Encoding::Iso8859_2
185                | Encoding::Iso8859_3
186                | Encoding::Iso8859_4
187                | Encoding::Iso8859_5
188                | Encoding::Iso8859_6
189                | Encoding::Iso8859_7
190                | Encoding::Iso8859_8
191                | Encoding::Iso8859_9
192                | Encoding::Iso8859_10
193                | Encoding::Iso8859_11
194                | Encoding::Iso8859_13
195                | Encoding::Iso8859_14
196                | Encoding::Iso8859_15
197                | Encoding::Iso8859_16
198                | Encoding::Koi8R
199                | Encoding::Iscii
200        )
201    }
202
203    /// Return the codepage table for single-byte encodings, if one exists.
204    /// The table maps bytes 0x80–0xFF (index = byte - 0x80) to Unicode codepoints.
205    pub fn codepage(self) -> Option<&'static [u16; 128]> {
206        use codepages::*;
207        match self {
208            Encoding::Iso8859_1  => Some(&ISO_8859_1),
209            Encoding::Iso8859_2  => Some(&ISO_8859_2),
210            Encoding::Iso8859_3  => Some(&ISO_8859_3),
211            Encoding::Iso8859_4  => Some(&ISO_8859_4),
212            Encoding::Iso8859_5  => Some(&ISO_8859_5),
213            Encoding::Iso8859_6  => Some(&ISO_8859_6),
214            Encoding::Iso8859_7  => Some(&ISO_8859_7),
215            Encoding::Iso8859_8  => Some(&ISO_8859_8),
216            Encoding::Iso8859_9  => Some(&ISO_8859_9),
217            Encoding::Iso8859_10 => Some(&ISO_8859_10),
218            Encoding::Iso8859_11 => Some(&ISO_8859_11),
219            Encoding::Iso8859_13 => Some(&ISO_8859_13),
220            Encoding::Iso8859_14 => Some(&ISO_8859_14),
221            Encoding::Iso8859_15 => Some(&ISO_8859_15),
222            Encoding::Iso8859_16 => Some(&ISO_8859_16),
223            Encoding::Koi8R      => Some(&KOI8_R),
224            Encoding::Iscii      => Some(&ISCII),
225            _                    => None,
226        }
227    }
228}
229
230// ---------------------------------------------------------------------------
231// Replacement character
232// ---------------------------------------------------------------------------
233
/// Unicode Replacement Character U+FFFD, used for invalid sequences.
pub const REPLACEMENT_CHAR: u32 = 0xFFFD;

// ---------------------------------------------------------------------------
// UTF-8 decoding (standalone, stateless)
// ---------------------------------------------------------------------------

/// Decode one UTF-8 codepoint from the front of `buf`.
///
/// Returns `(codepoint, bytes_consumed)`.
///
/// Malformed input — a stray continuation byte, a truncated sequence, a lead
/// byte followed by a non-continuation byte, or one of the bytes 0xF8–0xFF
/// (which never start a UTF-8 sequence) — yields `(REPLACEMENT_CHAR, 1)`, so
/// only the offending lead byte is skipped and the caller can resynchronise.
/// The one exception is a structurally valid 4-byte sequence whose value
/// exceeds U+10FFFF: it yields `REPLACEMENT_CHAR` with all 4 bytes consumed.
///
/// Overlong forms and surrogate codepoints are *not* rejected; they are
/// returned as decoded.
///
/// Ported from `string_decoder_getc_utf_8()` in encoding.c, including the
/// "I umlaut a half" workaround: a 3-byte sequence that decodes to U+FFFD is
/// reported as U+001A, so a returned `REPLACEMENT_CHAR` always signals a
/// decoding failure.
///
/// # Panics
/// Panics if `buf` is empty (via the debug assertion in debug builds, via the
/// out-of-bounds index otherwise).
pub fn utf8_decode_one(buf: &[u8]) -> (u32, usize) {
    debug_assert!(!buf.is_empty(), "utf8_decode_one called on empty buffer");

    // Result for every malformed or truncated sequence: skip the lead byte only.
    const MALFORMED: (u32, usize) = (REPLACEMENT_CHAR, 1);

    // `true` when `b` has the `10xxxxxx` shape of a continuation byte.
    fn is_tail(b: u8) -> bool {
        b & 0xC0 == 0x80
    }

    // The six payload bits carried by a continuation byte.
    fn tail_bits(b: u8) -> u32 {
        u32::from(b & 0x3F)
    }

    let c0 = buf[0];
    match c0 {
        // 0xxxxxxx — 1-byte ASCII
        0x00..=0x7F => (u32::from(c0), 1),

        // 10xxxxxx — unexpected continuation byte
        0x80..=0xBF => MALFORMED,

        // 110xxxxx — 2-byte sequence
        0xC0..=0xDF => match buf.get(1) {
            Some(&c1) if is_tail(c1) => ((u32::from(c0 & 0x1F) << 6) | tail_bits(c1), 2),
            _ => MALFORMED,
        },

        // 1110xxxx — 3-byte sequence
        0xE0..=0xEF => match buf.get(1..3) {
            Some(&[c1, c2]) if is_tail(c1) && is_tail(c2) => {
                let cp = (u32::from(c0 & 0x0F) << 12) | (tail_bits(c1) << 6) | tail_bits(c2);
                // Mirror C code: "fix the I umlaut a half bug"
                (if cp == 0xFFFD { 0x001A } else { cp }, 3)
            }
            _ => MALFORMED,
        },

        // 11110xxx — 4-byte sequence
        0xF0..=0xF7 => match buf.get(1..4) {
            Some(&[c1, c2, c3]) if is_tail(c1) && is_tail(c2) && is_tail(c3) => {
                let cp = (u32::from(c0 & 0x07) << 18)
                    | (tail_bits(c1) << 12)
                    | (tail_bits(c2) << 6)
                    | tail_bits(c3);
                // Well-formed shape but beyond the Unicode range (lead 0xF4
                // with a large second byte, or 0xF5–0xF7).
                (if cp <= 0x10_FFFF { cp } else { REPLACEMENT_CHAR }, 4)
            }
            _ => MALFORMED,
        },

        // 11111xxx — never a valid lead byte.  Treating these as 4-byte leads
        // would let e.g. F8 80 80 80 decode to U+0000.
        0xF8..=0xFF => MALFORMED,
    }
}
317
/// Encode the codepoint `cp` as UTF-8 into the front of `buf`.
///
/// Mirrors `utf8_out()` / `out_ptr` logic from various files.
/// Returns the number of bytes written (1–4); bytes of `buf` beyond that
/// count are left untouched.
///
/// `cp` is not validated: the length is chosen purely from its magnitude
/// (`< 0x80`, `< 0x800`, `< 0x10000`, otherwise 4 bytes), so surrogates and
/// values above U+10FFFF are encoded mechanically and do not produce
/// well-formed UTF-8.
///
/// # Panics
/// Panics if `buf` is shorter than the encoded length.  The length check
/// happens before any byte is written, so `buf` is never partially modified.
pub fn utf8_encode_one(cp: u32, buf: &mut [u8]) -> usize {
    let len = match cp {
        0..=0x7F => 1,
        0x80..=0x7FF => 2,
        0x800..=0xFFFF => 3,
        _ => 4,
    };

    // Slice once up front: a too-small buffer panics here, in every build
    // profile, before anything has been written.
    let out = &mut buf[..len];
    match len {
        1 => {
            out[0] = cp as u8;
        }
        2 => {
            out[0] = 0xC0 | (cp >> 6) as u8;
            out[1] = 0x80 | (cp & 0x3F) as u8;
        }
        3 => {
            out[0] = 0xE0 | (cp >> 12) as u8;
            out[1] = 0x80 | ((cp >> 6) & 0x3F) as u8;
            out[2] = 0x80 | (cp & 0x3F) as u8;
        }
        _ => {
            out[0] = 0xF0 | (cp >> 18) as u8;
            out[1] = 0x80 | ((cp >> 12) & 0x3F) as u8;
            out[2] = 0x80 | ((cp >> 6) & 0x3F) as u8;
            out[3] = 0x80 | (cp & 0x3F) as u8;
        }
    }
    len
}
351
352// ---------------------------------------------------------------------------
353// TextDecoder – the main streaming decoder
354// ---------------------------------------------------------------------------
355
/// Selects whether a [`TextDecoder`] trusts the declared encoding or first
/// probes the input as UTF-8.
///
/// The mode only affects the single-byte codepage encodings; US-ASCII, UTF-8
/// and UCS-2 input is decoded the same way in either mode.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeMode {
    /// Decode with exactly the encoding that was supplied; no auto-detection.
    Strict,
    /// Decode as UTF-8 until the first invalid sequence, then switch
    /// permanently to the supplied single-byte encoding (mirrors
    /// `espeakCHARS_AUTO`).
    Auto,
}
365
366/// A streaming text decoder that produces Unicode codepoints from an in-memory
367/// byte slice.
368///
369/// Mirrors the `espeak_ng_TEXT_DECODER` C struct + its associated functions.
370///
371/// The decoder borrows the input slice for its lifetime so no allocation is
372/// needed.
373pub struct TextDecoder<'a> {
374    buf:      &'a [u8],
375    pos:      usize,
376    encoding: Encoding,
377    mode:     DecodeMode,
378    /// When `mode == Auto` and we have fallen back to the codepage, this flag
379    /// records that the switch has happened so we stop trying UTF-8.
380    fell_back: bool,
381}
382
383impl<'a> TextDecoder<'a> {
384    /// Create a new decoder for `buf` with the given `encoding` and `mode`.
385    pub fn new(buf: &'a [u8], encoding: Encoding, mode: DecodeMode) -> Result<Self> {
386        if encoding == Encoding::Unknown {
387            return Err(Error::UnknownTextEncoding(
388                "cannot decode with Encoding::Unknown".to_string(),
389            ));
390        }
391        Ok(TextDecoder {
392            buf,
393            pos: 0,
394            encoding,
395            mode,
396            fell_back: false,
397        })
398    }
399
400    /// Convenience: UTF-8 strict decoder (the most common case).
401    pub fn utf8(buf: &'a [u8]) -> Self {
402        TextDecoder {
403            buf,
404            pos: 0,
405            encoding: Encoding::Utf8,
406            mode: DecodeMode::Strict,
407            fell_back: false,
408        }
409    }
410
411    /// Returns `true` when all bytes have been consumed.
412    pub fn is_eof(&self) -> bool {
413        self.pos >= self.buf.len()
414    }
415
416    /// Remaining bytes (useful for slicing the original buffer).
417    pub fn remaining(&self) -> &[u8] {
418        &self.buf[self.pos..]
419    }
420
421    /// Peek at the next codepoint without advancing the position.
422    pub fn peek(&self) -> Option<u32> {
423        if self.is_eof() {
424            return None;
425        }
426        // We clone just the position to avoid borrowing issues.
427        let mut clone = TextDecoder {
428            buf:       self.buf,
429            pos:       self.pos,
430            encoding:  self.encoding,
431            mode:      self.mode,
432            fell_back: self.fell_back,
433        };
434        clone.next_codepoint()
435    }
436
437    /// Consume and return the next codepoint, or `None` at EOF.
438    pub fn next_codepoint(&mut self) -> Option<u32> {
439        if self.is_eof() {
440            return None;
441        }
442        let cp = self.decode_one();
443        Some(cp)
444    }
445
446    /// Collect all remaining codepoints into a `Vec`.
447    pub fn collect_codepoints(&mut self) -> Vec<u32> {
448        let mut out = Vec::with_capacity(self.buf.len() - self.pos);
449        while let Some(cp) = self.next_codepoint() {
450            if cp == 0 {
451                break; // null-terminated, as the C code does
452            }
453            out.push(cp);
454        }
455        out
456    }
457
458    /// Collect and decode to a Rust `String` (replacing invalid codepoints
459    /// with U+FFFD, mirroring C behaviour).
460    pub fn decode_to_string(&mut self) -> String {
461        let codepoints = self.collect_codepoints();
462        codepoints
463            .into_iter()
464            .map(|cp| char::from_u32(cp).unwrap_or('\u{FFFD}'))
465            .collect()
466    }
467
468    // ----- private ----------------------------------------------------------
469
470    fn decode_one(&mut self) -> u32 {
471        match self.encoding {
472            Encoding::UsAscii => self.decode_ascii(),
473            Encoding::Utf8    => self.decode_utf8(),
474            Encoding::Ucs2    => self.decode_ucs2(),
475            enc if enc.is_single_byte() => {
476                if self.mode == DecodeMode::Auto && !self.fell_back {
477                    self.decode_auto()
478                } else {
479                    self.decode_codepage()
480                }
481            }
482            _ => {
483                self.pos += 1;
484                REPLACEMENT_CHAR
485            }
486        }
487    }
488
489    fn decode_ascii(&mut self) -> u32 {
490        let b = self.buf[self.pos];
491        self.pos += 1;
492        if b < 0x80 { b as u32 } else { REPLACEMENT_CHAR }
493    }
494
495    fn decode_utf8(&mut self) -> u32 {
496        let (cp, consumed) = utf8_decode_one(&self.buf[self.pos..]);
497        self.pos += consumed;
498        cp
499    }
500
501    fn decode_codepage(&mut self) -> u32 {
502        let b = self.buf[self.pos];
503        self.pos += 1;
504        if b < 0x80 {
505            b as u32
506        } else if let Some(table) = self.encoding.codepage() {
507            table[(b - 0x80) as usize] as u32
508        } else {
509            REPLACEMENT_CHAR
510        }
511    }
512
513    /// Auto mode: try UTF-8; on first failure, switch to the codepage.
514    /// Mirrors `string_decoder_getc_auto()` in encoding.c.
515    fn decode_auto(&mut self) -> u32 {
516        let saved_pos = self.pos;
517        let (cp, consumed) = utf8_decode_one(&self.buf[self.pos..]);
518        if cp == REPLACEMENT_CHAR {
519            // UTF-8 failed; fall back permanently to the codepage
520            self.fell_back = true;
521            self.pos = saved_pos;
522            self.decode_codepage()
523        } else {
524            self.pos += consumed;
525            cp
526        }
527    }
528
529    fn decode_ucs2(&mut self) -> u32 {
530        if self.pos + 1 >= self.buf.len() {
531            self.pos = self.buf.len();
532            return REPLACEMENT_CHAR;
533        }
534        let lo = self.buf[self.pos] as u32;
535        let hi = self.buf[self.pos + 1] as u32;
536        self.pos += 2;
537        lo | (hi << 8)
538    }
539}
540
541// ---------------------------------------------------------------------------
542// Iterator impl
543// ---------------------------------------------------------------------------
544
545impl<'a> Iterator for TextDecoder<'a> {
546    type Item = u32;
547
548    fn next(&mut self) -> Option<u32> {
549        if self.is_eof() {
550            return None;
551        }
552        let cp = self.decode_one();
553        if cp == 0 {
554            // null terminator – signal end like the C code does
555            self.pos = self.buf.len();
556            return None;
557        }
558        Some(cp)
559    }
560}
561
562// ---------------------------------------------------------------------------
563// Convenience free functions (mirrors the C public API)
564// ---------------------------------------------------------------------------
565
566/// Decode a UTF-8 byte slice to a `String`.
567///
568/// Invalid sequences are replaced with U+FFFD, matching C behaviour.
569pub fn decode_utf8_to_string(bytes: &[u8]) -> String {
570    let mut dec = TextDecoder::utf8(bytes);
571    dec.decode_to_string()
572}
573
574/// Decode a byte slice with the given encoding to a `String`.
575pub fn decode_to_string(bytes: &[u8], encoding: Encoding) -> Result<String> {
576    let mut dec = TextDecoder::new(bytes, encoding, DecodeMode::Strict)?;
577    Ok(dec.decode_to_string())
578}
579
580// ---------------------------------------------------------------------------
581// Tests
582// ---------------------------------------------------------------------------
583
#[cfg(test)]
mod tests {
    use super::*;

    // ---- utf8_decode_one ---------------------------------------------------

    #[test]
    fn utf8_decode_ascii_range() {
        for b in 0u8..0x80 {
            let (cp, n) = utf8_decode_one(&[b]);
            assert_eq!(cp, b as u32, "ascii byte 0x{b:02x}");
            assert_eq!(n, 1);
        }
    }

    #[test]
    fn utf8_decode_two_byte() {
        // U+00E9 LATIN SMALL LETTER E WITH ACUTE  →  0xC3 0xA9
        let (cp, n) = utf8_decode_one(&[0xC3, 0xA9]);
        assert_eq!(cp, 0x00E9);
        assert_eq!(n, 2);
    }

    #[test]
    fn utf8_decode_three_byte() {
        // U+20AC EURO SIGN  →  0xE2 0x82 0xAC
        let (cp, n) = utf8_decode_one(&[0xE2, 0x82, 0xAC]);
        assert_eq!(cp, 0x20AC);
        assert_eq!(n, 3);
    }

    #[test]
    fn utf8_decode_four_byte() {
        // U+1F600 GRINNING FACE  →  0xF0 0x9F 0x98 0x80
        let (cp, n) = utf8_decode_one(&[0xF0, 0x9F, 0x98, 0x80]);
        assert_eq!(cp, 0x1F600);
        assert_eq!(n, 4);
    }

    #[test]
    fn utf8_decode_stray_continuation_is_replacement() {
        // Continuation byte in lead position → replacement char, skip 1
        let (cp, n) = utf8_decode_one(&[0x80]);
        assert_eq!(cp, REPLACEMENT_CHAR);
        assert_eq!(n, 1);
    }

    #[test]
    fn utf8_decode_bad_continuation() {
        // Lead byte says 2-byte, but second byte is ASCII (not 10xxxxxx)
        let (cp, n) = utf8_decode_one(&[0xC3, 0x20]);
        assert_eq!(cp, REPLACEMENT_CHAR);
        assert_eq!(n, 1);
    }

    #[test]
    fn utf8_decode_truncated_sequences() {
        // A lead byte whose continuation bytes are cut off by the end of the
        // buffer → replacement char, skip only the lead byte.
        for input in [&[0xC3u8][..], &[0xE2, 0x82], &[0xF0, 0x9F, 0x98]] {
            let (cp, n) = utf8_decode_one(input);
            assert_eq!(cp, REPLACEMENT_CHAR, "input {input:02X?}");
            assert_eq!(n, 1, "input {input:02X?}");
        }
    }

    #[test]
    fn utf8_decode_codepoint_max() {
        // U+10FFFF — maximum valid codepoint (4 bytes)
        let (cp, n) = utf8_decode_one(&[0xF4, 0x8F, 0xBF, 0xBF]);
        assert_eq!(cp, 0x10FFFF);
        assert_eq!(n, 4);
    }

    #[test]
    fn utf8_decode_above_max_is_replacement() {
        // Byte pattern that would give cp > 0x10FFFF  (0xF4 0x90 0x80 0x80 → U+110000)
        let (cp, _) = utf8_decode_one(&[0xF4, 0x90, 0x80, 0x80]);
        assert_eq!(cp, REPLACEMENT_CHAR);
    }

    #[test]
    fn utf8_decode_iumlaut_half_bug_workaround() {
        // The C code maps U+FFFD from a 3-byte sequence to U+001A.
        // The 3-byte encoding of U+FFFD is EF BF BD.
        let (cp, n) = utf8_decode_one(&[0xEF, 0xBF, 0xBD]);
        // Normal: should be 0xFFFD but the C workaround makes it 0x001A
        assert_eq!(cp, 0x001A, "expected the C workaround U+001A, got 0x{cp:04x}");
        assert_eq!(n, 3);
    }

    // ---- round-trip --------------------------------------------------------

    #[test]
    fn utf8_roundtrip_bmp() {
        let mut buf = [0u8; 4];
        // A selection of BMP codepoints.  The surrogates (0xD800, 0xDFFF) are
        // not valid `char`s, so `char::from_u32` filters them out below; they
        // are listed to show that the range boundaries are considered.
        for cp in [
            0u32, 0x41, 0xFF, 0x100, 0x7FF, 0x800, 0xD800, 0xDFFF, 0xFFFD, 0xFFFE, 0xFFFF,
        ] {
            if let Some(ch) = char::from_u32(cp) {
                let s = ch.encode_utf8(&mut buf);
                let (decoded, _) = utf8_decode_one(s.as_bytes());
                // Account for the C workaround: U+FFFD maps to U+001A in 3-byte
                let expected = if cp == 0xFFFD { 0x001A } else { cp };
                assert_eq!(decoded, expected, "cp=U+{cp:04X}");
            }
        }
    }

    // ---- utf8_encode_one ---------------------------------------------------

    #[test]
    fn utf8_encode_ascii() {
        let mut buf = [0u8; 4];
        assert_eq!(utf8_encode_one(b'A' as u32, &mut buf), 1);
        assert_eq!(buf[0], b'A');
    }

    #[test]
    fn utf8_encode_two_byte() {
        let mut buf = [0u8; 4];
        let n = utf8_encode_one(0x00E9, &mut buf); // é
        assert_eq!(n, 2);
        assert_eq!(&buf[..2], &[0xC3, 0xA9]);
    }

    #[test]
    fn utf8_encode_three_byte() {
        let mut buf = [0u8; 4];
        let n = utf8_encode_one(0x20AC, &mut buf); // €
        assert_eq!(n, 3);
        assert_eq!(&buf[..3], &[0xE2, 0x82, 0xAC]);
    }

    #[test]
    fn utf8_encode_four_byte() {
        let mut buf = [0u8; 4];
        let n = utf8_encode_one(0x1F600, &mut buf); // 😀
        assert_eq!(n, 4);
        assert_eq!(&buf[..4], &[0xF0, 0x9F, 0x98, 0x80]);
    }

    // ---- Encoding::from_name -----------------------------------------------

    #[test]
    fn encoding_from_name_utf8() {
        assert_eq!(Encoding::from_name("UTF-8"), Encoding::Utf8);
        assert_eq!(Encoding::from_name("UTF8"), Encoding::Utf8);
        assert_eq!(Encoding::from_name("utf-8"), Encoding::Utf8); // case-insensitive
    }

    #[test]
    fn encoding_from_name_ascii_aliases() {
        for alias in &["ASCII", "US-ASCII", "ANSI_X3.4-1968", "IBM367"] {
            assert_eq!(
                Encoding::from_name(alias),
                Encoding::UsAscii,
                "alias: {alias}"
            );
        }
    }

    #[test]
    fn encoding_from_name_latin1_aliases() {
        for alias in &["ISO-8859-1", "ISO_8859-1", "LATIN1", "L1", "IBM819"] {
            assert_eq!(
                Encoding::from_name(alias),
                Encoding::Iso8859_1,
                "alias: {alias}"
            );
        }
    }

    #[test]
    fn encoding_from_name_koi8r() {
        assert_eq!(Encoding::from_name("KOI8-R"), Encoding::Koi8R);
        assert_eq!(Encoding::from_name("CSKOI8R"), Encoding::Koi8R);
    }

    #[test]
    fn encoding_from_name_unknown() {
        assert_eq!(Encoding::from_name("bogus"), Encoding::Unknown);
        assert_eq!(Encoding::from_name(""), Encoding::Unknown);
        assert_eq!(Encoding::from_name("SHIFT_JIS"), Encoding::Unknown); // not supported
    }

    // ---- codepage tables ---------------------------------------------------

    #[test]
    fn iso8859_1_is_identity() {
        // ISO-8859-1 bytes 0x80–0xFF should map 1-to-1 to Unicode.
        let table = Encoding::Iso8859_1.codepage().unwrap();
        for i in 0usize..128 {
            assert_eq!(table[i] as usize, i + 0x80, "byte 0x{:02X}", i + 0x80);
        }
    }

    #[test]
    fn iso8859_15_euro_sign() {
        // In ISO-8859-15 byte 0xA4 is U+20AC EURO SIGN (not U+00A4 as in ISO-8859-1).
        let table = Encoding::Iso8859_15.codepage().unwrap();
        let idx = 0xA4usize - 0x80; // = 36
        assert_eq!(table[idx], 0x20AC);
    }

    #[test]
    fn koi8r_sample() {
        // The espeak-ng KOI8-R table actually contains ISO-8859-16 data
        // (a quirk of the C implementation we faithfully reproduce).
        // At byte 0xC1 the C table returns U+00C1 (LATIN CAPITAL A WITH ACUTE).
        let table = Encoding::Koi8R.codepage().unwrap();
        let idx = 0xC1usize - 0x80; // = 65
        assert_eq!(table[idx], 0x00C1,
            "espeak-ng KOI8-R table at 0xC1 should be U+00C1 (mirrors C source)");
    }

    // ---- TextDecoder -------------------------------------------------------

    #[test]
    fn text_decoder_utf8_hello() {
        let input = b"hello";
        let codepoints: Vec<u32> = TextDecoder::utf8(input).collect();
        assert_eq!(codepoints, vec![b'h' as u32, b'e' as u32, b'l' as u32,
                                    b'l' as u32, b'o' as u32]);
    }

    #[test]
    fn text_decoder_utf8_multibyte() {
        // "café" = c a f U+00E9
        let input = "café".as_bytes();
        let codepoints: Vec<u32> = TextDecoder::utf8(input).collect();
        assert_eq!(codepoints, vec![b'c' as u32, b'a' as u32, b'f' as u32, 0x00E9]);
    }

    #[test]
    fn text_decoder_null_terminates() {
        let input = b"hi\x00world";
        let codepoints: Vec<u32> = TextDecoder::utf8(input).collect();
        // Should stop at the null byte
        assert_eq!(codepoints, vec![b'h' as u32, b'i' as u32]);
    }

    #[test]
    fn text_decoder_peek_does_not_advance() {
        let mut dec = TextDecoder::utf8("éa".as_bytes());
        // Peeking twice yields the same codepoint and leaves the cursor alone.
        assert_eq!(dec.peek(), Some(0x00E9));
        assert_eq!(dec.peek(), Some(0x00E9));
        assert_eq!(dec.next_codepoint(), Some(0x00E9));
        assert_eq!(dec.peek(), Some(b'a' as u32));
        assert_eq!(dec.next_codepoint(), Some(b'a' as u32));
        assert_eq!(dec.peek(), None);
        assert_eq!(dec.next_codepoint(), None);
    }

    #[test]
    fn text_decoder_iso8859_1() {
        // byte 0xE9 in ISO-8859-1 → U+00E9 (é)
        let input = &[0xE9u8];
        let mut dec = TextDecoder::new(input, Encoding::Iso8859_1, DecodeMode::Strict).unwrap();
        let cp = dec.next_codepoint().unwrap();
        assert_eq!(cp, 0x00E9);
    }

    #[test]
    fn text_decoder_iso8859_15_euro() {
        // byte 0xA4 in ISO-8859-15 → U+20AC (€)
        let input = &[0xA4u8];
        let mut dec = TextDecoder::new(input, Encoding::Iso8859_15, DecodeMode::Strict).unwrap();
        let cp = dec.next_codepoint().unwrap();
        assert_eq!(cp, 0x20AC);
    }

    #[test]
    fn text_decoder_ascii_rejects_high_bytes() {
        let input = &[0x80u8];
        let mut dec = TextDecoder::new(input, Encoding::UsAscii, DecodeMode::Strict).unwrap();
        let cp = dec.next_codepoint().unwrap();
        assert_eq!(cp, REPLACEMENT_CHAR);
    }

    #[test]
    fn text_decoder_auto_mode_utf8_first() {
        // "hi" is valid UTF-8 – auto mode should decode it without falling back
        let mut dec = TextDecoder::new(
            b"hi",
            Encoding::Iso8859_1, // fallback encoding
            DecodeMode::Auto,
        ).unwrap();
        assert_eq!(dec.next_codepoint(), Some(b'h' as u32));
        assert_eq!(dec.next_codepoint(), Some(b'i' as u32));
        assert!(!dec.fell_back, "should not have fallen back");
    }

    #[test]
    fn text_decoder_auto_mode_fallback_on_bad_utf8() {
        // 0xA4 is not valid UTF-8 lead byte; in auto mode with ISO-8859-15
        // it should fall back to the codepage and return U+20AC.
        let mut dec = TextDecoder::new(
            &[0xA4u8],
            Encoding::Iso8859_15,
            DecodeMode::Auto,
        ).unwrap();
        let cp = dec.next_codepoint().unwrap();
        assert_eq!(cp, 0x20AC, "expected euro sign U+20AC");
        assert!(dec.fell_back, "should have fallen back to codepage");
    }

    #[test]
    fn text_decoder_ucs2_hello() {
        // "Hi" in little-endian UCS-2: 0x48 0x00 0x69 0x00
        let input = &[0x48u8, 0x00, 0x69, 0x00];
        let codepoints: Vec<u32> = TextDecoder::new(input, Encoding::Ucs2, DecodeMode::Strict)
            .unwrap()
            .collect();
        assert_eq!(codepoints, vec![b'H' as u32, b'i' as u32]);
    }

    #[test]
    fn text_decoder_ucs2_odd_trailing_byte() {
        // A lone final byte cannot form a 16-bit unit → U+FFFD, then EOF.
        let input = &[0x48u8, 0x00, 0x69];
        let mut dec = TextDecoder::new(input, Encoding::Ucs2, DecodeMode::Strict).unwrap();
        assert_eq!(dec.next_codepoint(), Some(b'H' as u32));
        assert_eq!(dec.next_codepoint(), Some(REPLACEMENT_CHAR));
        assert!(dec.is_eof());
        assert_eq!(dec.next_codepoint(), None);
    }

    #[test]
    fn text_decoder_eof_flag() {
        let mut dec = TextDecoder::utf8(b"x");
        assert!(!dec.is_eof());
        dec.next_codepoint();
        assert!(dec.is_eof());
    }

    #[test]
    fn decode_utf8_to_string_emoji() {
        let s = "😀 world";
        let decoded = decode_utf8_to_string(s.as_bytes());
        assert_eq!(decoded, s);
    }

    #[test]
    fn decode_to_string_iso8859_1_cafe() {
        // "café" in ISO-8859-1 = b"caf\xE9"
        let input = b"caf\xE9";
        let s = decode_to_string(input, Encoding::Iso8859_1).unwrap();
        assert_eq!(s, "café");
    }

    #[test]
    fn decoder_error_on_unknown_encoding() {
        let result = TextDecoder::new(b"x", Encoding::Unknown, DecodeMode::Strict);
        assert!(result.is_err());
    }
}
espeak_ng/encoding/mod.rs

espeak_ng/encoding/
mod.rs