pub mod codepages;
use crate::error::{Error, Result};
/// Text encodings this module can name.
///
/// Each variant carries an explicit `u32` discriminant (`#[repr(u32)]`). The
/// names and aliases that map to each variant are listed in
/// [`Encoding::from_name`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[repr(u32)]
pub enum Encoding {
    /// Unrecognised encoding; [`Encoding::from_name`] returns this for names it does not know.
    Unknown = 0,
    /// US-ASCII (alias `ASCII`).
    UsAscii = 1,
    /// ISO 8859-1 (alias `LATIN1`).
    Iso8859_1 = 2,
    /// ISO 8859-2 (alias `LATIN2`).
    Iso8859_2 = 3,
    /// ISO 8859-3 (alias `LATIN3`).
    Iso8859_3 = 4,
    /// ISO 8859-4 (alias `LATIN4`).
    Iso8859_4 = 5,
    /// ISO 8859-5 (alias `CYRILLIC`).
    Iso8859_5 = 6,
    /// ISO 8859-6 (alias `ARABIC`).
    Iso8859_6 = 7,
    /// ISO 8859-7 (alias `GREEK`).
    Iso8859_7 = 8,
    /// ISO 8859-8 (alias `HEBREW`).
    Iso8859_8 = 9,
    /// ISO 8859-9 (alias `LATIN5`).
    Iso8859_9 = 10,
    /// ISO 8859-10 (alias `LATIN6`).
    Iso8859_10 = 11,
    /// ISO 8859-11 (alias `TIS-620`).
    Iso8859_11 = 12,
    /// ISO 8859-13 (alias `LATIN7`).
    Iso8859_13 = 13,
    /// ISO 8859-14 (aliases `LATIN8`, `ISO-CELTIC`).
    Iso8859_14 = 14,
    /// ISO 8859-15 (aliases `LATIN9`, `LATIN0`).
    Iso8859_15 = 15,
    /// ISO 8859-16 (alias `LATIN10`).
    Iso8859_16 = 16,
    /// KOI8-R.
    Koi8R = 17,
    /// ISCII.
    Iscii = 18,
    /// UTF-8.
    Utf8 = 19,
    /// UCS-2 (alias `ISO-10646-UCS-2`).
    Ucs2 = 20,
}
impl Encoding {
    /// Resolves an encoding name or alias, ignoring ASCII case.
    ///
    /// Returns [`Encoding::Unknown`] when `name` matches none of the aliases
    /// in the table below.
    pub fn from_name(name: &str) -> Self {
        // Aliases are stored in upper case and compared ASCII-case-insensitively,
        // which matches upper-casing `name` and comparing exactly. Rows are
        // searched in order.
        const ALIASES: &[(Encoding, &[&str])] = &[
            (
                Encoding::UsAscii,
                &[
                    "ANSI_X3.4-1968",
                    "ANSI_X3.4-1986",
                    "ASCII",
                    "US-ASCII",
                    "ISO646-US",
                    "IBM367",
                    "US",
                    "ISO_646.IRV:1991",
                    "ISO-IR-6",
                    "CP367",
                    "CSASCII",
                ],
            ),
            (
                Encoding::Iso8859_1,
                &[
                    "ISO_8859-1",
                    "ISO_8859-1:1987",
                    "ISO-8859-1",
                    "ISO-IR-100",
                    "LATIN1",
                    "L1",
                    "IBM819",
                    "CSISOLATIN1",
                ],
            ),
            (
                Encoding::Iso8859_2,
                &[
                    "ISO_8859-2",
                    "ISO_8859-2:1987",
                    "ISO-8859-2",
                    "ISO-IR-101",
                    "LATIN2",
                    "L2",
                    "CSISOLATIN2",
                ],
            ),
            (
                Encoding::Iso8859_3,
                &[
                    "ISO_8859-3",
                    "ISO_8859-3:1988",
                    "ISO-8859-3",
                    "ISO-IR-109",
                    "LATIN3",
                    "L3",
                    "CSISOLATIN3",
                ],
            ),
            (
                Encoding::Iso8859_4,
                &[
                    "ISO_8859-4",
                    "ISO_8859-4:1988",
                    "ISO-8859-4",
                    "ISO-IR-110",
                    "LATIN4",
                    "L4",
                    "CSISOLATIN4",
                ],
            ),
            (
                Encoding::Iso8859_5,
                &[
                    "ISO_8859-5",
                    "ISO_8859-5:1988",
                    "ISO-8859-5",
                    "ISO-IR-144",
                    "CYRILLIC",
                    "CSISOLATINCYRILLIC",
                ],
            ),
            (
                Encoding::Iso8859_6,
                &[
                    "ISO_8859-6",
                    "ISO_8859-6:1987",
                    "ISO-8859-6",
                    "ISO-IR-127",
                    "ECMA-114",
                    "ASMO-708",
                    "ARABIC",
                    "CSISOLATINARABIC",
                ],
            ),
            (
                Encoding::Iso8859_7,
                &[
                    "ISO_8859-7",
                    "ISO_8859-7:1987",
                    "ISO-8859-7",
                    "ISO-IR-126",
                    "ECMA-118",
                    "ELOT_928",
                    "GREEK",
                    "GREEK8",
                    "CSISOLATINGREEK",
                ],
            ),
            (
                Encoding::Iso8859_8,
                &[
                    "ISO_8859-8",
                    "ISO_8859-8:1988",
                    "ISO-8859-8",
                    "ISO-IR-138",
                    "HEBREW",
                    "CSISOLATINHEBREW",
                ],
            ),
            (
                Encoding::Iso8859_9,
                &[
                    "ISO_8859-9",
                    "ISO_8859-9:1989",
                    "ISO-8859-9",
                    "ISO-IR-148",
                    "LATIN5",
                    "L5",
                    "CSISOLATIN5",
                ],
            ),
            (
                Encoding::Iso8859_10,
                &[
                    "ISO_8859-10",
                    "ISO-8859-10",
                    "ISO-IR-157",
                    "LATIN6",
                    "L6",
                    "CSISOLATIN6",
                ],
            ),
            (
                Encoding::Iso8859_11,
                &["ISO_8859-11", "ISO-8859-11", "TIS-620"],
            ),
            (
                Encoding::Iso8859_13,
                &["ISO_8859-13", "ISO-8859-13", "LATIN7", "L7"],
            ),
            (
                Encoding::Iso8859_14,
                &[
                    "ISO_8859-14",
                    "ISO-8859-14",
                    "ISO-IR-199",
                    "LATIN8",
                    "L8",
                    "ISO-CELTIC",
                ],
            ),
            (
                Encoding::Iso8859_15,
                &["ISO_8859-15", "ISO-8859-15", "LATIN9", "LATIN-9", "LATIN0"],
            ),
            (
                Encoding::Iso8859_16,
                &["ISO_8859-16", "ISO-8859-16", "ISO-IR-226", "LATIN10", "L10"],
            ),
            (Encoding::Koi8R, &["KOI8-R", "CSKOI8R"]),
            (Encoding::Iscii, &["ISCII"]),
            (Encoding::Utf8, &["UTF-8", "UTF8"]),
            (Encoding::Ucs2, &["ISO-10646-UCS-2", "UCS-2", "CSUNICODE"]),
        ];

        ALIASES
            .iter()
            .find(|(_, names)| names.iter().any(|alias| alias.eq_ignore_ascii_case(name)))
            .map_or(Encoding::Unknown, |&(encoding, _)| encoding)
    }

    /// Reports whether this encoding is one of the single-byte encodings.
    ///
    /// `true` for US-ASCII, the ISO 8859 family, KOI8-R and ISCII; `false` for
    /// `Unknown`, `Utf8` and `Ucs2`.
    pub fn is_single_byte(self) -> bool {
        // Exhaustive on purpose: a new variant must be classified explicitly.
        match self {
            Encoding::Unknown | Encoding::Utf8 | Encoding::Ucs2 => false,
            Encoding::UsAscii
            | Encoding::Iso8859_1
            | Encoding::Iso8859_2
            | Encoding::Iso8859_3
            | Encoding::Iso8859_4
            | Encoding::Iso8859_5
            | Encoding::Iso8859_6
            | Encoding::Iso8859_7
            | Encoding::Iso8859_8
            | Encoding::Iso8859_9
            | Encoding::Iso8859_10
            | Encoding::Iso8859_11
            | Encoding::Iso8859_13
            | Encoding::Iso8859_14
            | Encoding::Iso8859_15
            | Encoding::Iso8859_16
            | Encoding::Koi8R
            | Encoding::Iscii => true,
        }
    }

    /// Returns the lookup table for bytes `0x80..=0xFF` of this encoding.
    ///
    /// Entry `i` is the code point for byte `0x80 + i`. `None` is returned for
    /// `Unknown`, `UsAscii`, `Utf8` and `Ucs2`, which have no table here.
    pub fn codepage(self) -> Option<&'static [u16; 128]> {
        use codepages::*;
        let table = match self {
            Encoding::Unknown | Encoding::UsAscii | Encoding::Utf8 | Encoding::Ucs2 => {
                return None;
            }
            Encoding::Iso8859_1 => &ISO_8859_1,
            Encoding::Iso8859_2 => &ISO_8859_2,
            Encoding::Iso8859_3 => &ISO_8859_3,
            Encoding::Iso8859_4 => &ISO_8859_4,
            Encoding::Iso8859_5 => &ISO_8859_5,
            Encoding::Iso8859_6 => &ISO_8859_6,
            Encoding::Iso8859_7 => &ISO_8859_7,
            Encoding::Iso8859_8 => &ISO_8859_8,
            Encoding::Iso8859_9 => &ISO_8859_9,
            Encoding::Iso8859_10 => &ISO_8859_10,
            Encoding::Iso8859_11 => &ISO_8859_11,
            Encoding::Iso8859_13 => &ISO_8859_13,
            Encoding::Iso8859_14 => &ISO_8859_14,
            Encoding::Iso8859_15 => &ISO_8859_15,
            Encoding::Iso8859_16 => &ISO_8859_16,
            Encoding::Koi8R => &KOI8_R,
            Encoding::Iscii => &ISCII,
        };
        Some(table)
    }
}
/// Code point (U+FFFD) reported in place of input that cannot be decoded.
pub const REPLACEMENT_CHAR: u32 = 0xFFFD;

/// Decodes one UTF-8 sequence from the front of `buf`.
///
/// Returns `(code_point, bytes_consumed)`. The decoder is lenient:
///
/// * A stray continuation byte, a truncated sequence, or a sequence with a bad
///   continuation byte yields `(REPLACEMENT_CHAR, 1)`.
/// * A four-byte sequence whose value exceeds U+10FFFF yields
///   `(REPLACEMENT_CHAR, 4)`.
/// * A well-formed three-byte encoding of U+FFFD is reported as U+001A.
/// * Overlong forms and surrogate code points are not rejected.
/// * Every lead byte in `0xF0..=0xFF` is treated as the start of a four-byte
///   sequence; only its low three bits are used.
///
/// # Panics
///
/// Panics if `buf` is empty.
pub fn utf8_decode_one(buf: &[u8]) -> (u32, usize) {
    /// Result for every malformed or truncated sequence: skip the lead byte only.
    const MALFORMED: (u32, usize) = (REPLACEMENT_CHAR, 1);

    /// Payload bits of the continuation byte at `idx`, or `None` when that
    /// byte is missing or is not of the form `10xxxxxx`.
    fn tail_at(bytes: &[u8], idx: usize) -> Option<u32> {
        match bytes.get(idx) {
            Some(&b) if b & 0xC0 == 0x80 => Some(u32::from(b & 0x3F)),
            _ => None,
        }
    }

    debug_assert!(!buf.is_empty(), "utf8_decode_one called on empty buffer");
    let lead = buf[0];
    match lead {
        // Single byte.
        0x00..=0x7F => (u32::from(lead), 1),
        // Continuation byte where a lead byte was expected.
        0x80..=0xBF => MALFORMED,
        // Two-byte sequence.
        0xC0..=0xDF => match tail_at(buf, 1) {
            Some(t1) => ((u32::from(lead & 0x1F) << 6) | t1, 2),
            None => MALFORMED,
        },
        // Three-byte sequence.
        0xE0..=0xEF => match (tail_at(buf, 1), tail_at(buf, 2)) {
            (Some(t1), Some(t2)) => {
                let scalar = (u32::from(lead & 0x0F) << 12) | (t1 << 6) | t2;
                // An encoded U+FFFD is remapped to U+001A.
                let scalar = if scalar == 0xFFFD { 0x001A } else { scalar };
                (scalar, 3)
            }
            _ => MALFORMED,
        },
        // Four-byte sequence (any lead byte 0xF0..=0xFF).
        0xF0..=0xFF => match (tail_at(buf, 1), tail_at(buf, 2), tail_at(buf, 3)) {
            (Some(t1), Some(t2), Some(t3)) => {
                let scalar = (u32::from(lead & 0x07) << 18) | (t1 << 12) | (t2 << 6) | t3;
                if scalar <= 0x10_FFFF {
                    (scalar, 4)
                } else {
                    (REPLACEMENT_CHAR, 4)
                }
            }
            _ => MALFORMED,
        },
    }
}
/// Encodes the code point `cp` as UTF-8 at the start of `buf`.
///
/// Returns the number of bytes written (1 to 4). Bytes of `buf` beyond that
/// count are left untouched.
///
/// Values above U+10FFFF have no UTF-8 representation. They are written as
/// U+FFFD (`EF BF BD`) so the output never contains an arbitrary lead byte
/// produced by truncating the shifted value. Surrogate code points
/// (U+D800..=U+DFFF) are not filtered and are written in the three-byte form.
///
/// # Panics
///
/// Panics if `buf` is shorter than the encoded length. The length is checked
/// before any byte is stored, so `buf` is not partially written.
pub fn utf8_encode_one(cp: u32, buf: &mut [u8]) -> usize {
    /// Substitute for unencodable values (U+FFFD).
    const SUBSTITUTE: u32 = 0xFFFD;

    let cp = if cp > 0x10_FFFF { SUBSTITUTE } else { cp };

    // Assemble the sequence locally; each `as u8` below is lossless because
    // the operand has already been shifted/masked into the range of a byte.
    let mut bytes = [0u8; 4];
    let len = match cp {
        0x0000..=0x007F => {
            bytes[0] = cp as u8;
            1
        }
        0x0080..=0x07FF => {
            bytes[0] = 0xC0 | (cp >> 6) as u8;
            bytes[1] = 0x80 | (cp & 0x3F) as u8;
            2
        }
        0x0800..=0xFFFF => {
            bytes[0] = 0xE0 | (cp >> 12) as u8;
            bytes[1] = 0x80 | ((cp >> 6) & 0x3F) as u8;
            bytes[2] = 0x80 | (cp & 0x3F) as u8;
            3
        }
        _ => {
            // cp is at most 0x10FFFF here, so `cp >> 18` is at most 4.
            bytes[0] = 0xF0 | (cp >> 18) as u8;
            bytes[1] = 0x80 | ((cp >> 12) & 0x3F) as u8;
            bytes[2] = 0x80 | ((cp >> 6) & 0x3F) as u8;
            bytes[3] = 0x80 | (cp & 0x3F) as u8;
            4
        }
    };

    buf[..len].copy_from_slice(&bytes[..len]);
    len
}
/// Selects how `TextDecoder` treats input declared as a codepage-backed
/// single-byte encoding. US-ASCII, UTF-8 and UCS-2 input is decoded the same
/// way in both modes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeMode {
/// Always decode with the declared encoding.
Strict,
/// Try UTF-8 first; at the first sequence that fails to decode, switch to the
/// declared encoding's codepage for the rest of the input.
Auto,
}
/// Cursor over a byte buffer that yields code points according to an
/// `Encoding` and a `DecodeMode`.
pub struct TextDecoder<'a> {
/// Input bytes being decoded.
buf: &'a [u8],
/// Index into `buf` of the next byte to decode.
pos: usize,
/// Encoding used to interpret `buf`.
encoding: Encoding,
/// Whether codepage-backed encodings attempt UTF-8 first (`Auto`) or not (`Strict`).
mode: DecodeMode,
/// Set once `Auto` mode has met undecodable UTF-8; from then on the codepage is used.
fell_back: bool,
}
impl<'a> TextDecoder<'a> {
pub fn new(buf: &'a [u8], encoding: Encoding, mode: DecodeMode) -> Result<Self> {
if encoding == Encoding::Unknown {
return Err(Error::UnknownTextEncoding(
"cannot decode with Encoding::Unknown".to_string(),
));
}
Ok(TextDecoder {
buf,
pos: 0,
encoding,
mode,
fell_back: false,
})
}
pub fn utf8(buf: &'a [u8]) -> Self {
TextDecoder {
buf,
pos: 0,
encoding: Encoding::Utf8,
mode: DecodeMode::Strict,
fell_back: false,
}
}
pub fn is_eof(&self) -> bool {
self.pos >= self.buf.len()
}
pub fn remaining(&self) -> &[u8] {
&self.buf[self.pos..]
}
pub fn peek(&self) -> Option<u32> {
if self.is_eof() {
return None;
}
let mut clone = TextDecoder {
buf: self.buf,
pos: self.pos,
encoding: self.encoding,
mode: self.mode,
fell_back: self.fell_back,
};
clone.next_codepoint()
}
pub fn next_codepoint(&mut self) -> Option<u32> {
if self.is_eof() {
return None;
}
let cp = self.decode_one();
Some(cp)
}
pub fn collect_codepoints(&mut self) -> Vec<u32> {
let mut out = Vec::with_capacity(self.buf.len() - self.pos);
while let Some(cp) = self.next_codepoint() {
if cp == 0 {
break; }
out.push(cp);
}
out
}
pub fn decode_to_string(&mut self) -> String {
let codepoints = self.collect_codepoints();
codepoints
.into_iter()
.map(|cp| char::from_u32(cp).unwrap_or('\u{FFFD}'))
.collect()
}
fn decode_one(&mut self) -> u32 {
match self.encoding {
Encoding::UsAscii => self.decode_ascii(),
Encoding::Utf8 => self.decode_utf8(),
Encoding::Ucs2 => self.decode_ucs2(),
enc if enc.is_single_byte() => {
if self.mode == DecodeMode::Auto && !self.fell_back {
self.decode_auto()
} else {
self.decode_codepage()
}
}
_ => {
self.pos += 1;
REPLACEMENT_CHAR
}
}
}
fn decode_ascii(&mut self) -> u32 {
let b = self.buf[self.pos];
self.pos += 1;
if b < 0x80 { b as u32 } else { REPLACEMENT_CHAR }
}
fn decode_utf8(&mut self) -> u32 {
let (cp, consumed) = utf8_decode_one(&self.buf[self.pos..]);
self.pos += consumed;
cp
}
fn decode_codepage(&mut self) -> u32 {
let b = self.buf[self.pos];
self.pos += 1;
if b < 0x80 {
b as u32
} else if let Some(table) = self.encoding.codepage() {
table[(b - 0x80) as usize] as u32
} else {
REPLACEMENT_CHAR
}
}
fn decode_auto(&mut self) -> u32 {
let saved_pos = self.pos;
let (cp, consumed) = utf8_decode_one(&self.buf[self.pos..]);
if cp == REPLACEMENT_CHAR {
self.fell_back = true;
self.pos = saved_pos;
self.decode_codepage()
} else {
self.pos += consumed;
cp
}
}
fn decode_ucs2(&mut self) -> u32 {
if self.pos + 1 >= self.buf.len() {
self.pos = self.buf.len();
return REPLACEMENT_CHAR;
}
let lo = self.buf[self.pos] as u32;
let hi = self.buf[self.pos + 1] as u32;
self.pos += 2;
lo | (hi << 8)
}
}
/// Iterates over decoded code points, treating NUL as a terminator.
impl<'a> Iterator for TextDecoder<'a> {
    type Item = u32;

    /// Yields the next code point. A decoded NUL ends iteration and discards
    /// the rest of the input, so later calls keep returning `None`.
    fn next(&mut self) -> Option<u32> {
        match self.next_codepoint() {
            Some(0) => {
                self.pos = self.buf.len();
                None
            }
            other => other,
        }
    }
}
/// Decodes `bytes` as UTF-8 into a `String`, stopping at the first NUL.
///
/// Decoding uses `TextDecoder::utf8` followed by
/// `TextDecoder::decode_to_string`.
pub fn decode_utf8_to_string(bytes: &[u8]) -> String {
    TextDecoder::utf8(bytes).decode_to_string()
}
/// Decodes `bytes` in `encoding` (strict mode) into a `String`, stopping at
/// the first NUL.
///
/// # Errors
///
/// Propagates the error from `TextDecoder::new`, which rejects
/// `Encoding::Unknown`.
pub fn decode_to_string(bytes: &[u8], encoding: Encoding) -> Result<String> {
    Ok(TextDecoder::new(bytes, encoding, DecodeMode::Strict)?.decode_to_string())
}
// Unit tests for the UTF-8 helpers, encoding-name lookup, codepage tables and
// `TextDecoder`.
#[cfg(test)]
mod tests {
use super::*;
// --- utf8_decode_one ---
// Every byte below 0x80 decodes to itself and consumes one byte.
#[test]
fn utf8_decode_ascii_range() {
for b in 0u8..0x80 {
let (cp, n) = utf8_decode_one(&[b]);
assert_eq!(cp, b as u32, "ascii byte 0x{b:02x}");
assert_eq!(n, 1);
}
}
// U+00E9 encoded as two bytes.
#[test]
fn utf8_decode_two_byte() {
let (cp, n) = utf8_decode_one(&[0xC3, 0xA9]);
assert_eq!(cp, 0x00E9);
assert_eq!(n, 2);
}
// U+20AC encoded as three bytes.
#[test]
fn utf8_decode_three_byte() {
let (cp, n) = utf8_decode_one(&[0xE2, 0x82, 0xAC]);
assert_eq!(cp, 0x20AC);
assert_eq!(n, 3);
}
// U+1F600 encoded as four bytes.
#[test]
fn utf8_decode_four_byte() {
let (cp, n) = utf8_decode_one(&[0xF0, 0x9F, 0x98, 0x80]);
assert_eq!(cp, 0x1F600);
assert_eq!(n, 4);
}
// NOTE(review): despite the name, the input is a lone continuation byte
// (0x80), not an overlong encoding.
#[test]
fn utf8_decode_overlong_replacement() {
let (cp, n) = utf8_decode_one(&[0x80]);
assert_eq!(cp, REPLACEMENT_CHAR);
assert_eq!(n, 1);
}
// A two-byte lead followed by a non-continuation byte consumes only the lead.
#[test]
fn utf8_decode_bad_continuation() {
let (cp, n) = utf8_decode_one(&[0xC3, 0x20]);
assert_eq!(cp, REPLACEMENT_CHAR);
assert_eq!(n, 1);
}
// Highest accepted value, U+10FFFF.
#[test]
fn utf8_decode_codepoint_max() {
let (cp, n) = utf8_decode_one(&[0xF4, 0x8F, 0xBF, 0xBF]);
assert_eq!(cp, 0x10FFFF);
assert_eq!(n, 4);
}
// A four-byte sequence that decodes to 0x110000 is replaced; the consumed
// count is not checked here.
#[test]
fn utf8_decode_above_max_is_replacement() {
let (cp, _) = utf8_decode_one(&[0xF4, 0x90, 0x80, 0x80]);
assert_eq!(cp, REPLACEMENT_CHAR);
}
// A well-formed encoding of U+FFFD is reported as U+001A (described by the
// assertion message as the C workaround).
#[test]
fn utf8_decode_iumlaut_half_bug_workaround() {
let (cp, n) = utf8_decode_one(&[0xEF, 0xBF, 0xBD]);
assert_eq!(cp, 0x001A, "expected the C workaround U+001A, got 0x{cp:04x}");
assert_eq!(n, 3);
}
// Encodes each listed code point with `char::encode_utf8` and checks that
// `utf8_decode_one` reads it back.
// NOTE(review): 0xFFFD is not in the list, so the 0x001A substitution in
// `expected` is never exercised by this test.
#[test]
fn utf8_roundtrip_bmp() {
let mut buf = [0u8; 4];
for cp in [0u32, 0x41, 0xFF, 0x100, 0x7FF, 0x800, 0xFFFE, 0xFFFF] {
if let Some(ch) = char::from_u32(cp) {
let s = ch.encode_utf8(&mut buf);
let (decoded, _) = utf8_decode_one(s.as_bytes());
let expected = if cp == 0xFFFD { 0x001A } else { cp };
assert_eq!(decoded, expected, "cp=U+{cp:04X}");
}
}
}
// --- utf8_encode_one ---
// One sample per encoded length (1 to 4 bytes).
#[test]
fn utf8_encode_ascii() {
let mut buf = [0u8; 4];
assert_eq!(utf8_encode_one(b'A' as u32, &mut buf), 1);
assert_eq!(buf[0], b'A');
}
#[test]
fn utf8_encode_two_byte() {
let mut buf = [0u8; 4];
let n = utf8_encode_one(0x00E9, &mut buf); assert_eq!(n, 2);
assert_eq!(&buf[..2], &[0xC3, 0xA9]);
}
#[test]
fn utf8_encode_three_byte() {
let mut buf = [0u8; 4];
let n = utf8_encode_one(0x20AC, &mut buf); assert_eq!(n, 3);
assert_eq!(&buf[..3], &[0xE2, 0x82, 0xAC]);
}
#[test]
fn utf8_encode_four_byte() {
let mut buf = [0u8; 4];
let n = utf8_encode_one(0x1F600, &mut buf); assert_eq!(n, 4);
assert_eq!(&buf[..4], &[0xF0, 0x9F, 0x98, 0x80]);
}
// --- Encoding::from_name ---
// Lookup is case-insensitive ("utf-8" is accepted).
#[test]
fn encoding_from_name_utf8() {
assert_eq!(Encoding::from_name("UTF-8"), Encoding::Utf8);
assert_eq!(Encoding::from_name("UTF8"), Encoding::Utf8);
assert_eq!(Encoding::from_name("utf-8"), Encoding::Utf8); }
#[test]
fn encoding_from_name_ascii_aliases() {
for alias in &["ASCII", "US-ASCII", "ANSI_X3.4-1968", "IBM367"] {
assert_eq!(
Encoding::from_name(alias),
Encoding::UsAscii,
"alias: {alias}"
);
}
}
#[test]
fn encoding_from_name_latin1_aliases() {
for alias in &["ISO-8859-1", "ISO_8859-1", "LATIN1", "L1", "IBM819"] {
assert_eq!(
Encoding::from_name(alias),
Encoding::Iso8859_1,
"alias: {alias}"
);
}
}
#[test]
fn encoding_from_name_koi8r() {
assert_eq!(Encoding::from_name("KOI8-R"), Encoding::Koi8R);
assert_eq!(Encoding::from_name("CSKOI8R"), Encoding::Koi8R);
}
// Unlisted names, including the empty string, map to `Encoding::Unknown`.
#[test]
fn encoding_from_name_unknown() {
assert_eq!(Encoding::from_name("bogus"), Encoding::Unknown);
assert_eq!(Encoding::from_name(""), Encoding::Unknown);
assert_eq!(Encoding::from_name("SHIFT_JIS"),Encoding::Unknown); }
// --- codepage tables ---
// Table index i corresponds to byte 0x80 + i; for ISO 8859-1 the entry equals
// the byte value.
#[test]
fn iso8859_1_is_identity() {
let table = Encoding::Iso8859_1.codepage().unwrap();
for i in 0usize..128 {
assert_eq!(table[i] as usize, i + 0x80, "byte 0x{:02X}", i + 0x80);
}
}
// Byte 0xA4 in ISO 8859-15 is U+20AC.
#[test]
fn iso8859_15_euro_sign() {
let table = Encoding::Iso8859_15.codepage().unwrap();
let idx = 0xA4usize - 0x80; assert_eq!(table[idx], 0x20AC);
}
// Pins the KOI8-R entry for byte 0xC1 to U+00C1; per the assertion message
// this mirrors the espeak-ng C table.
#[test]
fn koi8r_sample() {
let table = Encoding::Koi8R.codepage().unwrap();
let idx = 0xC1usize - 0x80; assert_eq!(table[idx], 0x00C1,
"espeak-ng KOI8-R table at 0xC1 should be U+00C1 (mirrors C source)");
}
// --- TextDecoder ---
#[test]
fn text_decoder_utf8_hello() {
let input = b"hello";
let codepoints: Vec<u32> = TextDecoder::utf8(input).collect();
assert_eq!(codepoints, vec![b'h' as u32, b'e' as u32, b'l' as u32,
b'l' as u32, b'o' as u32]);
}
#[test]
fn text_decoder_utf8_multibyte() {
let input = "café".as_bytes();
let codepoints: Vec<u32> = TextDecoder::utf8(input).collect();
assert_eq!(codepoints, vec![b'c' as u32, b'a' as u32, b'f' as u32, 0x00E9]);
}
// The iterator stops at the first NUL and yields nothing after it.
#[test]
fn text_decoder_null_terminates() {
let input = b"hi\x00world";
let codepoints: Vec<u32> = TextDecoder::utf8(input).collect();
assert_eq!(codepoints, vec![b'h' as u32, b'i' as u32]);
}
#[test]
fn text_decoder_iso8859_1() {
let input = &[0xE9u8];
let mut dec = TextDecoder::new(input, Encoding::Iso8859_1, DecodeMode::Strict).unwrap();
let cp = dec.next_codepoint().unwrap();
assert_eq!(cp, 0x00E9);
}
#[test]
fn text_decoder_iso8859_15_euro() {
let input = &[0xA4u8];
let mut dec = TextDecoder::new(input, Encoding::Iso8859_15, DecodeMode::Strict).unwrap();
let cp = dec.next_codepoint().unwrap();
assert_eq!(cp, 0x20AC);
}
// US-ASCII decoding replaces bytes 0x80 and above.
#[test]
fn text_decoder_ascii_rejects_high_bytes() {
let input = &[0x80u8];
let mut dec = TextDecoder::new(input, Encoding::UsAscii, DecodeMode::Strict).unwrap();
let cp = dec.next_codepoint().unwrap();
assert_eq!(cp, REPLACEMENT_CHAR);
}
// In Auto mode, input that decodes as UTF-8 does not set `fell_back`.
#[test]
fn text_decoder_auto_mode_utf8_first() {
let mut dec = TextDecoder::new(
b"hi",
Encoding::Iso8859_1, DecodeMode::Auto,
).unwrap();
assert_eq!(dec.next_codepoint(), Some(b'h' as u32));
assert_eq!(dec.next_codepoint(), Some(b'i' as u32));
assert!(!dec.fell_back, "should not have fallen back");
}
// In Auto mode, a byte that is not valid UTF-8 is decoded through the
// codepage and sets `fell_back`.
#[test]
fn text_decoder_auto_mode_fallback_on_bad_utf8() {
let mut dec = TextDecoder::new(
&[0xA4u8],
Encoding::Iso8859_15,
DecodeMode::Auto,
).unwrap();
let cp = dec.next_codepoint().unwrap();
assert_eq!(cp, 0x20AC, "expected euro sign U+20AC");
assert!(dec.fell_back, "should have fallen back to codepage");
}
// UCS-2 input is read as little-endian 16-bit units.
#[test]
fn text_decoder_ucs2_hello() {
let input = &[0x48u8, 0x00, 0x69, 0x00];
let codepoints: Vec<u32> = TextDecoder::new(input, Encoding::Ucs2, DecodeMode::Strict)
.unwrap()
.collect();
assert_eq!(codepoints, vec![b'H' as u32, b'i' as u32]);
}
#[test]
fn text_decoder_eof_flag() {
let mut dec = TextDecoder::utf8(b"x");
assert!(!dec.is_eof());
dec.next_codepoint();
assert!(dec.is_eof());
}
// --- convenience functions ---
#[test]
fn decode_utf8_to_string_emoji() {
let s = "😀 world";
let decoded = decode_utf8_to_string(s.as_bytes());
assert_eq!(decoded, s);
}
#[test]
fn decode_to_string_iso8859_1_cafe() {
let input = b"caf\xE9";
let s = decode_to_string(input, Encoding::Iso8859_1).unwrap();
assert_eq!(s, "café");
}
// `TextDecoder::new` refuses `Encoding::Unknown`.
#[test]
fn decoder_error_on_unknown_encoding() {
let result = TextDecoder::new(b"x", Encoding::Unknown, DecodeMode::Strict);
assert!(result.is_err());
}
}