/// Character set used to convert between raw bytes and Rust strings.
///
/// Values are cheap to copy: the data-carrying variants only hold `'static`
/// references. [`Charset::Utf8`] is the default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Charset {
/// UTF-8. Decoding is lossy (invalid sequences are replaced, not reported).
#[default]
Utf8,
/// ISO-8859-1: every byte maps to the code point with the same value.
Latin1,
/// Windows-1252, converted with the hand-written tables in this file.
Win1252,
/// Any other encoding, delegated to `encoding_rs`.
/// Only present when the `charset-full` feature is enabled.
#[cfg(feature = "charset-full")]
Encoding(&'static encoding_rs::Encoding),
/// Single-byte DOS code page. Bytes below 0x80 are ASCII; the table holds
/// the characters for bytes 0x80..=0xFF, indexed by `byte - 0x80`.
Dos(&'static [char; 128]),
/// The name was not recognised. Decoding and encoding treat the data
/// exactly like [`Charset::Utf8`].
Unknown,
}
impl Charset {
    /// Resolves a character-set name to a [`Charset`].
    ///
    /// The name is normalised before matching: every character that is not an
    /// ASCII letter or digit is dropped and the remainder is upper-cased, so
    /// `"utf-8"`, `"UTF8"` and `"Utf_8"` all resolve identically.
    ///
    /// Lookup order is: the built-in names, then the DOS code-page tables,
    /// then (with the `charset-full` feature) the `encoding_rs` encodings.
    /// A name that matches nothing yields [`Charset::Unknown`].
    pub fn from_name(name: &str) -> Self {
        let normalized: String = name
            .chars()
            .filter(|c| c.is_ascii_alphanumeric())
            .map(|c| c.to_ascii_uppercase())
            .collect();
        match normalized.as_str() {
            "UTF8" | "UNICODEFSS" => Charset::Utf8,
            "ISO88591" | "LATIN1" => Charset::Latin1,
            "WIN1252" | "WINDOWS1252" => Charset::Win1252,
            other => match dos_table(other) {
                Some(table) => Charset::Dos(table),
                None => Self::resolve_extra(other),
            },
        }
    }

    /// Looks up a normalised name among the `encoding_rs` encodings.
    ///
    /// Returns [`Charset::Unknown`] when the name has no WHATWG label or
    /// `encoding_rs` does not know that label.
    #[cfg(feature = "charset-full")]
    fn resolve_extra(normalized: &str) -> Self {
        whatwg_label(normalized)
            .and_then(|label| encoding_rs::Encoding::for_label(label.as_bytes()))
            .map_or(Charset::Unknown, Charset::Encoding)
    }

    /// Without the `charset-full` feature there are no extra encodings, so
    /// every name that reaches this point is unknown.
    #[cfg(not(feature = "charset-full"))]
    fn resolve_extra(_normalized: &str) -> Self {
        Charset::Unknown
    }

    /// Decodes `raw` bytes in this character set into a `String`.
    ///
    /// Decoding never fails. UTF-8 (and `Unknown`) input is decoded lossily;
    /// the single-byte charsets map every byte to some character.
    pub fn decode(self, raw: &[u8]) -> String {
        match self {
            Charset::Utf8 | Charset::Unknown => String::from_utf8_lossy(raw).into_owned(),
            Charset::Latin1 => raw.iter().map(|&b| char::from(b)).collect(),
            Charset::Win1252 => raw.iter().map(|&b| win1252_char(b)).collect(),
            // `Encoding::decode` sniffs for a BOM and, if the data happens to
            // start with EF BB BF / FF FE / FE FF, decodes it as UTF-8 or
            // UTF-16 instead. The charset is fixed here, so those bytes are
            // ordinary characters and BOM handling must stay off.
            #[cfg(feature = "charset-full")]
            Charset::Encoding(enc) => enc.decode_without_bom_handling(raw).0.into_owned(),
            Charset::Dos(table) => raw
                .iter()
                .map(|&b| match b.checked_sub(0x80) {
                    // High half: look the character up in the code-page table.
                    Some(index) => table[usize::from(index)],
                    // Low half is plain ASCII.
                    None => char::from(b),
                })
                .collect(),
        }
    }

    /// Encodes `s` into bytes in this character set.
    ///
    /// Characters the charset cannot represent are written as `?`.
    /// `Utf8` and `Unknown` emit the string's UTF-8 bytes unchanged.
    pub fn encode(self, s: &str) -> Vec<u8> {
        match self {
            Charset::Utf8 | Charset::Unknown => s.as_bytes().to_vec(),
            Charset::Latin1 => s
                .chars()
                .map(|c| if (c as u32) <= 0xFF { c as u8 } else { b'?' })
                .collect(),
            Charset::Win1252 => s.chars().map(win1252_byte).collect(),
            #[cfg(feature = "charset-full")]
            Charset::Encoding(enc) => Self::encode_lossy(enc, s),
            Charset::Dos(table) => s
                .chars()
                .map(|c| {
                    if c.is_ascii() {
                        c as u8
                    } else {
                        // Reverse lookup in the 128-entry table; the index is
                        // below 128, so `index + 0x80` always fits in a byte.
                        table
                            .iter()
                            .position(|&entry| entry == c)
                            .map_or(b'?', |index| (index + 0x80) as u8)
                    }
                })
                .collect(),
        }
    }

    /// Encodes `s` with an `encoding_rs` encoding, writing `?` for every
    /// character the target encoding cannot represent.
    ///
    /// `Encoding::encode` is deliberately not used: it substitutes HTML
    /// numeric character references (`&#NNNN;`) for unmappable characters,
    /// which is inconsistent with the other charsets.
    #[cfg(feature = "charset-full")]
    fn encode_lossy(enc: &'static encoding_rs::Encoding, s: &str) -> Vec<u8> {
        let mut encoder = enc.new_encoder();
        let mut out = Vec::with_capacity(s.len());
        let mut chunk = [0u8; 1024];
        let mut rest = s;
        loop {
            let (result, read, written) =
                encoder.encode_from_utf8_without_replacement(rest, &mut chunk, true);
            out.extend_from_slice(&chunk[..written]);
            // `read` already includes the unmappable character, if any.
            rest = &rest[read..];
            match result {
                encoding_rs::EncoderResult::InputEmpty => return out,
                // Scratch buffer filled up; go round again with the remainder.
                encoding_rs::EncoderResult::OutputFull => {}
                encoding_rs::EncoderResult::Unmappable(_) => out.push(b'?'),
            }
        }
    }
}
/// Returns the high-half table (bytes 0x80..=0xFF) for a DOS code page.
///
/// `n` must already be normalised (upper-case, alphanumerics only), e.g.
/// `"DOS850"`. Returns `None` for any name that is not one of the supported
/// `DOSnnn` code pages.
fn dos_table(n: &str) -> Option<&'static [char; 128]> {
    use crate::dos::*;
    // Every supported name is "DOS" followed by the code-page number.
    let table: &'static [char; 128] = match n.strip_prefix("DOS")? {
        "437" => &CP437,
        "737" => &CP737,
        "775" => &CP775,
        "850" => &CP850,
        "852" => &CP852,
        "855" => &CP855,
        "857" => &CP857,
        "858" => &CP858,
        "860" => &CP860,
        "861" => &CP861,
        "862" => &CP862,
        "863" => &CP863,
        "864" => &CP864,
        "865" => &CP865,
        "866" => &CP866,
        "869" => &CP869,
        _ => return None,
    };
    Some(table)
}
/// Maps a normalised charset name (upper-case, alphanumerics only) to the
/// label string that is handed to `encoding_rs::Encoding::for_label`.
///
/// Returns `None` when the name has no entry here. Note that `ISO88591`
/// is intentionally absent: any `ISO8859` name with an unlisted part number
/// yields `None`.
#[cfg(feature = "charset-full")]
fn whatwg_label(n: &str) -> Option<&'static str> {
    // ISO-8859 family: the text after the prefix is the part number.
    if let Some(part) = n.strip_prefix("ISO8859") {
        let label = match part {
            "2" => "iso-8859-2",
            "3" => "iso-8859-3",
            "4" => "iso-8859-4",
            "5" => "iso-8859-5",
            "6" => "iso-8859-6",
            "7" => "iso-8859-7",
            "8" => "iso-8859-8",
            "9" => "iso-8859-9",
            "10" => "iso-8859-10",
            "13" => "iso-8859-13",
            "14" => "iso-8859-14",
            "15" => "iso-8859-15",
            "16" => "iso-8859-16",
            _ => return None,
        };
        return Some(label);
    }

    let label = match n {
        "SJIS0208" | "SJIS" | "SHIFTJIS" => "shift_jis",
        "EUCJ0208" | "EUCJP" => "euc-jp",
        "KSC5601" | "EUCKR" => "euc-kr",
        "GB2312" | "GBK" => "gbk",
        "GB18030" => "gb18030",
        "BIG5" => "big5",
        "KOI8R" => "koi8-r",
        "KOI8U" => "koi8-u",
        "TIS620" => "windows-874",
        "WIN1250" => "windows-1250",
        "WIN1251" => "windows-1251",
        "WIN1253" => "windows-1253",
        "WIN1254" => "windows-1254",
        "WIN1255" => "windows-1255",
        "WIN1256" => "windows-1256",
        "WIN1257" => "windows-1257",
        "WIN1258" => "windows-1258",
        _ => return None,
    };
    Some(label)
}
/// Decodes one Windows-1252 byte to its character.
///
/// Bytes outside 0x80..=0x9F map to the code point with the same value.
/// Inside that range the byte is looked up in a table; the five positions
/// Windows-1252 leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) also map to
/// the code point with the same value.
fn win1252_char(b: u8) -> char {
    // Characters for bytes 0x80..=0x9F, indexed by `byte - 0x80`.
    const HIGH: [char; 32] = [
        '\u{20AC}', '\u{0081}', '\u{201A}', '\u{0192}', // 0x80..=0x83
        '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}', // 0x84..=0x87
        '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', // 0x88..=0x8B
        '\u{0152}', '\u{008D}', '\u{017D}', '\u{008F}', // 0x8C..=0x8F
        '\u{0090}', '\u{2018}', '\u{2019}', '\u{201C}', // 0x90..=0x93
        '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}', // 0x94..=0x97
        '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}', // 0x98..=0x9B
        '\u{0153}', '\u{009D}', '\u{017E}', '\u{0178}', // 0x9C..=0x9F
    ];
    match b {
        0x80..=0x9F => HIGH[usize::from(b - 0x80)],
        _ => char::from(b),
    }
}
/// Encodes one character as a Windows-1252 byte, or `?` if it has none.
///
/// This is the exact inverse of `win1252_char`: a character is encoded to
/// byte `b` only if decoding `b` yields that same character.
fn win1252_byte(c: char) -> u8 {
    match c {
        '\u{20AC}' => 0x80,
        '\u{201A}' => 0x82,
        '\u{0192}' => 0x83,
        '\u{201E}' => 0x84,
        '\u{2026}' => 0x85,
        '\u{2020}' => 0x86,
        '\u{2021}' => 0x87,
        '\u{02C6}' => 0x88,
        '\u{2030}' => 0x89,
        '\u{0160}' => 0x8A,
        '\u{2039}' => 0x8B,
        '\u{0152}' => 0x8C,
        '\u{017D}' => 0x8E,
        '\u{2018}' => 0x91,
        '\u{2019}' => 0x92,
        '\u{201C}' => 0x93,
        '\u{201D}' => 0x94,
        '\u{2022}' => 0x95,
        '\u{2013}' => 0x96,
        '\u{2014}' => 0x97,
        '\u{02DC}' => 0x98,
        '\u{2122}' => 0x99,
        '\u{0161}' => 0x9A,
        '\u{203A}' => 0x9B,
        '\u{0153}' => 0x9C,
        '\u{017E}' => 0x9E,
        '\u{0178}' => 0x9F,
        // The five byte values Windows-1252 leaves unassigned decode to the
        // C1 control with the same number, so only these C1 controls
        // round-trip.
        '\u{0081}' | '\u{008D}' | '\u{008F}' | '\u{0090}' | '\u{009D}' => c as u8,
        // ASCII and the Latin-1 upper half are identical in Windows-1252.
        '\u{0000}'..='\u{007F}' | '\u{00A0}'..='\u{00FF}' => c as u8,
        // Everything else is unrepresentable. That includes the remaining C1
        // controls (e.g. U+0080): their byte values are taken by other
        // characters (0x80 is the euro sign), so emitting them verbatim would
        // silently turn the text into something else.
        _ => b'?',
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Built-in names resolve regardless of case and punctuation; unknown
    /// names fall back to `Charset::Unknown`.
    #[test]
    fn name_resolution() {
        let expectations = [
            ("UTF8", Charset::Utf8),
            ("utf-8", Charset::Utf8),
            ("ISO8859_1", Charset::Latin1),
            ("Latin1", Charset::Latin1),
            ("WIN1252", Charset::Win1252),
            ("NOSUCHCHARSET", Charset::Unknown),
        ];
        for (name, expected) in expectations {
            assert_eq!(Charset::from_name(name), expected);
        }
    }

    /// Latin-1 bytes map one-to-one onto code points.
    #[test]
    fn latin1_decode() {
        let decoded = Charset::Latin1.decode(&[0x48, 0xE9, 0xF1]);
        assert_eq!(decoded, "Héñ");
    }

    /// Windows-1252 decodes both its special 0x80..=0x9F range and the
    /// Latin-1 compatible upper half.
    #[test]
    fn win1252_decode() {
        let cases: [(&[u8], &str); 3] = [
            (&[0x80], "€"),
            (&[0x93, 0x94], "\u{201C}\u{201D}"),
            (&[0xE9], "é"),
        ];
        for (bytes, expected) in cases {
            assert_eq!(Charset::Win1252.decode(bytes), expected);
        }
    }

    /// Valid UTF-8 comes back unchanged.
    #[test]
    fn utf8_passthrough() {
        let text = "café €";
        assert_eq!(Charset::Utf8.decode(text.as_bytes()), text);
    }

    /// Decoding and then encoding reproduces the original bytes.
    #[test]
    fn encode_inverts_decode() {
        let samples: [(Charset, &[u8]); 2] = [
            (Charset::Latin1, &[0x48, 0xE9, 0xF1, 0x20, 0xFF]),
            (Charset::Win1252, &[0x80, 0x93, 0x94, 0xE9, 0x97]),
        ];
        for (cs, bytes) in samples {
            let text = cs.decode(bytes);
            let reencoded = cs.encode(&text);
            assert_eq!(reencoded, bytes, "roundtrip falhou para {cs:?}");
        }
    }

    /// Characters with no byte in the target charset become `?`.
    #[test]
    fn encode_unrepresentable_is_question_mark() {
        let latin1 = Charset::Latin1.encode("a€b");
        assert_eq!(latin1, b"a?b");

        let win1252 = Charset::Win1252.encode("x\u{4E00}y");
        assert_eq!(win1252, b"x?y");
    }

    /// DOS code pages resolve by name and round-trip through their tables.
    #[test]
    fn dos_code_pages_resolve_and_roundtrip() {
        for name in ["DOS850", "DOS437"] {
            assert!(matches!(Charset::from_name(name), Charset::Dos(_)));
        }

        let cp850 = Charset::from_name("DOS850");
        let cp850_bytes = [0x41u8, 0x82, 0xA5];
        assert_eq!(cp850.decode(&cp850_bytes), "Aé\u{D1}");
        assert_eq!(cp850.encode("Aé\u{D1}"), cp850_bytes.to_vec());

        let cp860 = Charset::from_name("DOS860");
        let cp860_bytes = [0x84u8, 0x85, 0x94];
        assert_eq!(cp860.decode(&cp860_bytes), "ãàõ");
        assert_eq!(cp860.encode("ãàõ"), cp860_bytes.to_vec());

        // The euro sign is not part of CP850.
        assert_eq!(cp850.encode("€"), b"?");
    }

    /// Multi-byte charsets need the `charset-full` feature; without it their
    /// names are unknown.
    #[cfg(not(feature = "charset-full"))]
    #[test]
    fn multibyte_without_feature_is_unknown() {
        for name in ["SJIS_0208", "EUCJ_0208"] {
            assert_eq!(Charset::from_name(name), Charset::Unknown);
        }
    }

    /// Tests that only make sense when `encoding_rs` is compiled in.
    #[cfg(feature = "charset-full")]
    mod full {
        use super::*;

        /// Each of these names must be handed off to `encoding_rs`.
        #[test]
        fn resolves_multibyte_names() {
            let names = [
                "SJIS_0208",
                "EUCJ_0208",
                "GBK",
                "BIG_5",
                "WIN1251",
                "ISO8859_2",
            ];
            for name in names {
                let resolved = Charset::from_name(name);
                assert!(
                    matches!(resolved, Charset::Encoding(_)),
                    "{name} não resolveu para encoding_rs"
                );
            }
        }

        /// Shift_JIS encodes to the expected byte pairs and decodes back.
        #[test]
        fn shift_jis_roundtrip() {
            let sjis = Charset::from_name("SJIS_0208");
            let text = "日本語";
            let encoded = sjis.encode(text);
            assert_eq!(encoded, vec![0x93, 0xfa, 0x96, 0x7b, 0x8c, 0xea]);
            assert_eq!(sjis.decode(&encoded), text);
        }

        /// Windows-1251 decodes a Cyrillic capital letter.
        #[test]
        fn win1251_decode_cyrillic() {
            let decoded = Charset::from_name("WIN1251").decode(&[0xcf]);
            assert_eq!(decoded, "П");
        }

        /// ISO-8859-15 has the euro sign at 0xA4.
        #[test]
        fn iso8859_15_euro() {
            let latin9 = Charset::from_name("ISO8859_15");
            assert_eq!(latin9.decode(&[0xA4]), "€");
        }
    }
}