eml_codec/mime/
charset.rs

1use encoding_rs::Encoding;
2
/// Character sets that can be named in an email, following IANA's registry.
///
/// This crate keeps its own charset list so that it can follow IANA's one.
/// `encoding_rs` implements a different standard that does not know US-ASCII,
/// so using the `encoding_rs` data structures directly would lead to a loss of
/// information.
///
/// See <https://www.iana.org/assignments/character-sets/character-sets.xhtml>.
// Variant names deliberately mirror the IANA labels (e.g. `ISO_8859_1`), which
// is why the naming lint is silenced here.
#[allow(non_camel_case_types)]
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub enum EmailCharset {
    /// Variant produced by `EmailCharset::default()`.
    #[default]
    US_ASCII,
    ISO_8859_1,
    ISO_8859_2,
    ISO_8859_3,
    ISO_8859_4,
    ISO_8859_5,
    ISO_8859_6,
    ISO_8859_7,
    ISO_8859_8,
    ISO_8859_9,
    ISO_8859_10,
    Shift_JIS,
    EUC_JP,
    ISO_2022_KR,
    EUC_KR,
    ISO_2022_JP,
    ISO_2022_JP_2,
    ISO_8859_6_E,
    ISO_8859_6_I,
    ISO_8859_8_E,
    ISO_8859_8_I,
    GB2312,
    Big5,
    KOI8_R,
    UTF_8,
    /// Catch-all for a label that does not correspond to any other variant.
    Unknown,
}
40
41impl<'a> From<&'a str> for EmailCharset {
42    fn from(s: &'a str) -> Self {
43        Self::from(s.as_bytes())
44    }
45}
46
47impl<'a> From<&'a [u8]> for EmailCharset {
48    fn from(s: &'a [u8]) -> Self {
49        match s.to_ascii_lowercase().as_slice() {
50            b"us-ascii" | b"ascii" => EmailCharset::US_ASCII,
51            b"iso-8859-1" => EmailCharset::ISO_8859_1,
52            b"iso-8859-2" => EmailCharset::ISO_8859_2,
53            b"iso-8859-3" => EmailCharset::ISO_8859_3,
54            b"iso-8859-4" => EmailCharset::ISO_8859_4,
55            b"iso-8859-5" => EmailCharset::ISO_8859_5,
56            b"iso-8859-6" => EmailCharset::ISO_8859_6,
57            b"iso-8859-7" => EmailCharset::ISO_8859_7,
58            b"iso-8859-8" => EmailCharset::ISO_8859_8,
59            b"iso-8859-9" => EmailCharset::ISO_8859_9,
60            b"iso-8859-10" => EmailCharset::ISO_8859_10,
61            b"shift_jis" => EmailCharset::Shift_JIS,
62            b"euc-jp" => EmailCharset::EUC_JP,
63            b"iso-2022-kr" => EmailCharset::ISO_2022_KR,
64            b"euc-kr" => EmailCharset::EUC_KR,
65            b"iso-2022-jp" => EmailCharset::ISO_2022_JP,
66            b"iso-2022-jp-2" => EmailCharset::ISO_2022_JP_2,
67            b"iso-8859-6-e" => EmailCharset::ISO_8859_6_E,
68            b"iso-8859-6-i" => EmailCharset::ISO_8859_6_I,
69            b"iso-8859-8-e" => EmailCharset::ISO_8859_8_E,
70            b"iso-8859-8-i" => EmailCharset::ISO_8859_8_I,
71            b"gb2312" => EmailCharset::GB2312,
72            b"big5" => EmailCharset::Big5,
73            b"koi8-r" => EmailCharset::KOI8_R,
74            b"utf-8" | b"utf8" => EmailCharset::UTF_8,
75            _ => EmailCharset::Unknown,
76        }
77    }
78}
79
80impl ToString for EmailCharset {
81    fn to_string(&self) -> String {
82        self.as_str().into()
83    }
84}
85
86impl EmailCharset {
87    pub fn as_str(&self) -> &'static str {
88        use EmailCharset::*;
89        match self {
90            US_ASCII => "US-ASCII",
91            ISO_8859_1 => "ISO-8859-1",
92            ISO_8859_2 => "ISO-8859-2",
93            ISO_8859_3 => "ISO-8859-3",
94            ISO_8859_4 => "ISO-8859-4",
95            ISO_8859_5 => "ISO-8859-5",
96            ISO_8859_6 => "ISO-8859-6",
97            ISO_8859_7 => "ISO-8859-7",
98            ISO_8859_8 => "ISO-8859-8",
99            ISO_8859_9 => "ISO-8859-9",
100            ISO_8859_10 => "ISO-8859-10",
101            Shift_JIS => "Shift_JIS",
102            EUC_JP => "EUC-JP",
103            ISO_2022_KR => "ISO-2022-KR",
104            EUC_KR => "EUC-KR",
105            ISO_2022_JP => "ISO-2022-JP",
106            ISO_2022_JP_2 => "ISO-2022-JP-2",
107            ISO_8859_6_E => "ISO-8859-6-E",
108            ISO_8859_6_I => "ISO-8859-6-I",
109            ISO_8859_8_E => "ISO-8859-8-E",
110            ISO_8859_8_I => "ISO-8859-8-I",
111            GB2312 => "GB2312",
112            Big5 => "Big5",
113            KOI8_R => "KOI8-R",
114            UTF_8 => "UTF-8",
115            Unknown => "UTF-8",
116        }
117    }
118
119    pub fn as_encoding(&self) -> &'static Encoding {
120        Encoding::for_label(self.as_str().as_bytes()).unwrap_or(encoding_rs::WINDOWS_1252)
121    }
122}
123
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks label parsing (case-insensitive) and the resulting
    /// `encoding_rs` encoding for a handful of representative labels.
    #[test]
    fn test_charset() {
        let parsed = EmailCharset::from(&b"Us-Ascii"[..]);
        assert_eq!(parsed.as_str(), "US-ASCII");

        // (input label, encoding expected from `as_encoding`)
        let cases = [
            (&b"Us-Ascii"[..], encoding_rs::WINDOWS_1252),
            (&b"ISO-8859-1"[..], encoding_rs::WINDOWS_1252),
            (&b"utf-8"[..], encoding_rs::UTF_8),
            (&b"utf8"[..], encoding_rs::UTF_8),
        ];

        for &(label, expected) in cases.iter() {
            assert_eq!(EmailCharset::from(label).as_encoding(), expected);
        }
    }
}