Skip to main content

eml_codec/text/
charset.rs

1use crate::i18n::ContainsUtf8;
2use crate::text::words::is_vchar;
3use bounded_static::{IntoBoundedStatic, ToBoundedStatic};
4use charset::Charset;
5#[cfg(feature = "tracing-recover")]
6use tracing::warn;
7#[cfg(feature = "arbitrary")]
8use {crate::fuzz_eq::FuzzEq, arbitrary::Arbitrary};
9
/// Charset label attached to a piece of email text.
///
/// Email charsets are defined by IANA
/// <https://www.iana.org/assignments/character-sets/character-sets.xhtml>
///
/// We piggy-back on the "charset" library that is specifically designed for
/// email.
#[allow(non_camel_case_types)]
#[derive(Clone, ContainsUtf8, Debug, Default, PartialEq)]
#[contains_utf8(false)]
pub enum EmailCharset {
    /// Plain US-ASCII; this is the default variant.
    #[default]
    US_ASCII,
    /// A charset that the `charset` crate recognized from its label.
    Charset(Charset),
    /// A label the `charset` crate did not recognize, kept as sanitized text.
    ///
    /// Invariant: must contain ASCII characters satisfying is_vchar
    Unknown(String),
}
25
26impl<T: AsRef<[u8]>> From<T> for EmailCharset {
27    fn from(bytes: T) -> Self {
28        match bytes.as_ref().to_ascii_lowercase().as_slice() {
29            b"us-ascii" | b"ascii" => Self::US_ASCII,
30            _ => {
31                // Filter out bytes that are not ASCII printable, in case there are some…
32                let sanitized: String = bytes
33                    .as_ref()
34                    .iter()
35                    .cloned()
36                    .filter_map(|b| (b.is_ascii() && is_vchar(b as char)).then_some(b as char))
37                    .collect();
38                match Charset::for_label(sanitized.as_bytes()) {
39                    Some(c) => Self::Charset(c),
40                    None => {
41                        #[cfg(feature = "tracing-recover")]
42                        warn!(value = sanitized, "unknown charset");
43                        Self::Unknown(sanitized)
44                    }
45                }
46            }
47        }
48    }
49}
50
51impl ToString for EmailCharset {
52    fn to_string(&self) -> String {
53        String::from_utf8_lossy(self.as_bytes()).into()
54    }
55}
56
57impl EmailCharset {
58    pub fn as_bytes(&self) -> &[u8] {
59        match self {
60            Self::US_ASCII => b"us-ascii",
61            Self::Charset(c) => c.name().as_bytes(),
62            Self::Unknown(s) => s.as_bytes(),
63        }
64    }
65
66    pub fn as_str(&self) -> &str {
67        match self {
68            Self::US_ASCII => "us-ascii",
69            Self::Charset(c) => c.name(),
70            Self::Unknown(s) => s.as_str(),
71        }
72    }
73
74    pub fn utf8() -> Self {
75        Self::Charset(Charset::for_encoding(encoding_rs::UTF_8))
76    }
77
78    pub fn decode<'a>(&self, bytes: &'a [u8]) -> std::borrow::Cow<'a, str> {
79        match self {
80            Self::US_ASCII | Self::Unknown(_) => charset::decode_ascii(bytes),
81            Self::Charset(c) => {
82                let (s, _has_malformed) = c.decode_without_bom_handling(bytes);
83                s
84            }
85        }
86    }
87}
88
/// `EmailCharset` is its own static form, so the conversion returns the value
/// unchanged.
impl IntoBoundedStatic for EmailCharset {
    type Static = Self;
    /// Returns `self` unchanged.
    fn into_static(self) -> Self::Static {
        self
    }
}
95
/// `EmailCharset` is its own static form, so the by-reference conversion is
/// just a clone.
impl ToBoundedStatic for EmailCharset {
    type Static = Self;
    /// Returns a clone of `self`.
    fn to_static(&self) -> Self::Static {
        self.clone()
    }
}
102
#[cfg(feature = "arbitrary")]
impl<'a> Arbitrary<'a> for EmailCharset {
    /// Generates a charset for fuzzing.
    ///
    /// Six of the seven equally likely branches produce a fixed charset; the
    /// last one builds a charset from an arbitrary byte label via
    /// `EmailCharset::from`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the `Unstructured` source.
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        // preselect some charsets to help the fuzzer
        match u.int_in_range(0..=6)? {
            0 => Ok(Self::US_ASCII),
            1 => Ok(Self::utf8()),
            // The registered spelling is `KOI8-R`. `KOI-8R` is not a known
            // label and would produce an `Unknown` value rather than a
            // preselected charset.
            2 => Ok(Self::from(b"KOI8-R")),
            3 => Ok(Self::from(b"iso-8859-1")),
            4 => Ok(Self::from(b"iso-8859-15")),
            5 => Ok(Self::from(b"GBK")),
            6 => {
                let label: &[u8] = u.arbitrary()?;
                Ok(Self::from(label))
            }
            // `int_in_range(0..=6)` only yields values in 0..=6.
            _ => unreachable!(),
        }
    }
}
#[cfg(feature = "arbitrary")]
impl FuzzEq for EmailCharset {
    /// Fuzz-equality for charsets is plain `==`, i.e. the derived `PartialEq`.
    fn fuzz_eq(&self, other: &Self) -> bool {
        self == other
    }
}
128
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks label parsing: canonical names, the US-ASCII variant, and
    /// sanitization of unknown labels.
    #[test]
    fn test_charset() {
        // (input label, name expected from `as_bytes`)
        let canonical_names: [(&[u8], &[u8]); 4] = [
            (&b"Us-Ascii"[..], &b"us-ascii"[..]),
            (&b"ISO-8859-1"[..], &b"windows-1252"[..]),
            (&b"utf-8"[..], &b"UTF-8"[..]),
            (&b"utf8"[..], &b"UTF-8"[..]),
        ];
        for &(label, expected) in canonical_names.iter() {
            assert_eq!(EmailCharset::from(label).as_bytes(), expected);
        }

        // Mixed-case `us-ascii` maps to the dedicated variant.
        let ascii = EmailCharset::from(&b"Us-Ascii"[..]);
        assert_eq!(ascii, EmailCharset::US_ASCII);

        // Control bytes are stripped before an unknown label is stored.
        let unknown = EmailCharset::from(&b"!*\x00\x01abc"[..]);
        assert_eq!(unknown, EmailCharset::Unknown("!*abc".to_string()));
    }
}