eml_codec/text/
charset.rs1use crate::i18n::ContainsUtf8;
2use crate::text::words::is_vchar;
3use bounded_static::{IntoBoundedStatic, ToBoundedStatic};
4use charset::Charset;
5#[cfg(feature = "tracing-recover")]
6use tracing::warn;
7#[cfg(feature = "arbitrary")]
8use {crate::fuzz_eq::FuzzEq, arbitrary::Arbitrary};
9
10#[allow(non_camel_case_types)]
16#[derive(Clone, ContainsUtf8, Debug, Default, PartialEq)]
17#[contains_utf8(false)]
18pub enum EmailCharset {
19 #[default]
20 US_ASCII,
21 Charset(Charset),
22 Unknown(String),
24}
25
26impl<T: AsRef<[u8]>> From<T> for EmailCharset {
27 fn from(bytes: T) -> Self {
28 match bytes.as_ref().to_ascii_lowercase().as_slice() {
29 b"us-ascii" | b"ascii" => Self::US_ASCII,
30 _ => {
31 let sanitized: String = bytes
33 .as_ref()
34 .iter()
35 .cloned()
36 .filter_map(|b| (b.is_ascii() && is_vchar(b as char)).then_some(b as char))
37 .collect();
38 match Charset::for_label(sanitized.as_bytes()) {
39 Some(c) => Self::Charset(c),
40 None => {
41 #[cfg(feature = "tracing-recover")]
42 warn!(value = sanitized, "unknown charset");
43 Self::Unknown(sanitized)
44 }
45 }
46 }
47 }
48 }
49}
50
51impl ToString for EmailCharset {
52 fn to_string(&self) -> String {
53 String::from_utf8_lossy(self.as_bytes()).into()
54 }
55}
56
57impl EmailCharset {
58 pub fn as_bytes(&self) -> &[u8] {
59 match self {
60 Self::US_ASCII => b"us-ascii",
61 Self::Charset(c) => c.name().as_bytes(),
62 Self::Unknown(s) => s.as_bytes(),
63 }
64 }
65
66 pub fn as_str(&self) -> &str {
67 match self {
68 Self::US_ASCII => "us-ascii",
69 Self::Charset(c) => c.name(),
70 Self::Unknown(s) => s.as_str(),
71 }
72 }
73
74 pub fn utf8() -> Self {
75 Self::Charset(Charset::for_encoding(encoding_rs::UTF_8))
76 }
77
78 pub fn decode<'a>(&self, bytes: &'a [u8]) -> std::borrow::Cow<'a, str> {
79 match self {
80 Self::US_ASCII | Self::Unknown(_) => charset::decode_ascii(bytes),
81 Self::Charset(c) => {
82 let (s, _has_malformed) = c.decode_without_bom_handling(bytes);
83 s
84 }
85 }
86 }
87}
88
89impl IntoBoundedStatic for EmailCharset {
90 type Static = Self;
91 fn into_static(self) -> Self::Static {
92 self
93 }
94}
95
96impl ToBoundedStatic for EmailCharset {
97 type Static = Self;
98 fn to_static(&self) -> Self::Static {
99 self.clone()
100 }
101}
102
/// Fuzzing support: picks one of a few fixed charsets, or builds one from an
/// arbitrary byte label.
#[cfg(feature = "arbitrary")]
impl<'a> Arbitrary<'a> for EmailCharset {
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        let picked = match u.int_in_range(0..=6)? {
            0 => Self::US_ASCII,
            1 => Self::utf8(),
            // NOTE(review): "KOI-8R" is not spelled like the usual "koi8-r"
            // label, so it may resolve to `Unknown` — confirm this is intended.
            2 => Self::from(b"KOI-8R"),
            3 => Self::from(b"iso-8859-1"),
            4 => Self::from(b"iso-8859-15"),
            5 => Self::from(b"GBK"),
            6 => {
                // Fully arbitrary label, run through the normal parser.
                let raw_label: &[u8] = u.arbitrary()?;
                Self::from(raw_label)
            }
            // `int_in_range(0..=6)` was asked for values in 0..=6 only.
            _ => unreachable!(),
        };
        Ok(picked)
    }
}
/// Fuzz-equality for charsets is plain structural equality.
#[cfg(feature = "arbitrary")]
impl FuzzEq for EmailCharset {
    fn fuzz_eq(&self, other: &Self) -> bool {
        PartialEq::eq(self, other)
    }
}
128
#[cfg(test)]
mod tests {
    use super::*;

    /// Exercises label parsing: case-insensitive US-ASCII detection, the
    /// names reported for known charsets, and sanitization of unknown labels.
    #[test]
    fn test_charset() {
        // (input label, expected `as_bytes()` output)
        let expected_names: [(&[u8], &[u8]); 4] = [
            (&b"Us-Ascii"[..], &b"us-ascii"[..]),
            (&b"ISO-8859-1"[..], &b"windows-1252"[..]),
            (&b"utf-8"[..], &b"UTF-8"[..]),
            (&b"utf8"[..], &b"UTF-8"[..]),
        ];
        for &(label, expected) in expected_names.iter() {
            assert_eq!(EmailCharset::from(label).as_bytes(), expected);
        }

        // Mixed-case "Us-Ascii" maps to the dedicated variant.
        let ascii = EmailCharset::from(&b"Us-Ascii"[..]);
        assert_eq!(ascii, EmailCharset::US_ASCII);

        // Control bytes are stripped before the label is stored as unknown.
        let unknown = EmailCharset::from(&b"!*\x00\x01abc"[..]);
        assert_eq!(unknown, EmailCharset::Unknown("!*abc".to_string()));
    }
}