eml-codec 0.4.0

Email enCOder DECoder in Rust. Supports the Internet Message Format and MIME (RFC 822, 5322, 2045, 2046, 2047, 2048, 2049, 6532).
Documentation
use crate::i18n::ContainsUtf8;
use crate::text::words::is_vchar;
use bounded_static::{IntoBoundedStatic, ToBoundedStatic};
use charset::Charset;
#[cfg(feature = "tracing-recover")]
use tracing::warn;
#[cfg(feature = "arbitrary")]
use {crate::fuzz_eq::FuzzEq, arbitrary::Arbitrary};

/// Email charsets are defined by IANA
/// <https://www.iana.org/assignments/character-sets/character-sets.xhtml>
///
/// We piggy-back on the "charset" library that is specifically designed for
/// email.
// `US_ASCII` is not camel case, hence the lint exception below.
#[allow(non_camel_case_types)]
#[derive(Clone, ContainsUtf8, Debug, Default, PartialEq)]
// NOTE(review): presumably declares that a charset name never holds non-ASCII
// text; confirm against the `ContainsUtf8` derive.
#[contains_utf8(false)]
pub enum EmailCharset {
    /// Plain US-ASCII; this is the `Default` value.
    #[default]
    US_ASCII,
    /// A charset known to the `charset` library.
    Charset(Charset),
    /// A charset label that has no matching [`Charset`].
    ///
    /// Invariant: must contain ASCII characters satisfying is_vchar
    Unknown(String),
}

impl<T: AsRef<[u8]>> From<T> for EmailCharset {
    /// Builds an [`EmailCharset`] from a raw charset label.
    ///
    /// * `us-ascii` and `ascii` (compared ASCII case-insensitively) map to
    ///   [`EmailCharset::US_ASCII`].
    /// * Any other label is stripped of every byte that is not ASCII or does
    ///   not satisfy `is_vchar`, then looked up with `Charset::for_label`.
    /// * If the lookup fails, the sanitized label is kept in
    ///   [`EmailCharset::Unknown`], which upholds that variant's invariant.
    ///
    /// This conversion never fails.
    fn from(bytes: T) -> Self {
        let raw = bytes.as_ref();

        // Compare in place rather than allocating a lowercased copy of the label.
        // NOTE(review): this check runs on the unsanitized input, so a label such
        // as "us-ascii\0" falls through to the `charset` lookup instead of
        // yielding `US_ASCII`; confirm whether that is intended.
        if raw.eq_ignore_ascii_case(b"us-ascii") || raw.eq_ignore_ascii_case(b"ascii") {
            return Self::US_ASCII;
        }

        // Filter out bytes that are not ASCII printable, in case there are some…
        let sanitized: String = raw
            .iter()
            .copied()
            .filter(|&b| b.is_ascii() && is_vchar(b as char))
            .map(char::from)
            .collect();

        match Charset::for_label(sanitized.as_bytes()) {
            Some(c) => Self::Charset(c),
            None => {
                #[cfg(feature = "tracing-recover")]
                warn!(value = sanitized, "unknown charset");
                Self::Unknown(sanitized)
            }
        }
    }
}

/// Formats the charset as its textual name (the same text as
/// [`EmailCharset::as_str`]).
///
/// `Display` is implemented instead of `ToString` so that the charset can be
/// used with `format!`/`write!`; `to_string()` remains available through the
/// standard blanket implementation and returns the same text as before.
impl std::fmt::Display for EmailCharset {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `pad` honors width/alignment flags, like `str`'s own `Display`.
        f.pad(self.as_str())
    }
}

impl EmailCharset {
    /// Returns the charset name as raw bytes.
    ///
    /// This is the byte view of [`EmailCharset::as_str`].
    pub fn as_bytes(&self) -> &[u8] {
        self.as_str().as_bytes()
    }

    /// Returns the charset name as a string slice.
    ///
    /// * `US_ASCII` yields `"us-ascii"`.
    /// * `Charset` yields the name reported by the wrapped [`Charset`].
    /// * `Unknown` yields the stored label.
    pub fn as_str(&self) -> &str {
        match self {
            Self::Unknown(label) => label.as_str(),
            Self::Charset(known) => known.name(),
            Self::US_ASCII => "us-ascii",
        }
    }

    /// Convenience constructor for the UTF-8 charset.
    pub fn utf8() -> Self {
        let utf8 = Charset::for_encoding(encoding_rs::UTF_8);
        Self::Charset(utf8)
    }

    /// Decodes `bytes` into text according to this charset.
    ///
    /// A known charset decodes with its own decoder, without BOM handling.
    /// `US_ASCII` and `Unknown` both fall back to `charset::decode_ascii`.
    pub fn decode<'a>(&self, bytes: &'a [u8]) -> std::borrow::Cow<'a, str> {
        if let Self::Charset(known) = self {
            // The second tuple field reports malformed input; it is deliberately
            // ignored and only the decoded text is returned.
            return known.decode_without_bom_handling(bytes).0;
        }
        charset::decode_ascii(bytes)
    }
}

/// `EmailCharset` is its own `Static` form, so the conversion is the identity.
impl IntoBoundedStatic for EmailCharset {
    type Static = Self;

    /// Returns `self` unchanged.
    fn into_static(self) -> Self {
        self
    }
}

/// `EmailCharset` is its own `Static` form, so the conversion is a plain clone.
impl ToBoundedStatic for EmailCharset {
    type Static = Self;

    /// Returns a clone of `self`.
    fn to_static(&self) -> Self {
        Clone::clone(self)
    }
}

#[cfg(feature = "arbitrary")]
impl<'a> Arbitrary<'a> for EmailCharset {
    /// Generates a charset for fuzzing.
    ///
    /// Six of the seven equally-weighted choices are fixed charsets; the last
    /// one converts an arbitrary byte slice through `EmailCharset::from`.
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        // preselect some charsets to help the fuzzer
        match u.int_in_range(0..=6)? {
            0 => Ok(Self::US_ASCII),
            1 => Ok(Self::utf8()),
            // "KOI8-R" is the registered spelling; "KOI-8R" is not a known
            // label and would be treated as an unknown charset.
            2 => Ok(Self::from(b"KOI8-R")),
            3 => Ok(Self::from(b"iso-8859-1")),
            4 => Ok(Self::from(b"iso-8859-15")),
            5 => Ok(Self::from(b"GBK")),
            6 => {
                let label: &[u8] = u.arbitrary()?;
                Ok(Self::from(label))
            }
            // `int_in_range` only returns values inside the requested range.
            _ => unreachable!(),
        }
    }
}
#[cfg(feature = "arbitrary")]
impl FuzzEq for EmailCharset {
    /// Fuzz equality for charsets is the derived `PartialEq` comparison.
    fn fuzz_eq(&self, other: &Self) -> bool {
        PartialEq::eq(self, other)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `label` and returns the name of the resulting charset.
    fn name_of(label: &[u8]) -> Vec<u8> {
        EmailCharset::from(label).as_bytes().to_vec()
    }

    #[test]
    fn test_charset() {
        // US-ASCII is matched case-insensitively and has a dedicated variant.
        assert_eq!(name_of(b"Us-Ascii"), b"us-ascii");
        assert_eq!(EmailCharset::from(&b"Us-Ascii"[..]), EmailCharset::US_ASCII);

        // Known labels report the name chosen by the `charset` library.
        assert_eq!(name_of(b"ISO-8859-1"), b"windows-1252");
        assert_eq!(name_of(b"utf-8"), b"UTF-8");
        assert_eq!(name_of(b"utf8"), b"UTF-8");

        // Unknown labels keep only their printable ASCII characters.
        let unknown = EmailCharset::from(&b"!*\x00\x01abc"[..]);
        assert_eq!(unknown, EmailCharset::Unknown("!*abc".to_string()));
    }
}