// structured-email-address 0.0.11
//
// RFC 5321/5322/6531 email address parser, validator, and normalizer. Subaddress extraction, provider-aware normalization, PSL domain validation, anti-homoglyph protection.
// Documentation
//! # structured-email-address
//!
//! RFC 5321/5322/6531 conformant email address parser, validator, and normalizer.
//!
//! Unlike existing Rust crates that stop at RFC validation, this crate provides:
//! - **Subaddress extraction**: `user+tag@domain` → separate `user`, `tag`, `domain`
//! - **Provider-aware normalization**: Gmail dot-stripping, configurable case folding
//! - **PSL domain validation**: verify domain against the Public Suffix List
//! - **Anti-homoglyph protection**: detect Cyrillic/Latin lookalikes via Unicode skeleton
//! - **Configurable strictness**: Strict (5321), Standard (5322), Lax (obs-* allowed)
//! - **Zero-copy parsing**: internal spans into the input string
//!
//! # Quick Start
//!
//! ```
//! use structured_email_address::{EmailAddress, Config};
//!
//! // Simple: parse with defaults
//! let email: EmailAddress = "user+tag@example.com".parse().unwrap();
//! assert_eq!(email.local_part(), "user+tag");
//! assert_eq!(email.tag(), Some("tag"));
//! assert_eq!(email.domain(), "example.com");
//!
//! // Configured: Gmail normalization pipeline
//! let config = Config::builder()
//!     .strip_subaddress()
//!     .dots_gmail_only()
//!     .lowercase_all()
//!     .build();
//!
//! let email = EmailAddress::parse_with("A.L.I.C.E+promo@Gmail.COM", &config).unwrap();
//! assert_eq!(email.canonical(), "alice@gmail.com");
//! assert_eq!(email.tag(), Some("promo"));
//! ```

#![cfg_attr(
    not(test),
    deny(clippy::unwrap_used, clippy::expect_used, clippy::panic)
)]

mod config;
mod error;
mod normalize;
mod parser;
mod provider;
mod validate;

pub use config::{
    CasePolicy, Config, ConfigBuilder, DomainCheck, DotPolicy, Strictness, SubaddressPolicy,
};
pub use error::{Error, ErrorKind};
pub use normalize::confusable_skeleton;
pub use provider::{ProviderRegistry, ProviderRule};

/// A parsed, validated, and normalized email address.
///
/// Built by [`EmailAddress::parse_with`], or by `str::parse`, which parses
/// with [`Config::default()`]. Immutable after construction: every field is
/// private and all accessors take `&self`.
///
/// Most accessors return borrowed data. The exceptions are
/// [`canonical()`](Self::canonical), which allocates a new `String`, and
/// [`is_freemail()`](Self::is_freemail), which returns a plain `bool`.
///
/// Equality and hashing consider only the canonical local part and the
/// canonical domain; all other fields are ignored.
#[derive(Debug, Clone)]
pub struct EmailAddress {
    /// Original input, exactly as supplied to the parser.
    /// Copied from the parser's own record of the input (`parsed.input`).
    original: String,
    /// Canonical local part (after normalization).
    local_part: String,
    /// Extracted subaddress tag, if any.
    tag: Option<String>,
    /// Canonical domain (IDNA-encoded, lowercased).
    domain: String,
    /// Unicode form of the domain (only when domain has punycode labels).
    /// When `None`, `domain_unicode()` falls back to `domain`.
    domain_unicode: Option<String>,
    /// Display name, if parsed from `name-addr` format.
    display_name: Option<String>,
    /// Confusable skeleton, if config enabled it.
    skeleton: Option<String>,
    /// Whether the domain is a known freemail provider (from the registry).
    /// Computed once at parse time by looking the normalized domain up in
    /// `config.providers`.
    freemail: bool,
}

impl EmailAddress {
    /// Run the full pipeline (parse, normalize, validate) under `config`.
    ///
    /// # Errors
    ///
    /// Returns the first [`Error`] produced by the parsing, normalization,
    /// or validation stage, in that order.
    pub fn parse_with(input: &str, config: &Config) -> Result<Self, Error> {
        let parsed = parser::parse(
            input,
            config.strictness,
            config.allow_display_name,
            config.allow_domain_literal,
        )?;
        let normalized = normalize::normalize(&parsed, config)?;
        validate::validate(&parsed, &normalized, config)?;

        // The registry (built-in providers plus custom rules) decides the
        // freemail flag on its own; provider-aware normalization settings
        // play no part in this lookup.
        let freemail = match config.providers.lookup(&normalized.domain) {
            Some(rule) => rule.is_freemail(),
            None => false,
        };

        let original = parsed.input.to_string();
        Ok(Self {
            original,
            local_part: normalized.local_part,
            tag: normalized.tag,
            domain: normalized.domain,
            domain_unicode: normalized.domain_unicode,
            display_name: normalized.display_name,
            skeleton: normalized.skeleton,
            freemail,
        })
    }

    /// The local part in canonical (normalized) form.
    ///
    /// With subaddress stripping enabled the `+tag` suffix is absent; with
    /// dot stripping enabled the dots are gone.
    pub fn local_part(&self) -> &str {
        self.local_part.as_str()
    }

    /// The subaddress tag, when the address carried one.
    ///
    /// `user+promo@example.com` yields `Some("promo")`. Extraction happens
    /// regardless of [`SubaddressPolicy`]; the policy only controls whether
    /// the tag shows up in [`canonical()`](Self::canonical).
    pub fn tag(&self) -> Option<&str> {
        self.tag.as_deref()
    }

    /// The domain in canonical form (IDNA-encoded, lowercased).
    pub fn domain(&self) -> &str {
        self.domain.as_str()
    }

    /// The canonical domain rendered in Unicode.
    ///
    /// Internationalized domains (`münchen.de` → `xn--mnchen-3ya.de`) come
    /// back in their Unicode spelling. ASCII-only domains come back unchanged,
    /// i.e. identical to [`domain()`](Self::domain).
    ///
    /// # Security
    ///
    /// Treat the Unicode form as **display only**. It may reintroduce
    /// [IDN homograph attacks](https://en.wikipedia.org/wiki/IDN_homograph_attack),
    /// in which look-alike characters from different scripts spell
    /// different domain names (e.g. Cyrillic `а` vs Latin `a`).
    ///
    /// Security-sensitive comparisons (allow-lists, deduplication, access
    /// control) should always go through [`domain()`](Self::domain), which
    /// returns the ACE/Punycode form. If Unicode domains must be compared,
    /// layer your own confusable detection on top (see
    /// [`confusable_skeleton()`]).
    ///
    /// ```
    /// use structured_email_address::EmailAddress;
    ///
    /// let email: EmailAddress = "user@münchen.de".parse().unwrap();
    /// assert_eq!(email.domain(), "xn--mnchen-3ya.de");
    /// assert_eq!(email.domain_unicode(), "münchen.de");
    ///
    /// let ascii: EmailAddress = "user@example.com".parse().unwrap();
    /// assert_eq!(ascii.domain_unicode(), "example.com");
    /// ```
    pub fn domain_unicode(&self) -> &str {
        // No stored Unicode form means the canonical domain is the answer.
        self.domain_unicode.as_ref().unwrap_or(&self.domain)
    }

    /// The display name, when the input used `"Name" <addr>` or `Name <addr>`.
    pub fn display_name(&self) -> Option<&str> {
        self.display_name.as_deref()
    }

    /// The complete canonical address, `local_part@domain`.
    ///
    /// A local part containing characters that require quoting (spaces,
    /// specials) is emitted as a quoted string to stay RFC compliant.
    pub fn canonical(&self) -> String {
        let local = self.local_part.as_str();
        if !needs_quoting(local) {
            return format!("{}@{}", local, self.domain);
        }
        format!("\"{}\"@{}", escape_local_part(local), self.domain)
    }

    /// The input text exactly as it was handed to the parser (not trimmed).
    pub fn original(&self) -> &str {
        self.original.as_str()
    }

    /// The confusable skeleton of the local part, when the config enabled it.
    ///
    /// Two addresses sharing both skeleton and domain are visually confusable.
    pub fn skeleton(&self) -> Option<&str> {
        self.skeleton.as_deref()
    }

    /// Whether the domain belongs to a known freemail provider.
    ///
    /// Answered from the [`ProviderRegistry`] of the [`Config`] used at
    /// parse time (built-in providers plus any added through
    /// [`ConfigBuilder::add_provider`]).
    pub fn is_freemail(&self) -> bool {
        self.freemail
    }

    /// Parse many addresses sequentially with one shared configuration.
    ///
    /// Produces one `Result` per input, in input order. Reusing a single
    /// config across the batch amortizes its setup cost.
    ///
    /// # Example
    ///
    /// ```
    /// use structured_email_address::{EmailAddress, Config};
    ///
    /// let config = Config::default();
    /// let results = EmailAddress::parse_batch(
    ///     &["alice@example.com", "invalid", "bob@example.org"],
    ///     &config,
    /// );
    /// assert!(results[0].is_ok());
    /// assert!(results[1].is_err());
    /// assert!(results[2].is_ok());
    /// ```
    pub fn parse_batch(inputs: &[&str], config: &Config) -> Vec<Result<Self, Error>> {
        inputs
            .iter()
            .copied()
            .map(|addr| Self::parse_with(addr, config))
            .collect()
    }

    /// Parse many addresses in parallel on rayon's thread pool.
    ///
    /// Semantically the same as [`parse_batch`](Self::parse_batch); only the
    /// scheduling differs. Intended for bulk import/validation of large
    /// lists (10K+ addresses).
    ///
    /// Requires the `rayon` feature.
    ///
    /// # Example
    ///
    /// ```
    /// use structured_email_address::{EmailAddress, Config};
    ///
    /// let config = Config::default();
    /// let results = EmailAddress::parse_batch_par(
    ///     &["alice@example.com", "bob@example.org"],
    ///     &config,
    /// );
    /// assert!(results.iter().all(|r| r.is_ok()));
    /// ```
    #[cfg(feature = "rayon")]
    pub fn parse_batch_par(inputs: &[&str], config: &Config) -> Vec<Result<Self, Error>> {
        use rayon::prelude::*;

        inputs
            .par_iter()
            .map(|&addr| Self::parse_with(addr, config))
            .collect()
    }
}

/// Renders the address as `local@domain`, or as `"Name" <local@domain>` when
/// a display name is present.
///
/// The local part is quoted and escaped when `needs_quoting` says so; the
/// display name is always emitted as an escaped quoted string.
impl std::fmt::Display for EmailAddress {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Only the quoted form needs a freshly built string. The common
        // unquoted case borrows the stored local part instead of cloning it.
        let local: std::borrow::Cow<'_, str> = if needs_quoting(&self.local_part) {
            std::borrow::Cow::Owned(format!("\"{}\"", escape_local_part(&self.local_part)))
        } else {
            std::borrow::Cow::Borrowed(self.local_part.as_str())
        };
        match &self.display_name {
            Some(name) => write!(
                f,
                "\"{}\" <{}@{}>",
                escape_display_name(name),
                local,
                self.domain
            ),
            None => write!(f, "{}@{}", local, self.domain),
        }
    }
}

/// Decide whether a local part must be serialized as a quoted string
/// (RFC 5321/5322) rather than as a bare dot-atom.
///
/// Quoting is required when the local part is empty, when its dots do not
/// form a valid dot-atom (leading, trailing, or consecutive dots), or when it
/// contains an ASCII character that is neither alphanumeric, an `atext`
/// special, nor `.`. Non-ASCII characters never force quoting (RFC 6531).
fn needs_quoting(local: &str) -> bool {
    // ASCII characters allowed in an unquoted atom.
    let is_atext = |ch: char| ch.is_ascii_alphanumeric() || "!#$%&'*+-/=?^_`{|}~".contains(ch);

    // Dots are only acceptable as single separators between atoms.
    let malformed_dots = local.starts_with('.') || local.ends_with('.') || local.contains("..");

    local.is_empty()
        || malformed_dots
        || local
            .chars()
            .any(|ch| ch.is_ascii() && ch != '.' && !is_atext(ch))
}

/// Prepare a local part for placement between double quotes.
///
/// `"` and `\` are prefixed with a backslash. CR and LF are dropped entirely
/// so the serialized address cannot be used for header injection (folding
/// whitespace is collapsed earlier, during normalization).
fn escape_local_part(local: &str) -> String {
    local
        .chars()
        // CR/LF never reach the output.
        .filter(|ch| !matches!(ch, '\r' | '\n'))
        .fold(String::with_capacity(local.len()), |mut out, ch| {
            if matches!(ch, '"' | '\\') {
                out.push('\\');
            }
            out.push(ch);
            out
        })
}

/// Prepare a display name for placement between double quotes.
///
/// `"` and `\` get a leading backslash. Bare CR and LF are removed so the
/// serialized output cannot carry a header injection.
fn escape_display_name(name: &str) -> String {
    let mut out = String::with_capacity(name.len());
    for ch in name.chars() {
        // Line breaks are dropped, not escaped.
        if ch == '\r' || ch == '\n' {
            continue;
        }
        if ch == '"' || ch == '\\' {
            out.push('\\');
        }
        out.push(ch);
    }
    out
}

/// Two addresses are equal when their canonical `local_part` and `domain`
/// match. Nothing else is compared.
///
/// Display name, tag, and skeleton are deliberately left out:
/// `"John" <user@example.com>` equals `"Jane" <user@example.com>`
/// because both route to the same mailbox.
impl PartialEq for EmailAddress {
    fn eq(&self, other: &Self) -> bool {
        let lhs = (self.local_part.as_str(), self.domain.as_str());
        let rhs = (other.local_part.as_str(), other.domain.as_str());
        lhs == rhs
    }
}

// Marker impl: `eq` compares two `String` fields, so the relation is
// reflexive, symmetric, and transitive, as `Eq` requires.
impl Eq for EmailAddress {}

/// Hashes the same two fields that equality compares (`local_part`, then
/// `domain`), so equal addresses always produce equal hashes.
impl std::hash::Hash for EmailAddress {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        let Self {
            local_part, domain, ..
        } = self;
        local_part.hash(state);
        domain.hash(state);
    }
}

impl std::str::FromStr for EmailAddress {
    type Err = Error;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        Self::parse_with(s, &Config::default())
    }
}

/// Serializes the address as a single string: its canonical
/// `local_part@domain` form. The display name and other parsed metadata are
/// not part of the serialized value.
#[cfg(feature = "serde")]
impl serde::Serialize for EmailAddress {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let canonical = self.canonical();
        serializer.serialize_str(&canonical)
    }
}

/// Deserializes from a string and parses it through the `FromStr` impl,
/// i.e. with the default configuration. A parse failure is reported as a
/// custom deserialization error carrying the parser's message.
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for EmailAddress {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let raw = String::deserialize(deserializer)?;
        match raw.parse::<Self>() {
            Ok(address) => Ok(address),
            Err(err) => Err(serde::de::Error::custom(err)),
        }
    }
}

#[cfg(test)]
mod tests;