Skip to main content

structured_email_address/
provider.rs

//! Provider-aware normalization rules.
//!
//! Different mail providers treat the local part differently: Gmail ignores
//! dots, most freemail providers fold case, subaddress separators vary. A
//! [`ProviderRegistry`] maps domains to [`ProviderRule`]s so normalization can
//! be provider-aware, and applications can register their own providers.
//!
//! The registry is also the source of truth for [`EmailAddress::is_freemail`],
//! independent of whether provider-aware normalization is enabled.
//!
//! [`EmailAddress::is_freemail`]: crate::EmailAddress::is_freemail
12
/// How one mail provider normalizes addresses, keyed by the set of domains
/// that belong to that provider.
///
/// Start from [`ProviderRule::new`] and chain the builder-style setters to
/// adjust it. Every field is private, which leaves room to add further options
/// later without a breaking change.
///
/// # Example
///
/// ```
/// use structured_email_address::ProviderRule;
///
/// // A corporate provider that ignores dots and folds case, tag separator '+'.
/// let rule = ProviderRule::new(["mail.example.com"])
///     .strip_dots(true)
///     .lowercase_local(true)
///     .freemail(false);
/// assert!(rule.matches("MAIL.EXAMPLE.COM"));
/// ```
#[derive(Debug, Clone)]
pub struct ProviderRule {
    /// Domains served by this provider, stored in canonical IDNA-ASCII form.
    domains: Vec<Box<str>>,
    /// `true` when dots in the local part are insignificant.
    strip_dots: bool,
    /// `true` when the local part is case-insensitive.
    lowercase_local: bool,
    /// Subaddress (tag) separator, or `None` when there is no subaddressing.
    subaddress_sep: Option<char>,
    /// `true` when the provider is a free webmail service.
    is_freemail: bool,
}
39
40impl ProviderRule {
41    /// Create a rule for the given domains.
42    ///
43    /// Domains are stored in their IDNA-ASCII (punycode) canonical form so a
44    /// rule registered as `münchen.de` and one as `xn--mnchen-3ya.de` are
45    /// equivalent, and matching agrees with the canonical domain used elsewhere.
46    ///
47    /// Defaults: no dot stripping, no case folding, `+` subaddress separator,
48    /// and `is_freemail = false` (a custom rule is treated as a private domain
49    /// unless you opt in with [`freemail(true)`](Self::freemail)).
50    pub fn new<I, S>(domains: I) -> Self
51    where
52        I: IntoIterator<Item = S>,
53        S: Into<String>,
54    {
55        Self {
56            domains: domains
57                .into_iter()
58                .map(|d| canonical_domain(&d.into()))
59                .collect(),
60            strip_dots: false,
61            lowercase_local: false,
62            subaddress_sep: Some('+'),
63            is_freemail: false,
64        }
65    }
66
67    /// Set whether dots in the local part are insignificant (e.g. Gmail).
68    #[must_use]
69    pub fn strip_dots(mut self, yes: bool) -> Self {
70        self.strip_dots = yes;
71        self
72    }
73
74    /// Set whether the local part is case-insensitive (folded to lowercase).
75    #[must_use]
76    pub fn lowercase_local(mut self, yes: bool) -> Self {
77        self.lowercase_local = yes;
78        self
79    }
80
81    /// Set the subaddress separator, or `None` if the provider has no
82    /// subaddressing.
83    #[must_use]
84    pub fn subaddress_separator(mut self, sep: Option<char>) -> Self {
85        self.subaddress_sep = sep;
86        self
87    }
88
89    /// Set whether this provider is a free webmail provider.
90    #[must_use]
91    pub fn freemail(mut self, yes: bool) -> Self {
92        self.is_freemail = yes;
93        self
94    }
95
96    /// Returns true if `domain` belongs to this provider.
97    ///
98    /// The domain is canonicalized to IDNA-ASCII before comparison, so Unicode
99    /// and punycode spellings of the same domain match.
100    pub fn matches(&self, domain: &str) -> bool {
101        self.matches_canonical(&canonical_domain(domain))
102    }
103
104    /// Match against a domain already in canonical (IDNA-ASCII) form.
105    fn matches_canonical(&self, canonical: &str) -> bool {
106        self.domains.iter().any(|d| &**d == canonical)
107    }
108
109    /// Whether the local part's dots are insignificant.
110    pub fn strips_dots(&self) -> bool {
111        self.strip_dots
112    }
113
114    /// Whether the local part is case-insensitive.
115    pub fn folds_case(&self) -> bool {
116        self.lowercase_local
117    }
118
119    /// The provider's subaddress separator, if any.
120    pub fn separator(&self) -> Option<char> {
121        self.subaddress_sep
122    }
123
124    /// Whether this is a free webmail provider.
125    pub fn is_freemail(&self) -> bool {
126        self.is_freemail
127    }
128}
129
130/// A set of [`ProviderRule`]s with domain lookup.
131///
132/// [`builtin`](Self::builtin) seeds the well-known providers; applications can
133/// extend it with [`add`](Self::add). User-added rules take precedence over
134/// built-ins, so a custom rule can redefine a built-in provider.
135#[derive(Debug, Clone)]
136pub struct ProviderRegistry {
137    rules: Vec<ProviderRule>,
138}
139
140// Process-wide built-in registry, constructed once. `builtin()` clones it and
141// the GmailOnly dot-policy borrows it, so neither pays a per-call allocation.
142// no-std: once_cell::race::OnceBox (alloc) or a caller-injected registry.
143static BUILTIN: std::sync::LazyLock<ProviderRegistry> = std::sync::LazyLock::new(|| {
144    let p = |domains: &[&str]| {
145        ProviderRule::new(domains.iter().copied())
146            .lowercase_local(true)
147            .freemail(true)
148    };
149    ProviderRegistry {
150        rules: vec![
151            p(&["gmail.com", "googlemail.com"]).strip_dots(true),
152            p(&["outlook.com", "hotmail.com", "live.com", "msn.com"]),
153            p(&["yahoo.com", "yahoo.co.uk", "yahoo.co.jp"]),
154            p(&["protonmail.com", "proton.me"]),
155            p(&["icloud.com", "me.com", "mac.com"]),
156            p(&["yandex.ru", "yandex.com"]),
157            p(&["mail.ru"]),
158            // Freemail providers without special normalization quirks.
159            p(&[
160                "aol.com",
161                "mail.com",
162                "zoho.com",
163                "gmx.com",
164                "gmx.de",
165                "web.de",
166                "tutanota.com",
167                "tuta.io",
168                "fastmail.com",
169            ]),
170        ],
171    }
172});
173
174/// Borrow the process-wide built-in registry without allocating.
175pub(crate) fn builtin_ref() -> &'static ProviderRegistry {
176    &BUILTIN
177}
178
179impl ProviderRegistry {
180    /// An empty registry.
181    pub fn empty() -> Self {
182        Self { rules: Vec::new() }
183    }
184
185    /// The built-in registry of well-known mail providers.
186    ///
187    /// Only Gmail/Googlemail ignore dots; every entry folds local-part case and
188    /// uses `+` as its subaddress separator. All built-ins are freemail.
189    ///
190    /// Returns an owned clone of a process-wide shared registry, so the rule set
191    /// is constructed (and its domains IDNA-canonicalized) only once.
192    pub fn builtin() -> Self {
193        builtin_ref().clone()
194    }
195
196    /// Add a rule. User-added rules take precedence over earlier ones.
197    pub fn add(&mut self, rule: ProviderRule) {
198        self.rules.push(rule);
199    }
200
201    /// Builder-style [`add`](Self::add).
202    #[must_use]
203    pub fn with(mut self, rule: ProviderRule) -> Self {
204        self.add(rule);
205        self
206    }
207
208    /// Look up the rule for a domain, or `None` if no provider matches.
209    ///
210    /// Most-recently-added rules win, so a custom rule overrides a built-in for
211    /// the same domain. The domain is canonicalized to IDNA-ASCII once, so
212    /// Unicode and punycode spellings resolve to the same rule.
213    pub fn lookup(&self, domain: &str) -> Option<&ProviderRule> {
214        let canonical = canonical_domain(domain);
215        self.rules
216            .iter()
217            .rev()
218            .find(|r| r.matches_canonical(&canonical))
219    }
220}
221
222impl Default for ProviderRegistry {
223    fn default() -> Self {
224        Self::builtin()
225    }
226}
227
228/// Canonicalize a domain to its IDNA-ASCII (punycode) form, lowercased.
229///
230/// Falls back to ASCII lowercasing if the input is not a valid domain, so
231/// matching never panics on arbitrary registry input.
232fn canonical_domain(domain: &str) -> Box<str> {
233    idna::domain_to_ascii(domain)
234        .unwrap_or_else(|_| domain.to_ascii_lowercase())
235        .into_boxed_str()
236}
237
238#[cfg(test)]
239mod tests;