Skip to main content

structured_email_address/
normalize.rs

1//! Email address normalization.
2//!
3//! Converts parsed email addresses to canonical form based on [`Config`] settings.
4//! Adapted from StructuredID's `sid-authn/normalize.rs` with generalized provider support.
5
6use unicode_normalization::UnicodeNormalization;
7use unicode_security::confusable_detection::skeleton;
8
9use crate::config::{CasePolicy, Config, DotPolicy, SubaddressPolicy};
10use crate::parser::Parsed;
11
/// Owned, canonical pieces of an email address produced by [`normalize`].
#[derive(Debug, Clone)]
pub(crate) struct Normalized {
    /// Local part in canonical form, i.e. after the configured case folding,
    /// subaddress handling and dot policy have been applied.
    pub local_part: String,
    /// Subaddress tag found in the local part, if any. It is reported even
    /// when the tag itself was stripped from `local_part`.
    pub tag: Option<String>,
    /// Domain in canonical form (IDNA/ASCII encoded where possible, lowercase).
    pub domain: String,
    /// Display name taken verbatim from the parsed input, when one was present.
    pub display_name: Option<String>,
    /// Confusable skeleton of the canonical local part, used for homoglyph
    /// detection. `None` when confusable checking is disabled.
    pub skeleton: Option<String>,
}
26
27/// Normalize a parsed email address according to the given config.
28pub(crate) fn normalize(parsed: &Parsed<'_>, config: &Config) -> Normalized {
29    let raw_local = parsed.local_part.as_str(parsed.input);
30    let raw_domain = parsed.domain.as_str(parsed.input);
31
32    // Strip quotes from quoted-string local parts for normalization.
33    let unquoted_local = if raw_local.starts_with('"') && raw_local.ends_with('"') {
34        &raw_local[1..raw_local.len() - 1]
35    } else {
36        raw_local
37    };
38
39    // Step 1: Unicode NFC normalization.
40    let nfc_local: String = unquoted_local.nfc().collect();
41    let nfc_domain: String = raw_domain.nfc().collect();
42
43    // Step 2: Case folding.
44    let cased_local = match config.case_policy {
45        CasePolicy::All => nfc_local.to_lowercase(),
46        CasePolicy::Domain | CasePolicy::Preserve => nfc_local,
47    };
48
49    // Step 3: Extract subaddress tag.
50    let sep = config.subaddress_separator;
51    let (base_local, tag) = match cased_local.split_once(sep) {
52        Some((base, tag)) => (base.to_string(), Some(tag.to_string())),
53        None => (cased_local, None),
54    };
55
56    // Step 4: Apply subaddress policy to canonical form.
57    let local_after_tag = match config.subaddress {
58        SubaddressPolicy::Strip => base_local.clone(),
59        SubaddressPolicy::Preserve => match &tag {
60            Some(t) => format!("{}{}{}", base_local, sep, t),
61            None => base_local.clone(),
62        },
63    };
64
65    // Step 5: Dot policy.
66    let local_after_dots = apply_dot_policy(&local_after_tag, &nfc_domain, config.dot_policy);
67
68    // Step 6: Domain — IDNA encoding (punycode for international domains).
69    let canonical_domain =
70        idna::domain_to_ascii(&nfc_domain).unwrap_or_else(|_| nfc_domain.to_lowercase());
71
72    // Step 7: Domain case (always lowercase per RFC).
73    let canonical_domain = canonical_domain.to_lowercase();
74
75    // Step 8: Anti-homoglyph skeleton (optional).
76    let skel = if config.check_confusables {
77        Some(confusable_skeleton(&local_after_dots))
78    } else {
79        None
80    };
81
82    // Display name
83    let display_name = parsed
84        .display_name
85        .map(|span| span.as_str(parsed.input).to_string());
86
87    Normalized {
88        local_part: local_after_dots,
89        tag,
90        domain: canonical_domain,
91        display_name,
92        skeleton: skel,
93    }
94}
95
96/// Apply dot-stripping policy.
97fn apply_dot_policy(local: &str, domain: &str, policy: DotPolicy) -> String {
98    match policy {
99        DotPolicy::Preserve => local.to_string(),
100        DotPolicy::Always => local.replace('.', ""),
101        DotPolicy::GmailOnly => {
102            let domain_lower = domain.to_lowercase();
103            if is_gmail_domain(&domain_lower) {
104                local.replace('.', "")
105            } else {
106                local.to_string()
107            }
108        }
109    }
110}
111
/// Report whether `domain` is one of the Gmail domains, for which dots in the
/// local part are treated as insignificant.
///
/// The comparison is exact, so callers must pass an already-lowercased domain.
fn is_gmail_domain(domain: &str) -> bool {
    const GMAIL_DOMAINS: [&str; 2] = ["gmail.com", "googlemail.com"];
    GMAIL_DOMAINS.iter().any(|&known| known == domain)
}
116
117/// Compute confusable skeleton for anti-homoglyph protection.
118///
119/// Two strings with the same skeleton are visually confusable.
120/// Use during registration to prevent lookalike accounts.
121pub fn confusable_skeleton(input: &str) -> String {
122    let nfc: String = input.nfc().collect();
123    skeleton(&nfc).collect::<String>().to_lowercase()
124}
125
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::Config;
    use crate::parser;

    /// Parse `input` using the parser options carried by `config`, then run
    /// [`normalize`] on the result. Panics with the parse error on rejection.
    fn parse_and_normalize(input: &str, config: &Config) -> Normalized {
        match parser::parse(
            input,
            config.strictness,
            config.allow_display_name,
            config.allow_domain_literal,
        ) {
            Ok(parsed) => normalize(&parsed, config),
            Err(e) => panic!("parse failed for '{input}': {e}"),
        }
    }

    #[test]
    fn basic_normalization() {
        let out = parse_and_normalize("User@Example.COM", &Config::default());
        // The default config lowercases the domain only.
        assert_eq!(out.local_part, "User");
        assert_eq!(out.domain, "example.com");
    }

    #[test]
    fn lowercase_all() {
        let config = Config::builder().lowercase_all().build();
        let out = parse_and_normalize("User@Example.COM", &config);
        assert_eq!(out.local_part, "user");
        assert_eq!(out.domain, "example.com");
    }

    #[test]
    fn subaddress_extraction() {
        let out = parse_and_normalize("user+promo@example.com", &Config::default());
        assert_eq!(out.tag.as_deref(), Some("promo"));
        // The tag stays in the local part by default.
        assert_eq!(out.local_part, "user+promo");
    }

    #[test]
    fn subaddress_strip() {
        let config = Config::builder().strip_subaddress().lowercase_all().build();
        let out = parse_and_normalize("user+promo@example.com", &config);
        assert_eq!(out.tag.as_deref(), Some("promo"));
        assert_eq!(out.local_part, "user");
    }

    #[test]
    fn gmail_dot_stripping() {
        let config = Config::builder().dots_gmail_only().lowercase_all().build();

        // Dots vanish for Gmail and are preserved for any other domain.
        let cases = [
            ("a.l.i.c.e@gmail.com", "alice"),
            ("a.l.i.c.e@example.com", "a.l.i.c.e"),
        ];
        for (input, expected_local) in cases {
            let out = parse_and_normalize(input, &config);
            assert_eq!(out.local_part, expected_local);
        }
    }

    #[test]
    fn idna_domain() {
        let out = parse_and_normalize("user@münchen.de", &Config::default());
        assert_eq!(out.domain, "xn--mnchen-3ya.de");
    }

    #[test]
    fn confusable_skeleton_cyrillic() {
        // Cyrillic 'а' (U+0430) must collapse onto Latin 'a' (U+0061).
        let from_latin = confusable_skeleton("alice");
        let from_cyrillic = confusable_skeleton("\u{0430}lice");
        assert_eq!(from_latin, from_cyrillic);
    }

    #[test]
    fn full_pipeline() {
        let config = Config::builder()
            .strip_subaddress()
            .dots_gmail_only()
            .lowercase_all()
            .check_confusables()
            .build();

        let out = parse_and_normalize("A.L.I.C.E+promo@Gmail.COM", &config);
        assert_eq!(out.local_part, "alice");
        assert_eq!(out.tag.as_deref(), Some("promo"));
        assert_eq!(out.domain, "gmail.com");
        assert!(out.skeleton.is_some());
    }
}