use unicode_normalization::UnicodeNormalization;
use unicode_security::confusable_detection::skeleton;
use crate::config::{CasePolicy, Config, DotPolicy, SubaddressPolicy};
use crate::error::{Error, ErrorKind};
use crate::parser::Parsed;
/// Canonical pieces of an address, as assembled by [`normalize`].
#[derive(Debug, Clone)]
pub(crate) struct Normalized {
/// Local part after unquoting, NFC normalization, and the configured case,
/// subaddress and dot policies have been applied.
pub local_part: String,
/// Text following the subaddress separator of an unquoted local part, if any.
/// Populated even when the subaddress policy strips it from `local_part`;
/// always `None` for quoted local parts.
pub tag: Option<String>,
/// Canonical domain: the strict IDNA to-ASCII form, or the lowercased text
/// when the domain starts with `[` (domain literal).
pub domain: String,
/// Unicode rendering of `domain`. Only set when `domain` contains an `xn--`
/// label, decoding succeeds, and the decoded text differs from `domain`.
pub domain_unicode: Option<String>,
/// Display name with quoted-pairs resolved, when the parsed input carried one.
pub display_name: Option<String>,
/// Confusable skeleton of `local_part`; only computed when
/// `Config::check_confusables` is enabled.
pub skeleton: Option<String>,
}
/// Produces the canonical [`Normalized`] form of a parsed address.
///
/// Steps, in order:
/// 1. A quoted-string local part is unquoted and its quoted-pairs resolved.
/// 2. Local part and domain are NFC-normalized.
/// 3. The local part is lowercased when `config.case_policy` is `CasePolicy::All`.
/// 4. The domain is canonicalized: text starting with `[` (a domain literal) is
///    lowercased, anything else goes through strict IDNA to-ASCII.
/// 5. For unquoted local parts the subaddress tag is split off and the
///    subaddress and dot policies are applied. Quoted local parts are kept
///    verbatim and never yield a tag.
/// 6. The Unicode form of the domain, the confusable skeleton (if enabled) and
///    the unescaped display name are derived.
///
/// # Errors
///
/// Returns an `ErrorKind::IdnaError` positioned at the start of the domain span
/// when strict IDNA processing rejects the domain.
pub(crate) fn normalize(parsed: &Parsed<'_>, config: &Config) -> Result<Normalized, Error> {
    let local = parsed.local_part_str();
    let domain_str = parsed.domain_str();

    // Both delimiters must be distinct characters: a lone `"` satisfies
    // `starts_with` and `ends_with` simultaneously, and slicing `[1..0]` on it
    // would panic.
    let is_quoted = local.len() >= 2 && local.starts_with('"') && local.ends_with('"');
    let unquoted_local = if is_quoted {
        unescape_quoted_string(&local[1..local.len() - 1])
    } else {
        local.to_string()
    };

    let nfc_local: String = unquoted_local.nfc().collect();
    let nfc_domain: String = domain_str.nfc().collect();

    let cased_local = match config.case_policy {
        CasePolicy::All => nfc_local.to_lowercase(),
        CasePolicy::Domain | CasePolicy::Preserve => nfc_local,
    };

    // Canonicalize the domain before touching the local part so that
    // domain-dependent policies (Gmail dot stripping) are decided on the same
    // domain that ends up in the result, not on a pre-IDNA spelling of it.
    let canonical_domain = if nfc_domain.starts_with('[') {
        nfc_domain.to_lowercase()
    } else {
        idna::domain_to_ascii_strict(&nfc_domain).map_err(|e| {
            Error::new(
                ErrorKind::IdnaError(format!("{}: {}", nfc_domain, e)),
                parsed.domain.start,
            )
        })?
    };

    let (tag, local_after_dots) = if is_quoted {
        // Inside a quoted string `+` and `.` are literal characters.
        (None, cased_local)
    } else {
        let sep = config.subaddress_separator;
        // A separator in first position leaves an empty base; treat the whole
        // local part as the base in that case.
        let (base, tag) = match cased_local.split_once(sep) {
            Some((base, tag)) if !base.is_empty() => (base.to_string(), Some(tag.to_string())),
            _ => (cased_local, None),
        };
        let local_after_tag = match config.subaddress {
            SubaddressPolicy::Strip => base,
            SubaddressPolicy::Preserve => match &tag {
                Some(t) => format!("{}{}{}", base, sep, t),
                None => base,
            },
        };
        let after_dots = apply_dot_policy(&local_after_tag, &canonical_domain, config.dot_policy);
        (tag, after_dots)
    };

    // Only offer a Unicode rendering when there is an ACE label to decode and
    // decoding actually changes the text.
    let domain_unicode = if canonical_domain
        .split('.')
        .any(|label| label.starts_with("xn--"))
    {
        let (unicode, result) = idna::domain_to_unicode(&canonical_domain);
        if result.is_ok() && unicode != canonical_domain {
            Some(unicode)
        } else {
            None
        }
    } else {
        None
    };

    let skel = if config.check_confusables {
        Some(confusable_skeleton(&local_after_dots))
    } else {
        None
    };

    let display_name = parsed
        .display_name
        .map(|span| unescape_quoted_string(span.as_str(parsed.input)));

    Ok(Normalized {
        local_part: local_after_dots,
        tag,
        domain: canonical_domain,
        domain_unicode,
        display_name,
        skeleton: skel,
    })
}
/// Returns `local` with its `.` characters removed when `policy` asks for it.
///
/// `DotPolicy::Preserve` never strips, `DotPolicy::Always` always strips, and
/// `DotPolicy::GmailOnly` strips only when `domain` is a Gmail domain.
fn apply_dot_policy(local: &str, domain: &str, policy: DotPolicy) -> String {
    let strip_dots = match policy {
        DotPolicy::Preserve => false,
        DotPolicy::Always => true,
        DotPolicy::GmailOnly => is_gmail_domain(domain),
    };

    if strip_dots {
        local.chars().filter(|&c| c != '.').collect()
    } else {
        local.to_owned()
    }
}
/// Reports whether `domain` is one of the Gmail domains (`gmail.com` or
/// `googlemail.com`), compared ASCII case-insensitively.
fn is_gmail_domain(domain: &str) -> bool {
    const GMAIL_DOMAINS: [&str; 2] = ["gmail.com", "googlemail.com"];
    GMAIL_DOMAINS
        .iter()
        .any(|known| domain.eq_ignore_ascii_case(known))
}
/// Resolves quoted-pairs and line folding in the content of a quoted string.
///
/// * `\x` becomes `x`; a trailing lone backslash is kept as-is.
/// * CRLF plus any run of spaces/tabs that follows collapses to one space.
/// * A bare CR (not followed by LF) and a bare LF are dropped.
/// * Every other character is copied through unchanged.
fn unescape_quoted_string(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut rest = s.chars().peekable();

    while let Some(current) = rest.next() {
        match current {
            // Quoted-pair: emit the escaped character, or the backslash itself
            // when nothing follows it.
            '\\' => result.push(rest.next().unwrap_or('\\')),
            '\r' => {
                if rest.next_if_eq(&'\n').is_some() {
                    // Swallow the whitespace that continues the folded line.
                    while rest.next_if(|c| matches!(c, ' ' | '\t')).is_some() {}
                    result.push(' ');
                }
            }
            '\n' => {}
            other => result.push(other),
        }
    }

    result
}
/// Computes a comparison key for spotting visually confusable strings.
///
/// The input is NFC-normalized, passed through `skeleton`, and the result is
/// lowercased.
pub fn confusable_skeleton(input: &str) -> String {
    let normalized = input.nfc().collect::<String>();
    let folded: String = skeleton(&normalized).collect();
    folded.to_lowercase()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::Config;
    use crate::parser;

    /// Parses `input` with the parser settings taken from `config`, then
    /// normalizes the result. Panics, naming the input, if either stage fails.
    fn parse_and_normalize(input: &str, config: &Config) -> Normalized {
        let parsed = match parser::parse(
            input,
            config.strictness,
            config.allow_display_name,
            config.allow_domain_literal,
        ) {
            Ok(parsed) => parsed,
            Err(e) => panic!("parse failed for '{input}': {e}"),
        };

        match normalize(&parsed, config) {
            Ok(normalized) => normalized,
            Err(e) => panic!("normalize failed for '{input}': {e}"),
        }
    }

    /// Default config: local part case is kept, domain is lowercased.
    #[test]
    fn basic_normalization() {
        let cfg = Config::default();
        let normalized = parse_and_normalize("User@Example.COM", &cfg);
        assert_eq!(normalized.local_part, "User");
        assert_eq!(normalized.domain, "example.com");
    }

    /// `lowercase_all` folds the local part as well.
    #[test]
    fn lowercase_all() {
        let cfg = Config::builder().lowercase_all().build();
        let normalized = parse_and_normalize("User@Example.COM", &cfg);
        assert_eq!(normalized.local_part, "user");
        assert_eq!(normalized.domain, "example.com");
    }

    /// The tag is reported while the local part keeps it by default.
    #[test]
    fn subaddress_extraction() {
        let cfg = Config::default();
        let normalized = parse_and_normalize("user+promo@example.com", &cfg);
        assert_eq!(normalized.tag.as_deref(), Some("promo"));
        assert_eq!(normalized.local_part, "user+promo");
    }

    /// Stripping removes the tag from the local part but still reports it.
    #[test]
    fn subaddress_strip() {
        let cfg = Config::builder().strip_subaddress().lowercase_all().build();
        let normalized = parse_and_normalize("user+promo@example.com", &cfg);
        assert_eq!(normalized.tag.as_deref(), Some("promo"));
        assert_eq!(normalized.local_part, "user");
    }

    /// Gmail-only dot stripping leaves other domains alone.
    #[test]
    fn gmail_dot_stripping() {
        let cfg = Config::builder().dots_gmail_only().lowercase_all().build();

        let on_gmail = parse_and_normalize("a.l.i.c.e@gmail.com", &cfg);
        assert_eq!(on_gmail.local_part, "alice");

        let elsewhere = parse_and_normalize("a.l.i.c.e@example.com", &cfg);
        assert_eq!(elsewhere.local_part, "a.l.i.c.e");
    }

    /// Internationalized domains get both the ASCII and the Unicode form.
    #[test]
    fn idna_domain() {
        let cfg = Config::default();
        let normalized = parse_and_normalize("user@münchen.de", &cfg);
        assert_eq!(normalized.domain, "xn--mnchen-3ya.de");
        assert_eq!(normalized.domain_unicode.as_deref(), Some("münchen.de"));
    }

    /// Plain ASCII domains carry no separate Unicode form.
    #[test]
    fn ascii_domain_no_unicode_field() {
        let cfg = Config::default();
        let normalized = parse_and_normalize("user@example.com", &cfg);
        assert_eq!(normalized.domain, "example.com");
        assert_eq!(normalized.domain_unicode, None);
    }

    /// A 64-character label is rejected by strict IDNA. `Parsed` is built by
    /// hand so the input reaches `normalize` without going through the parser.
    #[test]
    fn idna_error_propagated() {
        use crate::parser::Span;

        let long_label = "a".repeat(64);
        let input = format!("user@{long_label}.com");
        let cfg = Config::default();

        let local_span = Span { start: 0, end: 4 };
        let domain_span = Span {
            start: 5,
            end: input.len(),
        };
        let parsed = crate::parser::Parsed {
            input: &input,
            display_name: None,
            local_part: local_span,
            domain: domain_span,
            comments: vec![],
            local_part_clean: None,
            domain_clean: None,
        };

        let err = normalize(&parsed, &cfg).unwrap_err();
        assert!(
            matches!(err.kind(), ErrorKind::IdnaError(_)),
            "expected IdnaError, got {:?}",
            err.kind()
        );
    }

    /// Cyrillic `а` (U+0430) and Latin `a` share a skeleton.
    #[test]
    fn confusable_skeleton_cyrillic() {
        let latin = confusable_skeleton("alice");
        let cyrillic = confusable_skeleton("\u{0430}lice");
        assert_eq!(latin, cyrillic);
    }

    /// An escaped space and a literal space inside quotes normalize alike.
    #[test]
    fn quoted_local_unescapes_quoted_pairs() {
        let cfg = Config::default();
        let escaped = parse_and_normalize("\"a\\ b\"@example.com", &cfg);
        let literal = parse_and_normalize("\"a b\"@example.com", &cfg);
        assert_eq!(
            escaped.local_part, literal.local_part,
            "quoted-pair backslash must be unescaped"
        );
        assert_eq!(escaped.local_part, "a b");
    }

    /// Subaddress and dot policies do not reach into quoted local parts.
    #[test]
    fn quoted_local_preserves_plus_and_dots() {
        let cfg = Config::builder()
            .strip_subaddress()
            .dots_gmail_only()
            .lowercase_all()
            .build();

        let with_plus = parse_and_normalize("\"a+b\"@gmail.com", &cfg);
        assert_eq!(
            with_plus.local_part, "a+b",
            "subaddress must not split inside quoted local"
        );
        assert_eq!(with_plus.tag, None, "no tag extraction for quoted local");

        let with_dot = parse_and_normalize("\"a.b\"@gmail.com", &cfg);
        assert_eq!(
            with_dot.local_part, "a.b",
            "dots must not be stripped inside quoted local"
        );
    }

    /// All policies enabled at once.
    #[test]
    fn full_pipeline() {
        let cfg = Config::builder()
            .strip_subaddress()
            .dots_gmail_only()
            .lowercase_all()
            .check_confusables()
            .build();
        let normalized = parse_and_normalize("A.L.I.C.E+promo@Gmail.COM", &cfg);
        assert_eq!(normalized.local_part, "alice");
        assert_eq!(normalized.tag.as_deref(), Some("promo"));
        assert_eq!(normalized.domain, "gmail.com");
        assert!(normalized.skeleton.is_some());
    }

    /// In lax mode, comments and whitespace around dots are gone before
    /// normalization runs.
    #[test]
    fn obs_cfws_stripped_before_normalization() {
        let cfg = Config::builder()
            .strictness(crate::Strictness::Lax)
            .lowercase_all()
            .build();
        let normalized = parse_and_normalize("User (comment) . Name@Example (c) . COM", &cfg);
        assert_eq!(
            normalized.local_part, "user.name",
            "CFWS stripped + lowercased"
        );
        assert_eq!(
            normalized.domain, "example.com",
            "domain CFWS stripped + lowercased"
        );
    }

    /// Same as above, with an internationalized domain.
    #[test]
    fn obs_cfws_stripped_with_idna() {
        let cfg = Config::builder()
            .strictness(crate::Strictness::Lax)
            .lowercase_all()
            .build();
        let normalized = parse_and_normalize("user@münchen (comment) . de", &cfg);
        assert_eq!(normalized.domain, "xn--mnchen-3ya.de");
        assert_eq!(normalized.domain_unicode.as_deref(), Some("münchen.de"));
    }
}