use unicode_normalization::UnicodeNormalization;
use unicode_security::confusable_detection::skeleton;
use crate::config::{CasePolicy, Config, DotPolicy, SubaddressPolicy};
use crate::error::{Error, ErrorKind};
use crate::parser::Parsed;
/// Result of [`normalize`]: the canonicalized pieces of one parsed address.
#[derive(Debug, Clone)]
pub(crate) struct Normalized {
/// Local part after unquoting/unescaping, NFC, optional lowercasing,
/// subaddress handling and optional dot stripping.
pub local_part: String,
/// Text after the first subaddress separator, when one was found in an
/// unquoted local part with a non-empty base. It is reported regardless of
/// whether the subaddress policy keeps it in `local_part`.
pub tag: Option<String>,
/// Canonical domain: the lowercased input when it starts with `[`,
/// otherwise the output of `idna::domain_to_ascii_strict`.
pub domain: String,
/// Unicode rendering of `domain`; only set when some label starts with
/// `xn--`, the conversion succeeds, and the result differs from `domain`.
pub domain_unicode: Option<String>,
/// Unescaped display name, when the parsed input carried one.
pub display_name: Option<String>,
/// Confusable skeleton of `local_part`; only computed when
/// `Config::check_confusables` is enabled.
pub skeleton: Option<String>,
}
/// Normalizes a parsed address into its canonical components.
///
/// Processing order:
/// 1. Unquote and unescape the local part when it is wrapped in double quotes.
/// 2. Apply NFC to the local part and the domain.
/// 3. Canonicalize the domain: input starting with `[` is only lowercased,
///    anything else goes through `idna::domain_to_ascii_strict`.
/// 4. Look up a provider for the canonical domain when `config.provider_aware`.
/// 5. Lowercase the local part if the provider (unquoted local parts only) or
///    `config.case_policy` asks for it.
/// 6. For unquoted local parts, split off the subaddress tag and strip dots
///    according to the provider or the configured policies.
/// 7. Derive the Unicode form of the domain when it has an `xn--` label.
/// 8. Compute the confusable skeleton of the final local part when
///    `config.check_confusables` is set.
///
/// # Errors
///
/// Returns an [`Error`] of kind [`ErrorKind::IdnaError`], positioned at
/// `parsed.domain.start`, when strict IDNA conversion of the domain fails.
pub(crate) fn normalize(parsed: &Parsed<'_>, config: &Config) -> Result<Normalized, Error> {
    let local = parsed.local_part_str();
    let domain_str = parsed.domain_str();

    // A local part counts as quoted only when it has a distinct opening and
    // closing quote. Using strip_prefix/strip_suffix (instead of slicing
    // `[1..len - 1]`) means a lone `"` is treated as unquoted rather than
    // panicking on an inverted slice range.
    let quoted_inner = local
        .strip_prefix('"')
        .and_then(|rest| rest.strip_suffix('"'));
    let is_quoted = quoted_inner.is_some();
    let unquoted_local = match quoted_inner {
        Some(inner) => unescape_quoted_string(inner),
        None => local.to_string(),
    };

    let nfc_local: String = unquoted_local.nfc().collect();
    let nfc_domain: String = domain_str.nfc().collect();

    // Input starting with `[` is kept out of IDNA processing and only lowercased.
    let canonical_domain = if nfc_domain.starts_with('[') {
        nfc_domain.to_lowercase()
    } else {
        idna::domain_to_ascii_strict(&nfc_domain).map_err(|e| {
            Error::new(
                ErrorKind::IdnaError(format!("{}: {}", nfc_domain, e)),
                parsed.domain.start,
            )
        })?
    };

    let provider = if config.provider_aware {
        config.providers.lookup(&canonical_domain)
    } else {
        None
    };

    // Provider case folding is only honoured for unquoted local parts;
    // otherwise the global case policy decides.
    let lowercase_local = match provider {
        Some(p) if !is_quoted => p.folds_case(),
        _ => matches!(config.case_policy, CasePolicy::All),
    };
    let cased_local = if lowercase_local {
        nfc_local.to_lowercase()
    } else {
        nfc_local
    };

    // Quoted local parts are taken verbatim: no tag splitting, no dot stripping.
    let (tag, local_after_dots) = if is_quoted {
        (None, cased_local)
    } else {
        let sep: Option<char> = match provider {
            Some(p) => p.separator(),
            None => Some(config.subaddress_separator),
        };

        // Split at the first separator, but only when something precedes it.
        let split = sep
            .and_then(|s| cased_local.split_once(s))
            .filter(|(base, _)| !base.is_empty())
            .map(|(base, tag)| (base.to_string(), tag.to_string()));
        let (base, tag) = match split {
            Some((base, tag)) => (base, Some(tag)),
            None => (cased_local, None),
        };

        let local_after_tag = match config.subaddress {
            SubaddressPolicy::Strip => base,
            SubaddressPolicy::Preserve => match (&tag, sep) {
                (Some(t), Some(s)) => format!("{base}{s}{t}"),
                _ => base,
            },
        };

        // A matched provider overrides the configured dot policy.
        let strip = match provider {
            Some(p) => p.strips_dots(),
            None => match config.dot_policy {
                DotPolicy::Preserve => false,
                DotPolicy::Always => true,
                DotPolicy::GmailOnly => crate::provider::builtin_ref()
                    .lookup(&canonical_domain)
                    .is_some_and(|p| p.strips_dots()),
            },
        };
        let after_dots = if strip {
            local_after_tag.replace('.', "")
        } else {
            local_after_tag
        };

        (tag, after_dots)
    };

    // Only attempt a Unicode rendering when at least one label is punycode.
    let domain_unicode = if canonical_domain
        .split('.')
        .any(|label| label.starts_with("xn--"))
    {
        let (unicode, result) = idna::domain_to_unicode(&canonical_domain);
        if result.is_ok() && unicode != canonical_domain {
            Some(unicode)
        } else {
            None
        }
    } else {
        None
    };

    let skel = config
        .check_confusables
        .then(|| confusable_skeleton(&local_after_dots));

    let display_name = parsed
        .display_name
        .map(|span| unescape_quoted_string(span.as_str(parsed.input)));

    Ok(Normalized {
        local_part: local_after_dots,
        tag,
        domain: canonical_domain,
        domain_unicode,
        display_name,
        skeleton: skel,
    })
}
/// Removes backslash escapes and line breaks from the inside of a quoted string.
///
/// - `\x` becomes `x`; a backslash at the very end of the input is kept as is.
/// - `CR LF` followed by any run of spaces/tabs collapses to a single space.
/// - A `CR` not followed by `LF`, and any bare `LF`, are dropped.
/// - Every other character is copied through unchanged.
fn unescape_quoted_string(s: &str) -> String {
    let mut result = String::with_capacity(s.len());
    let mut rest = s.chars().peekable();

    while let Some(current) = rest.next() {
        match current {
            // Take the escaped character literally; keep a dangling backslash.
            '\\' => result.push(rest.next().unwrap_or('\\')),
            '\r' => {
                if rest.next_if_eq(&'\n').is_some() {
                    // Swallow the whitespace that follows the line break.
                    while rest.next_if(|c| matches!(c, ' ' | '\t')).is_some() {}
                    result.push(' ');
                }
            }
            '\n' => {}
            other => result.push(other),
        }
    }

    result
}
/// Builds a comparison key for spotting visually confusable strings.
///
/// The input is NFC-normalized, passed through `skeleton` from
/// `unicode_security::confusable_detection`, and the result is lowercased.
pub fn confusable_skeleton(input: &str) -> String {
    let composed = input.nfc().collect::<String>();
    let folded: String = skeleton(&composed).collect();
    folded.to_lowercase()
}
#[cfg(test)]
mod tests;