use unicode_normalization::UnicodeNormalization;
use crate::{confusables, scripts};
/// Returns `true` when `normalized` is a bracketed IPv6 literal such as
/// `[::1]` or `[2001:db8::1]`, optionally with a `%zone` suffix inside the
/// brackets.
///
/// The address portion (everything before the first `%`) must parse as a
/// [`std::net::Ipv6Addr`]. A character-set check alone is not enough: it lets
/// through strings like `[:]` or `[1.2.3.4:]`, and a simple colon count
/// rejects valid forms such as `[1:2:3:4:5:6:7::]`, which contains eight
/// colons.
///
/// The zone portion, when present, may contain only ASCII hex digits, `:` and
/// `.`, and must not contain another `%`. Interface-name zones with other
/// letters (for example `%eth0`) are therefore not recognised.
fn is_ipv6_literal(normalized: &str) -> bool {
    // Both brackets are single-byte ASCII, so stripping them cannot split a
    // multi-byte character.
    let inner = match normalized
        .strip_prefix('[')
        .and_then(|rest| rest.strip_suffix(']'))
    {
        Some(inner) => inner,
        None => return false,
    };

    // Separate the address from an optional zone at the first '%'.
    let (addr_part, zone) = match inner.split_once('%') {
        Some((addr, zone)) => (addr, Some(zone)),
        None => (inner, None),
    };

    // Let the standard library decide whether the address is well formed;
    // this also rejects an empty address portion.
    if addr_part.parse::<std::net::Ipv6Addr>().is_err() {
        return false;
    }

    match zone {
        None => true,
        // A second '%' is not in the allowed set, so it is rejected here too.
        Some(zone) => zone
            .bytes()
            .all(|b| b.is_ascii_hexdigit() || b == b':' || b == b'.'),
    }
}
/// Per-hostname findings produced by `is_suspicious_hostname`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct HostnameAnalysis {
    /// Overall verdict; the same value `is_suspicious_hostname` returns as the
    /// first element of its tuple.
    pub(crate) suspicious: bool,
    /// Script names reported by `scripts::detect_scripts` across all labels,
    /// de-duplicated and kept in first-seen order. Empty for IPv6 literals.
    pub(crate) scripts: Vec<String>,
    /// `true` when at least one label was reported to contain more than one
    /// script.
    pub(crate) mixed_script: bool,
    /// `true` when `confusables::is_confusable(label, "latin")` returned
    /// `Ok(true)` for at least one label.
    pub(crate) has_confusables: bool,
    /// For IPv6 literals, the NFKC-normalized input. Otherwise the decoded
    /// labels joined with `.` and passed through
    /// `confusables::normalize_confusables(.., "latin")`, falling back to the
    /// joined labels if that call fails.
    pub(crate) canonical: String,
}
/// Inspects `hostname` for homograph-spoofing indicators.
///
/// The input is NFKC-normalized first. A bracketed IPv6 literal is reported
/// as clean straight away, with the normalized text as its canonical form.
/// Otherwise the name is split on `.` and each non-empty label is:
///
/// 1. decoded with `idna::domain_to_unicode` (and NFKC-normalized again) when
///    it starts with `xn--` in any ASCII case; a decode error marks the
///    hostname suspicious,
/// 2. passed to `scripts::detect_scripts`; more than one script in a single
///    label sets `mixed_script` and marks the hostname suspicious,
/// 3. checked with `confusables::is_confusable(label, "latin")`; `Ok(true)`
///    sets `has_confusables`, and both `Ok(true)` and `Err(_)` mark the
///    hostname suspicious.
///
/// Empty labels (for example from a trailing dot) are kept as empty strings
/// so the dot structure survives in `canonical`.
///
/// Returns the verdict together with the detailed [`HostnameAnalysis`], whose
/// `suspicious` field holds the same verdict.
pub(crate) fn is_suspicious_hostname(hostname: &str) -> (bool, HostnameAnalysis) {
    let normalized: String = hostname.nfkc().collect();

    // IPv6 literals have no labels or scripts to analyse.
    if is_ipv6_literal(&normalized) {
        let analysis = HostnameAnalysis {
            suspicious: false,
            scripts: Vec::new(),
            mixed_script: false,
            has_confusables: false,
            canonical: normalized,
        };
        return (false, analysis);
    }

    let mut flagged = false;
    let mut mixed = false;
    let mut confusable_found = false;
    // `ordered_scripts` keeps first-seen order; `known_scripts` de-duplicates.
    let mut ordered_scripts: Vec<&str> = Vec::new();
    let mut known_scripts: std::collections::HashSet<&str> = std::collections::HashSet::new();
    let mut labels: Vec<String> = Vec::new();

    for raw_label in normalized.split('.') {
        if raw_label.is_empty() {
            labels.push(String::new());
            continue;
        }

        // ACE ("xn--") prefix test, case-insensitive and bounds-checked.
        let has_ace_prefix = raw_label
            .as_bytes()
            .get(..4)
            .map_or(false, |prefix| prefix.eq_ignore_ascii_case(b"xn--"));

        let label: String = if has_ace_prefix {
            let (decoded, outcome) = idna::domain_to_unicode(raw_label);
            if outcome.is_err() {
                flagged = true;
            }
            // The decoded text is still analysed even when decoding reported
            // an error.
            decoded.nfkc().collect()
        } else {
            raw_label.to_string()
        };

        let label_scripts = scripts::detect_scripts(&label);
        if label_scripts.len() > 1 {
            mixed = true;
            flagged = true;
        }
        for s in &label_scripts {
            if known_scripts.insert(s) {
                ordered_scripts.push(s);
            }
        }

        match confusables::is_confusable(&label, "latin") {
            Ok(false) => {}
            Ok(true) => {
                confusable_found = true;
                flagged = true;
            }
            // A failed confusable check is treated as suspicious.
            Err(_) => flagged = true,
        }

        labels.push(label);
    }

    let decoded_hostname = labels.join(".");
    let canonical = match confusables::normalize_confusables(&decoded_hostname, "latin") {
        Ok(folded) => folded,
        // Fall back to the decoded hostname when normalization fails.
        Err(_) => decoded_hostname,
    };

    let analysis = HostnameAnalysis {
        suspicious: flagged,
        scripts: ordered_scripts.into_iter().map(str::to_owned).collect(),
        mixed_script: mixed,
        has_confusables: confusable_found,
        canonical,
    };
    (flagged, analysis)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain ASCII hostname raises none of the indicators.
    #[test]
    fn test_clean_hostname_not_suspicious() {
        let (flagged, analysis) = is_suspicious_hostname("paypal.com");
        assert!(!flagged);
        assert!(!analysis.has_confusables);
        assert!(!analysis.mixed_script);
    }

    /// Two leading Cyrillic letters followed by Latin text form a mixed-script
    /// label that canonicalizes to its Latin look-alike.
    #[test]
    fn test_cyrillic_spoof() {
        let (flagged, analysis) = is_suspicious_hostname("\u{0440}\u{0430}ypal.com");
        assert!(flagged);
        assert!(analysis.has_confusables);
        assert!(analysis.mixed_script);
        assert_eq!(analysis.canonical, "paypal.com");
    }

    /// A hostname written entirely in Cyrillic is not mixed-script.
    #[test]
    fn test_full_cyrillic_domain() {
        let (_, analysis) = is_suspicious_hostname("яндекс.ру");
        assert!(!analysis.mixed_script);
    }

    /// Mixing two non-Latin scripts in one label is flagged by the
    /// mixed-script rule alone.
    #[test]
    fn test_mixed_non_latin_scripts_suspicious() {
        let (flagged, analysis) = is_suspicious_hostname("\u{044F}\u{03C8}.com");
        assert!(flagged, "mixed Cyrillic+Greek label must be suspicious");
        assert!(analysis.mixed_script);
        assert!(
            !analysis.has_confusables,
            "neither я nor ψ is a Latin confusable; the mixed-script rule must \
             be what flags this label"
        );
        let reports = |name: &str| analysis.scripts.iter().any(|s| s == name);
        assert!(reports("Cyrillic"));
        assert!(reports("Greek"));
    }

    /// An ACE label that is not a Latin look-alike stays clean.
    #[test]
    fn test_punycode_non_homograph_not_suspicious() {
        let (flagged, _) = is_suspicious_hostname("xn--n3h.com");
        assert!(!flagged);
    }

    /// A Cyrillic homograph is still detected when supplied in ACE form.
    #[test]
    fn test_punycode_homograph_suspicious() {
        let spoof = "\u{0430}\u{0440}\u{0440}\u{04CF}\u{0435}";
        let ace = idna::domain_to_ascii(spoof).expect("encode Cyrillic spoof to ACE");
        assert!(
            ace.starts_with("xn--"),
            "expected an xn-- label, got {ace:?}"
        );

        let hostname = format!("{ace}.com");
        let (flagged, analysis) = is_suspicious_hostname(&hostname);
        assert!(
            flagged,
            "Cyrillic homograph in ACE form {hostname:?} must be suspicious"
        );
        assert!(analysis.has_confusables);
    }

    /// The IPv6 loopback literal is reported clean.
    #[test]
    fn test_ipv6_loopback_not_suspicious() {
        let (flagged, analysis) = is_suspicious_hostname("[::1]");
        assert!(!flagged);
        assert!(!analysis.mixed_script);
        assert!(!analysis.has_confusables);
    }

    /// An IPv6 literal is reported clean with no scripts listed.
    #[test]
    fn test_ipv6_full_not_suspicious() {
        let (flagged, analysis) = is_suspicious_hostname("[2001:db8::1]");
        assert!(!flagged);
        assert!(analysis.scripts.is_empty());
    }
}