use unicode_normalization::UnicodeNormalization;
/// Category of Unicode-based evasion technique found in a piece of text.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum EvasionKind {
    /// A Cyrillic character that imitates a Latin letter.
    CyrillicHomoglyph,
    /// A Greek character that imitates a Latin letter.
    GreekHomoglyph,
    /// A fullwidth variant of an ASCII character.
    Fullwidth,
    /// An invisible, zero-width character.
    ZeroWidth,
    /// A bidirectional embedding/override control.
    RTLOverride,
    /// A combining mark left over from a decomposed form.
    Decomposed,
    /// Any other suspicious Unicode usage (e.g. exotic separators).
    Suspicious,
}

impl EvasionKind {
    /// Returns a short, human-readable label for this kind of evasion.
    pub fn description(&self) -> &'static str {
        match *self {
            EvasionKind::Suspicious => "Suspicious Unicode usage",
            EvasionKind::Decomposed => "Decomposed Unicode form",
            EvasionKind::RTLOverride => "Right-to-left override",
            EvasionKind::ZeroWidth => "Zero-width character",
            EvasionKind::Fullwidth => "Fullwidth ASCII variant",
            EvasionKind::GreekHomoglyph => "Greek lookalike character",
            EvasionKind::CyrillicHomoglyph => "Cyrillic lookalike character",
        }
    }
}
/// A single suspicious character reported by `detect_unicode_attacks`.
#[derive(Debug, Clone)]
pub struct EvasionMatch {
/// Byte offset of the character within the scanned text (not a char index).
pub position: usize,
/// Which evasion technique the character was classified as.
pub kind: EvasionKind,
/// The offending character exactly as it appeared in the input.
pub char: char,
/// Suggested substitute for homoglyph and fullwidth matches; `None` for
/// kinds that have no substitute (zero-width, bidi, combining, separator).
pub replacement: Option<char>,
}
/// Scans `text` and reports every character that looks like a Unicode evasion.
///
/// Each character is classified by the first check that accepts it, tried in
/// this order: Cyrillic homoglyph, Greek homoglyph, fullwidth form, zero-width
/// character, bidi override, combining mark, separator lookalike. Characters
/// accepted by none of the checks are skipped.
///
/// Matches are returned in text order; `position` is the byte offset of the
/// character inside `text`.
pub fn detect_unicode_attacks(text: &str) -> Vec<EvasionMatch> {
    text.char_indices()
        .filter_map(|(offset, c)| {
            // Decide the category (and substitute, if any) first, then build
            // the match record in exactly one place.
            let (kind, replacement) = if let Some(latin) = cyrillic_to_latin(c) {
                (EvasionKind::CyrillicHomoglyph, Some(latin))
            } else if let Some(latin) = greek_to_latin(c) {
                (EvasionKind::GreekHomoglyph, Some(latin))
            } else if is_fullwidth(c) {
                (EvasionKind::Fullwidth, Some(fullwidth_to_ascii(c)))
            } else if is_zero_width(c) {
                (EvasionKind::ZeroWidth, None)
            } else if is_rtl_override(c) {
                (EvasionKind::RTLOverride, None)
            } else if is_combining_mark(c) {
                (EvasionKind::Decomposed, None)
            } else if is_unicode_separator_evasion(c) {
                (EvasionKind::Suspicious, None)
            } else {
                return None;
            };

            Some(EvasionMatch {
                position: offset,
                kind,
                char: c,
                replacement,
            })
        })
        .collect()
}
/// Rewrites `text` with lookalike characters folded to ASCII and invisible or
/// evasive characters removed.
///
/// Returns `Cow::Borrowed` when nothing needs changing: ASCII input is only
/// checked with `contains_ascii_evasion`, any other input with
/// `contains_evasion`. Otherwise a new string is built in which
///
/// * Cyrillic and Greek homoglyphs become their Latin counterparts,
/// * fullwidth forms go through `fullwidth_to_ascii`,
/// * zero-width characters, bidi overrides, separator lookalikes, combining
///   marks and ASCII evasion controls are dropped,
/// * every other character is copied unchanged.
pub fn normalize_homoglyphs(text: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    // Cheap pre-scan so clean input is handed back without allocating.
    let needs_rewrite = if text.is_ascii() {
        contains_ascii_evasion(text.as_bytes())
    } else {
        contains_evasion(text)
    };
    if !needs_rewrite {
        return Cow::Borrowed(text);
    }

    let mut cleaned = String::with_capacity(text.len());
    for c in text.chars() {
        // Substitution takes priority over removal, in the same order the
        // detector classifies characters.
        let folded = cyrillic_to_latin(c)
            .or_else(|| greek_to_latin(c))
            .or_else(|| is_fullwidth(c).then(|| fullwidth_to_ascii(c)));

        match folded {
            Some(ascii) => cleaned.push(ascii),
            None => {
                let discard = is_zero_width(c)
                    || is_rtl_override(c)
                    || is_unicode_separator_evasion(c)
                    || is_combining_mark(c)
                    || is_ascii_evasion_control(c);
                if !discard {
                    cleaned.push(c);
                }
            }
        }
    }
    Cow::Owned(cleaned)
}
/// Applies NFC composition to `text` and then `normalize_homoglyphs`.
///
/// Always returns an owned string. When the homoglyph pass has nothing to
/// change, the NFC string is returned directly instead of being copied a
/// second time.
pub fn full_normalize(text: &str) -> String {
    let composed: String = text.nfc().collect();
    // `Cow::Borrowed` here means the homoglyph pass left `composed` untouched,
    // so we can hand back the buffer we already own.
    if let std::borrow::Cow::Owned(rewritten) = normalize_homoglyphs(&composed) {
        rewritten
    } else {
        composed
    }
}
/// Anchor strings loaded from the `evasion-anchors.toml` file that is embedded
/// into the binary at compile time.
///
/// Parsed lazily on first access. If the embedded TOML does not parse, a
/// warning is logged and the list is empty.
static EVASION_ANCHORS: std::sync::LazyLock<Vec<String>> = std::sync::LazyLock::new(|| {
    /// Shape of the embedded TOML document.
    #[derive(serde::Deserialize)]
    struct AnchorFile {
        anchors: Vec<String>,
    }

    toml::from_str::<AnchorFile>(include_str!("../data/evasion-anchors.toml"))
        .map(|file| file.anchors)
        .unwrap_or_else(|err| {
            tracing::warn!(
                error = %err,
                "evasion-anchors.toml failed to parse; prefix-anchored interior-control \
                 strip disabled this run (split-credential evasion will not be normalized)",
            );
            Vec::new()
        })
});
static EVASION_ANCHOR_AC: std::sync::LazyLock<Option<aho_corasick::AhoCorasick>> =
std::sync::LazyLock::new(|| {
let anchors = &*EVASION_ANCHORS;
if anchors.is_empty() {
return None;
}
aho_corasick::AhoCorasick::new(anchors).ok()
});
/// Returns `true` for bytes accepted as part of a credential body: ASCII
/// letters and digits plus `_`, `+`, `/`, `=`, `.` and `-`.
#[inline]
fn is_credential_body_byte(b: u8) -> bool {
    matches!(
        b,
        b'0'..=b'9'
            | b'A'..=b'Z'
            | b'a'..=b'z'
            | b'+'
            | b'-'
            | b'.'
            | b'/'
            | b'='
            | b'_'
    )
}
/// Returns `true` for the control bytes that may be stripped from inside a
/// credential body: horizontal tab and carriage return.
#[inline]
fn is_interior_control(b: u8) -> bool {
    b == b'\t' || b == b'\r'
}
/// Removes tab / carriage-return bytes that have been inserted into the body
/// of a token following a known anchor.
///
/// The text is returned borrowed when any of the following holds:
///
/// * no interior control byte sits directly between two credential-body bytes
///   (this includes every input shorter than three bytes),
/// * no anchor automaton is available,
/// * no control byte ends up being marked for removal.
///
/// For each anchor match that is not immediately preceded by a credential-body
/// byte, up to `MAX_BODY_WINDOW` bytes after the anchor are scanned. Body
/// bytes are skipped; a control byte is marked for removal only if the byte
/// after it is a body byte; anything else ends the scan for that anchor.
pub fn strip_interior_evasion_controls(text: &str) -> std::borrow::Cow<'_, str> {
    use std::borrow::Cow;

    const MAX_BODY_WINDOW: usize = 256;

    let raw = text.as_bytes();

    // Cheap pre-check, done before touching the lazily-built automaton.
    let has_sandwiched_control = raw.windows(3).any(|w| {
        is_credential_body_byte(w[0]) && is_interior_control(w[1]) && is_credential_body_byte(w[2])
    });
    if !has_sandwiched_control {
        return Cow::Borrowed(text);
    }

    let automaton = match &*EVASION_ANCHOR_AC {
        Some(automaton) => automaton,
        None => return Cow::Borrowed(text),
    };

    let mut discard = vec![false; raw.len()];
    let mut marked_any = false;

    for hit in automaton.find_iter(text) {
        let body_start = hit.end();

        // An anchor glued onto a preceding body byte is the middle of some
        // longer token, not the start of one.
        let glued_to_token = hit
            .start()
            .checked_sub(1)
            .is_some_and(|prev| is_credential_body_byte(raw[prev]));
        if glued_to_token {
            continue;
        }

        let scan_end = body_start.saturating_add(MAX_BODY_WINDOW).min(raw.len());
        for idx in body_start..scan_end {
            let current = raw[idx];
            if is_credential_body_byte(current) {
                continue;
            }
            let followed_by_body = raw
                .get(idx + 1)
                .is_some_and(|&next| is_credential_body_byte(next));
            if !(is_interior_control(current) && followed_by_body) {
                break;
            }
            discard[idx] = true;
            marked_any = true;
        }
    }

    if !marked_any {
        return Cow::Borrowed(text);
    }

    let kept: Vec<u8> = raw
        .iter()
        .zip(&discard)
        .filter(|&(_, &drop_it)| !drop_it)
        .map(|(&byte, _)| byte)
        .collect();

    // Only single-byte ASCII controls are removed; the fallback simply keeps
    // the original text if the result were ever not valid UTF-8.
    match String::from_utf8(kept) {
        Ok(stripped) => Cow::Owned(stripped),
        Err(_) => Cow::Borrowed(text),
    }
}
pub fn contains_evasion(text: &str) -> bool {
contains_ascii_evasion(text.as_bytes())
|| !detect_unicode_attacks(text).is_empty()
|| text
.chars()
.any(|ch| is_unicode_separator_evasion(ch) || is_combining_mark(ch))
}
/// Returns `true` if `bytes` contains an ASCII control byte other than
/// newline, carriage return or tab.
///
/// DEL (0x7F) counts as a control byte here, matching
/// `is_ascii_evasion_control`, so the "is there anything to strip?" probe and
/// the per-character strip decision agree.
fn contains_ascii_evasion(bytes: &[u8]) -> bool {
    bytes
        .iter()
        .any(|&b| b.is_ascii_control() && !matches!(b, b'\n' | b'\r' | b'\t'))
}
/// Returns `true` for ASCII control characters other than newline, carriage
/// return and tab.
fn is_ascii_evasion_control(ch: char) -> bool {
    match ch {
        // Ordinary whitespace controls are always kept.
        '\n' | '\r' | '\t' => false,
        other => other.is_ascii_control(),
    }
}
/// Maps a lookalike character to the Latin letter it imitates.
///
/// Returns `None` for any character that is not in the table. Lowercase and
/// uppercase lookalikes map to lowercase and uppercase Latin letters
/// respectively.
///
/// The literals below are visually indistinguishable from ASCII in most
/// fonts; edit them by code point, not by eye.
// NOTE(review): the 'ɡ' entry appears to be U+0261 (Latin small letter
// script g) rather than a Cyrillic letter — confirm it is intended to be
// reported as a Cyrillic homoglyph.
fn cyrillic_to_latin(ch: char) -> Option<char> {
match ch {
'а' => Some('a'), 'е' => Some('e'), 'і' => Some('i'), 'ј' => Some('j'), 'о' => Some('o'), 'р' => Some('p'), 'с' => Some('c'), 'у' => Some('y'), 'х' => Some('x'), 'ѕ' => Some('s'), 'һ' => Some('h'), 'ɡ' => Some('g'), 'ї' => Some('i'), 'к' => Some('k'), 'т' => Some('t'), 'А' => Some('A'), 'В' => Some('B'), 'Е' => Some('E'), 'І' => Some('I'), 'Ј' => Some('J'), 'К' => Some('K'), 'М' => Some('M'), 'Н' => Some('H'), 'О' => Some('O'), 'Р' => Some('P'), 'С' => Some('C'), 'Ѕ' => Some('S'), 'Т' => Some('T'), 'Х' => Some('X'), 'Ү' => Some('Y'), 'Ї' => Some('I'), _ => None,
}
}
/// Maps a Greek lookalike character to the Latin letter it imitates.
///
/// Returns `None` for any character that is not in the table. The mapping is
/// by visual resemblance, not by transliteration (for example nu maps to `v`
/// and omega to `w`).
///
/// The uppercase literals below are visually indistinguishable from ASCII in
/// most fonts; edit them by code point, not by eye.
fn greek_to_latin(ch: char) -> Option<char> {
match ch {
'α' => Some('a'), 'β' => Some('b'), 'ε' => Some('e'), 'ι' => Some('i'), 'κ' => Some('k'), 'ν' => Some('v'), 'ο' => Some('o'), 'ρ' => Some('p'), 'τ' => Some('t'), 'υ' => Some('u'), 'χ' => Some('x'), 'ω' => Some('w'), 'Α' => Some('A'), 'Β' => Some('B'), 'Ε' => Some('E'), 'Η' => Some('H'), 'Ι' => Some('I'), 'Κ' => Some('K'), 'Μ' => Some('M'), 'Ν' => Some('N'), 'Ο' => Some('O'), 'Ρ' => Some('P'), 'Τ' => Some('T'), 'Υ' => Some('Y'), 'Χ' => Some('X'), 'Ζ' => Some('Z'), _ => None,
}
}
/// Returns `true` if `ch` is a fullwidth variant of a printable ASCII
/// character (U+FF01..=U+FF5E).
///
/// The rest of the Halfwidth and Fullwidth Forms block (halfwidth katakana
/// and hangul, fullwidth currency signs, the unassigned U+FF00) is excluded on
/// purpose. Those characters have no ASCII counterpart, so `fullwidth_to_ascii`
/// returns them unchanged and reporting them as ASCII variants would be a
/// false positive.
fn is_fullwidth(ch: char) -> bool {
    matches!(ch, '\u{FF01}'..='\u{FF5E}')
}
/// Converts a fullwidth ASCII variant to its ordinary ASCII character.
///
/// Characters in U+FF01..=U+FF5E are shifted down by 0xFEE0; every other
/// character is returned unchanged.
fn fullwidth_to_ascii(ch: char) -> char {
    match ch {
        '\u{FF01}'..='\u{FF5E}' => char::from_u32(ch as u32 - 0xFEE0).unwrap_or(ch),
        _ => ch,
    }
}
/// Returns `true` if `ch` is either a zero-width character or a bidi
/// override control.
pub fn is_evasion_char(ch: char) -> bool {
    if is_zero_width(ch) {
        return true;
    }
    is_rtl_override(ch)
}
/// Returns `true` for the invisible characters treated as zero-width:
/// U+200B..=U+200F, U+2066..=U+2069, and the individual code points U+00AD,
/// U+180E, U+2060 and U+FEFF.
fn is_zero_width(ch: char) -> bool {
    const LONE_INVISIBLES: [char; 4] = ['\u{00AD}', '\u{180E}', '\u{2060}', '\u{FEFF}'];

    ('\u{200B}'..='\u{200F}').contains(&ch)
        || ('\u{2066}'..='\u{2069}').contains(&ch)
        || LONE_INVISIBLES.contains(&ch)
}
/// Returns `true` for non-ASCII separator characters: U+2000..=U+200A plus
/// U+0085, U+00A0, U+2028, U+2029, U+202F, U+205F and U+3000.
fn is_unicode_separator_evasion(ch: char) -> bool {
    const LONE_SEPARATORS: [char; 7] = [
        '\u{0085}', '\u{00A0}', '\u{2028}', '\u{2029}', '\u{202F}', '\u{205F}', '\u{3000}',
    ];

    ('\u{2000}'..='\u{200A}').contains(&ch) || LONE_SEPARATORS.contains(&ch)
}
/// Returns `true` if `ch` lies in U+0300..=U+036F.
fn is_combining_mark(ch: char) -> bool {
    ('\u{0300}'..='\u{036F}').contains(&ch)
}
/// Returns `true` for the bidirectional control characters U+202A..=U+202E.
fn is_rtl_override(ch: char) -> bool {
    ('\u{202A}'..='\u{202E}').contains(&ch)
}