use unicode_normalization::UnicodeNormalization;
/// Category of Unicode-based evasion technique that a flagged character belongs to.
///
/// Each variant has a short human-readable label available through
/// `EvasionKind::description`.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum EvasionKind {
/// A Cyrillic character that looks like a Latin letter.
CyrillicHomoglyph,
/// A Greek character that looks like a Latin letter.
GreekHomoglyph,
/// A fullwidth variant of an ASCII character.
Fullwidth,
/// A zero-width (invisible) character.
ZeroWidth,
/// A right-to-left override character.
RTLOverride,
/// Text in a decomposed Unicode form.
// NOTE(review): no code visible in this file constructs this variant — confirm it is used elsewhere.
Decomposed,
/// Other suspicious Unicode usage.
// NOTE(review): no code visible in this file constructs this variant — confirm it is used elsewhere.
Suspicious,
}
impl EvasionKind {
pub fn description(&self) -> &'static str {
match self {
Self::CyrillicHomoglyph => "Cyrillic lookalike character",
Self::GreekHomoglyph => "Greek lookalike character",
Self::Fullwidth => "Fullwidth ASCII variant",
Self::ZeroWidth => "Zero-width character",
Self::RTLOverride => "Right-to-left override",
Self::Decomposed => "Decomposed Unicode form",
Self::Suspicious => "Suspicious Unicode usage",
}
}
}
/// A single suspicious character found in a piece of text.
#[derive(Debug, Clone)]
pub struct EvasionMatch {
/// Byte offset of the character within the scanned string
/// (as yielded by `str::char_indices`), not a character index.
pub position: usize,
/// Which evasion technique the character was classified as.
pub kind: EvasionKind,
/// The offending character exactly as it appeared in the input.
pub char: char,
/// Suggested substitute character, when the detector provides one.
pub replacement: Option<char>,
}
/// Scans `text` and reports every character that looks like a Unicode evasion attempt.
///
/// Each character is classified at most once, using the first rule that applies:
/// Cyrillic homoglyph, Greek homoglyph, fullwidth form, zero-width character,
/// then directional override.
///
/// # Returns
///
/// One [`EvasionMatch`] per flagged character, in input order. `position` is the
/// byte offset of the character in `text`. `replacement` carries the Latin/ASCII
/// equivalent for homoglyphs and mappable fullwidth forms, and is `None` when
/// there is nothing to substitute (invisible characters, or a flagged fullwidth-block
/// character that has no ASCII counterpart).
pub fn detect_unicode_attacks(text: &str) -> Vec<EvasionMatch> {
    text.char_indices()
        .filter_map(|(position, ch)| {
            let (kind, replacement) = if let Some(latin) = cyrillic_to_latin(ch) {
                (EvasionKind::CyrillicHomoglyph, Some(latin))
            } else if let Some(latin) = greek_to_latin(ch) {
                (EvasionKind::GreekHomoglyph, Some(latin))
            } else if is_fullwidth(ch) {
                // Only offer a replacement when the mapping actually changes the
                // character; an identity "replacement" would be misleading.
                let ascii = fullwidth_to_ascii(ch);
                let replacement = if ascii != ch { Some(ascii) } else { None };
                (EvasionKind::Fullwidth, replacement)
            } else if is_zero_width(ch) {
                (EvasionKind::ZeroWidth, None)
            } else if is_rtl_override(ch) {
                (EvasionKind::RTLOverride, None)
            } else {
                return None;
            };

            Some(EvasionMatch {
                position,
                kind,
                char: ch,
                replacement,
            })
        })
        .collect()
}
/// Returns a copy of `text` with lookalike characters folded to Latin/ASCII and
/// invisible control characters removed.
///
/// Per character, the first applicable rule wins:
/// 1. Cyrillic or Greek homoglyphs are replaced by their Latin lookalike.
/// 2. Fullwidth forms are replaced by their ASCII counterpart when one exists.
/// 3. Zero-width and directional-override characters are dropped.
/// 4. Everything else is kept unchanged.
///
/// Removal is expressed with `Option` rather than an in-band sentinel, so a
/// genuine NUL (`'\0'`) in the input is preserved instead of being silently stripped.
pub fn normalize_homoglyphs(text: &str) -> String {
    text.chars()
        .filter_map(|ch| {
            if let Some(latin) = cyrillic_to_latin(ch).or_else(|| greek_to_latin(ch)) {
                Some(latin)
            } else if is_fullwidth(ch) {
                Some(fullwidth_to_ascii(ch))
            } else if is_zero_width(ch) || is_rtl_override(ch) {
                // Invisible characters carry no content; drop them.
                None
            } else {
                Some(ch)
            }
        })
        .collect()
}
/// Normalizes `text` in two passes: first Unicode NFC composition, then
/// homoglyph folding via [`normalize_homoglyphs`].
///
/// Composing first means base-letter + combining-mark sequences are merged
/// before the per-character homoglyph rules run.
pub fn full_normalize(text: &str) -> String {
    let composed = text.nfc().collect::<String>();
    normalize_homoglyphs(composed.as_str())
}
/// Reports whether `text` contains at least one character flagged by
/// [`detect_unicode_attacks`].
pub fn contains_evasion(text: &str) -> bool {
    let findings = detect_unicode_attacks(text);
    !findings.is_empty()
}
/// Maps a Cyrillic lookalike to the Latin letter it imitates.
///
/// Returns `None` for any character that is not in the lookalike table.
/// The left-hand literals are non-ASCII even where they render identically
/// to the Latin letter on the right.
fn cyrillic_to_latin(ch: char) -> Option<char> {
    let latin = match ch {
        // Lowercase lookalikes.
        'а' => 'a',
        'е' => 'e',
        'і' => 'i',
        'ј' => 'j',
        'о' => 'o',
        'р' => 'p',
        'с' => 'c',
        'у' => 'y',
        'х' => 'x',
        'ѕ' => 's',
        'һ' => 'h',
        // NOTE(review): this glyph appears to be U+0261 (Latin script g, IPA block)
        // rather than a Cyrillic letter — confirm it belongs in this table.
        'ɡ' => 'g',
        // Uppercase lookalikes.
        'А' => 'A',
        'В' => 'B',
        'Е' => 'E',
        'І' => 'I',
        'Ј' => 'J',
        'К' => 'K',
        'М' => 'M',
        'Н' => 'H',
        'О' => 'O',
        'Р' => 'P',
        'С' => 'C',
        'Т' => 'T',
        'Х' => 'X',
        'Ү' => 'Y',
        _ => return None,
    };
    Some(latin)
}
/// Maps a Greek lookalike to the Latin letter it imitates.
///
/// Returns `None` for any character that is not in the lookalike table.
/// The left-hand literals are Greek letters even where they render identically
/// to the Latin letter on the right.
fn greek_to_latin(ch: char) -> Option<char> {
    let latin = match ch {
        // Lowercase lookalikes.
        'α' => 'a',
        'β' => 'b',
        'ε' => 'e',
        'ι' => 'i',
        'κ' => 'k',
        'ν' => 'v',
        'ο' => 'o',
        'ρ' => 'p',
        'τ' => 't',
        'υ' => 'u',
        'χ' => 'x',
        'ω' => 'w',
        // Uppercase lookalikes.
        'Α' => 'A',
        'Β' => 'B',
        'Ε' => 'E',
        'Η' => 'H',
        'Ι' => 'I',
        'Κ' => 'K',
        'Μ' => 'M',
        'Ν' => 'N',
        'Ο' => 'O',
        'Ρ' => 'P',
        'Τ' => 'T',
        'Υ' => 'Y',
        'Χ' => 'X',
        'Ζ' => 'Z',
        _ => return None,
    };
    Some(latin)
}
/// Returns `true` when `ch` is a fullwidth variant of a printable ASCII character
/// (U+FF01 through U+FF5E).
///
/// The range is deliberately limited to the fullwidth ASCII variants rather than
/// the whole "Halfwidth and Fullwidth Forms" block: the rest of that block holds
/// halfwidth katakana/hangul and a few symbol variants that have no ASCII
/// counterpart and occur in ordinary text, so flagging them would be a false positive.
fn is_fullwidth(ch: char) -> bool {
    matches!(ch, '\u{FF01}'..='\u{FF5E}')
}
/// Converts a fullwidth ASCII variant (U+FF01..=U+FF5E) to its ASCII counterpart.
///
/// Fullwidth variants sit at a fixed offset of `0xFEE0` above the ASCII character
/// they mirror, so the conversion is a subtraction. Any character outside that
/// range is returned unchanged.
fn fullwidth_to_ascii(ch: char) -> char {
    match u32::from(ch) {
        // 0xFF01 - 0xFEE0 = 0x21 ('!') ... 0xFF5E - 0xFEE0 = 0x7E ('~'), all valid
        // scalar values; the fallback only exists to avoid a panic path.
        code @ 0xFF01..=0xFF5E => std::char::from_u32(code - 0xFEE0).unwrap_or(ch),
        _ => ch,
    }
}
/// Reports whether `ch` is an invisible evasion character: either a zero-width
/// character or a directional-override control.
pub fn is_evasion_char(ch: char) -> bool {
    if is_zero_width(ch) {
        return true;
    }
    is_rtl_override(ch)
}
/// Returns `true` when `ch` is one of the invisible, zero-width characters this
/// module treats as evasion:
///
/// * U+180E
/// * U+200B through U+200F
/// * U+2060
/// * U+FEFF
fn is_zero_width(ch: char) -> bool {
    matches!(
        ch,
        '\u{180E}' | '\u{200B}'..='\u{200F}' | '\u{2060}' | '\u{FEFF}'
    )
}
/// Returns `true` when `ch` is an explicit bidirectional formatting control that
/// can visually reorder surrounding text.
///
/// Covers both families of explicit controls:
///
/// * U+202A..=U+202E — embeddings, overrides, and pop directional formatting
/// * U+2066..=U+2069 — isolates and pop directional isolate
///
/// The isolate controls are included because they reorder displayed text just
/// like the override controls and are used in the same kind of attack.
fn is_rtl_override(ch: char) -> bool {
    matches!(ch, '\u{202A}'..='\u{202E}' | '\u{2066}'..='\u{2069}')
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Cyrillic lookalikes embedded after an ASCII prefix are reported with
    /// their byte offset and Latin replacement.
    #[test]
    fn test_detect_cyrillic_homoglyph() {
        // "ghp_" followed by Cyrillic letters, written as escapes so the
        // non-ASCII characters are unmistakable in source.
        let text = "ghp_\u{0441}\u{0435}\u{043A}\u{0440}\u{0435}\u{0442}";
        let attacks = detect_unicode_attacks(text);
        assert!(!attacks.is_empty());
        assert!(attacks
            .iter()
            .any(|a| a.kind == EvasionKind::CyrillicHomoglyph));

        // The first flagged character is the Cyrillic letter right after the
        // four ASCII bytes of the prefix.
        let first = &attacks[0];
        assert_eq!(first.position, 4);
        assert_eq!(first.char, '\u{0441}');
        assert_eq!(first.replacement, Some('c'));
    }

    /// Fullwidth letters and a Cyrillic lookalike are folded to plain ASCII.
    #[test]
    fn test_normalize_homoglyphs() {
        // Fullwidth "ghp", an ASCII underscore, then "token" with a Cyrillic "o".
        let text = "\u{FF47}\u{FF48}\u{FF50}_t\u{043E}ken";
        let normalized = normalize_homoglyphs(text);
        assert_eq!(normalized, "ghp_token");
    }

    /// Zero-width characters are removed without disturbing their neighbours.
    #[test]
    fn test_remove_zero_width() {
        let text = "ghp_\u{200B}secret";
        let normalized = normalize_homoglyphs(text);
        assert!(!normalized.contains('\u{200B}'));
        assert_eq!(normalized, "ghp_secret");
    }

    /// A base letter followed by a combining mark is composed by the NFC pass.
    #[test]
    fn test_full_normalize() {
        let text = "ghp_\u{0065}\u{0308}secret";
        let normalized = full_normalize(text);
        assert_eq!(normalized, "ghp_\u{00EB}secret");
    }

    /// Plain ASCII text is not reported as evasive.
    #[test]
    fn test_plain_ascii_has_no_evasion() {
        assert!(!contains_evasion("ghp_plain_ascii_token"));
        assert!(detect_unicode_attacks("").is_empty());
    }
}