homoglyph-detect 0.1.0

Detect Cyrillic, Greek, and fullwidth lookalike characters masquerading as ASCII letters. Intended for prompt-injection and phishing defense. Zero dependencies.
Documentation
//! # homoglyph-detect
//!
//! Detect Cyrillic / Greek / fullwidth lookalike characters masquerading
//! as ASCII letters. Used to catch domain-spoof URLs and content that
//! mixes scripts to slip past keyword filters.
//!
//! Common attack: replace the `a` in `claude` with Cyrillic `а`
//! (U+0430). It renders identically but bypasses keyword matching.
//!
//! ## Example
//!
//! ```
//! use homoglyph_detect::{find_homoglyphs, normalize_to_ascii};
//! let attack = "cl\u{0430}ude"; // Cyrillic 'a'
//! let hits = find_homoglyphs(attack);
//! assert_eq!(hits.len(), 1);
//! assert_eq!(hits[0].ascii_equivalent, 'a');
//! assert_eq!(normalize_to_ascii(attack), "claude");
//! ```

#![deny(missing_docs)]

/// A single lookalike character located in an input string.
///
/// Produced by [`find_homoglyphs`]; each value records what was seen,
/// what ASCII letter it imitates, and where in the input it was found.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Finding {
    /// The non-ASCII character exactly as it occurred in the input.
    pub original: char,
    /// The ASCII letter that `original` is visually imitating.
    pub ascii_equivalent: char,
    /// Byte offset (starting at 0) of `original` within the input string.
    pub byte_pos: usize,
}

/// Collect a [`Finding`] for every character of `s` that
/// [`ascii_equivalent`] recognises as a lookalike.
///
/// Findings are returned in order of appearance. Each `byte_pos` is the
/// byte offset of the character as reported by `str::char_indices`. The
/// returned vector is empty when `s` contains no known lookalikes.
pub fn find_homoglyphs(s: &str) -> Vec<Finding> {
    s.char_indices()
        .filter_map(|(offset, glyph)| {
            ascii_equivalent(glyph).map(|ascii| Finding {
                original: glyph,
                ascii_equivalent: ascii,
                byte_pos: offset,
            })
        })
        .collect()
}

/// True when at least one lookalike is present.
pub fn has_homoglyphs(s: &str) -> bool {
    s.chars().any(|c| ascii_equivalent(c).is_some())
}

/// Build a copy of `s` in which every known lookalike is swapped for the
/// ASCII letter it imitates.
///
/// Characters for which [`ascii_equivalent`] returns `None` are copied
/// through unchanged, so the result is not guaranteed to be pure ASCII.
pub fn normalize_to_ascii(s: &str) -> String {
    // Every substitution is a single-byte ASCII letter replacing a multi-byte
    // char, so the input length is an upper bound on the output length.
    let mut normalized = String::with_capacity(s.len());
    for glyph in s.chars() {
        normalized.push(ascii_equivalent(glyph).unwrap_or(glyph));
    }
    normalized
}

/// Per-char lookalike → ASCII mapping.
///
/// Returns `Some(letter)` when `c` is a known confusable for an ASCII
/// letter (the case of the impersonated letter is preserved), and `None`
/// otherwise. Genuine ASCII characters always yield `None`.
///
/// Three groups of characters are recognised:
///
/// - Cyrillic letters that render like Latin letters,
/// - Greek letters that render like Latin letters,
/// - fullwidth Latin letters (U+FF21..=U+FF3A and U+FF41..=U+FF5A).
///
/// The Cyrillic and Greek entries are a hand-curated set of near-identical
/// glyphs, not the complete Unicode confusables data.
pub fn ascii_equivalent(c: char) -> Option<char> {
    match c {
        // Cyrillic lowercase.
        '\u{0430}' => Some('a'), // а
        '\u{0435}' => Some('e'), // е
        '\u{043E}' => Some('o'), // о
        '\u{0440}' => Some('p'), // р
        '\u{0441}' => Some('c'), // с
        '\u{0445}' => Some('x'), // х
        '\u{0443}' => Some('y'), // у
        '\u{0455}' => Some('s'), // ѕ (dze)
        '\u{0456}' => Some('i'), // і (Byelorussian-Ukrainian i)
        '\u{0458}' => Some('j'), // ј (je)
        '\u{04BB}' => Some('h'), // һ (shha)
        '\u{04CF}' => Some('l'), // ӏ (small palochka)
        // Cyrillic uppercase.
        '\u{0405}' => Some('S'), // Ѕ (dze)
        '\u{0406}' => Some('I'), // І (Byelorussian-Ukrainian I)
        '\u{0408}' => Some('J'), // Ј (je)
        '\u{0410}' => Some('A'),
        '\u{0412}' => Some('B'),
        '\u{0415}' => Some('E'),
        '\u{041A}' => Some('K'),
        '\u{041C}' => Some('M'),
        '\u{041D}' => Some('H'),
        '\u{041E}' => Some('O'),
        '\u{0420}' => Some('P'),
        '\u{0421}' => Some('C'),
        '\u{0422}' => Some('T'),
        '\u{0425}' => Some('X'),
        // Greek lowercase.
        '\u{03B1}' => Some('a'), // α
        '\u{03BF}' => Some('o'), // ο
        '\u{03C1}' => Some('p'), // ρ
        // Greek uppercase.
        '\u{0391}' => Some('A'),
        '\u{0392}' => Some('B'),
        '\u{0395}' => Some('E'),
        '\u{0396}' => Some('Z'),
        '\u{0397}' => Some('H'),
        '\u{0399}' => Some('I'),
        '\u{039A}' => Some('K'),
        '\u{039C}' => Some('M'),
        '\u{039D}' => Some('N'),
        '\u{039F}' => Some('O'),
        '\u{03A1}' => Some('P'),
        '\u{03A4}' => Some('T'),
        '\u{03A5}' => Some('Y'), // Υ (upsilon)
        '\u{03A7}' => Some('X'),
        // Fullwidth Latin letters sit in the same order as their ASCII
        // counterparts, so the offset within the fullwidth range is also the
        // offset from 'A' / 'a'. The range pattern bounds that offset to
        // 0..=25, so `char::from_u32` always receives a valid ASCII letter.
        '\u{FF21}'..='\u{FF3A}' => char::from_u32(u32::from(c) - 0xFF21 + u32::from('A')),
        '\u{FF41}'..='\u{FF5A}' => char::from_u32(u32::from(c) - 0xFF41 + u32::from('a')),
        _ => None,
    }
}