Skip to main content

homoglyph_detect/
lib.rs

1//! # homoglyph-detect
2//!
3//! Detect Cyrillic / Greek / fullwidth lookalike characters masquerading
4//! as ASCII letters. Used to catch domain-spoof URLs and content that
5//! mixes scripts to slip past keyword filters.
6//!
7//! Common attack: replace the `a` in `claude` with Cyrillic `а`
8//! (U+0430). It renders identically but bypasses keyword matching.
9//!
10//! ## Example
11//!
12//! ```
13//! use homoglyph_detect::{find_homoglyphs, normalize_to_ascii};
14//! let attack = "cl\u{0430}ude"; // Cyrillic 'a'
15//! let hits = find_homoglyphs(attack);
16//! assert_eq!(hits.len(), 1);
17//! assert_eq!(hits[0].ascii_equivalent, 'a');
18//! assert_eq!(normalize_to_ascii(attack), "claude");
19//! ```
20
21#![deny(missing_docs)]
22
/// A single lookalike character located in an input string.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Finding {
    /// The lookalike character exactly as it occurred in the input.
    pub original: char,
    /// The ASCII letter that `original` is visually mistaken for.
    pub ascii_equivalent: char,
    /// Byte offset (counted from 0) of `original` within the source string.
    pub byte_pos: usize,
}
33
34/// Return every lookalike position in `s`.
35pub fn find_homoglyphs(s: &str) -> Vec<Finding> {
36    let mut out = Vec::new();
37    for (byte_pos, c) in s.char_indices() {
38        if let Some(eq) = ascii_equivalent(c) {
39            out.push(Finding {
40                original: c,
41                ascii_equivalent: eq,
42                byte_pos,
43            });
44        }
45    }
46    out
47}
48
49/// True when at least one lookalike is present.
50pub fn has_homoglyphs(s: &str) -> bool {
51    s.chars().any(|c| ascii_equivalent(c).is_some())
52}
53
54/// Replace each lookalike with its ASCII equivalent.
55pub fn normalize_to_ascii(s: &str) -> String {
56    s.chars()
57        .map(|c| ascii_equivalent(c).unwrap_or(c))
58        .collect()
59}
60
/// Map a lookalike character to the ASCII letter it imitates.
///
/// Covers Cyrillic and Greek letters that render like Latin letters, plus
/// the fullwidth Latin letters (U+FF21..=U+FF3A and U+FF41..=U+FF5A).
///
/// Returns `Some(letter)`, where `letter` is an ASCII letter, when `c` is a
/// known confusable. Returns `None` for every other character, including
/// plain ASCII letters and fullwidth digits or punctuation.
pub fn ascii_equivalent(c: char) -> Option<char> {
    let letter = match c {
        // Cyrillic lowercase.
        '\u{0430}' => 'a',
        '\u{0435}' => 'e',
        '\u{043E}' => 'o',
        '\u{0440}' => 'p',
        '\u{0441}' => 'c',
        '\u{0443}' => 'y',
        '\u{0445}' => 'x',
        '\u{0455}' => 's', // dze
        '\u{0456}' => 'i', // Byelorussian-Ukrainian i
        '\u{0458}' => 'j', // je
        '\u{04CF}' => 'l', // small palochka
        // Cyrillic uppercase.
        '\u{0405}' => 'S', // dze
        '\u{0406}' => 'I', // Byelorussian-Ukrainian I
        '\u{0408}' => 'J', // je
        '\u{0410}' => 'A',
        '\u{0412}' => 'B',
        '\u{0415}' => 'E',
        '\u{041A}' => 'K',
        '\u{041C}' => 'M',
        '\u{041D}' => 'H',
        '\u{041E}' => 'O',
        '\u{0420}' => 'P',
        '\u{0421}' => 'C',
        '\u{0422}' => 'T',
        '\u{0423}' => 'Y', // capital counterpart of U+0443, mapped above
        '\u{0425}' => 'X',
        // Greek lowercase.
        '\u{03B1}' => 'a',
        '\u{03BF}' => 'o',
        '\u{03C1}' => 'p',
        // Greek uppercase.
        '\u{0391}' => 'A',
        '\u{0392}' => 'B',
        '\u{0395}' => 'E',
        '\u{0396}' => 'Z',
        '\u{0397}' => 'H',
        '\u{0399}' => 'I',
        '\u{039A}' => 'K',
        '\u{039C}' => 'M',
        '\u{039D}' => 'N',
        '\u{039F}' => 'O',
        '\u{03A1}' => 'P',
        '\u{03A4}' => 'T',
        '\u{03A5}' => 'Y', // upsilon
        '\u{03A7}' => 'X',
        // Fullwidth Latin letters sit exactly 0xFEE0 above their ASCII
        // counterparts (U+FF21 - 'A' == U+FF41 - 'a' == 0xFEE0), so one
        // subtraction handles both cases. `char::from_u32` avoids a
        // truncating `as u8` cast; it is always `Some` for these ranges.
        '\u{FF21}'..='\u{FF3A}' | '\u{FF41}'..='\u{FF5A}' => {
            return char::from_u32(c as u32 - 0xFEE0);
        }
        _ => return None,
    };
    Some(letter)
}