// mur_common/skill/scan/unicode.rs

1use unicode_normalization::UnicodeNormalization;
2
/// Bidirectional formatting controls that the scanner reports as
/// `UnicodeKind::BidiOverride`: the explicit embeddings, overrides and
/// isolates, together with the characters that terminate them.
const BIDI_OVERRIDES: &[char] = &[
    '\u{202A}', // LEFT-TO-RIGHT EMBEDDING
    '\u{202B}', // RIGHT-TO-LEFT EMBEDDING
    '\u{202C}', // POP DIRECTIONAL FORMATTING
    '\u{202D}', // LEFT-TO-RIGHT OVERRIDE
    '\u{202E}', // RIGHT-TO-LEFT OVERRIDE
    '\u{2066}', // LEFT-TO-RIGHT ISOLATE
    '\u{2067}', // RIGHT-TO-LEFT ISOLATE
    '\u{2068}', // FIRST STRONG ISOLATE
    '\u{2069}', // POP DIRECTIONAL ISOLATE
];
7
/// Zero-width characters that the scanner reports as
/// `UnicodeKind::InvisibleSeparator`; they occupy no visible space, so they
/// can split or join text without the reader noticing.
const INVISIBLE_SEPARATORS: &[char] = &[
    '\u{200B}', // ZERO WIDTH SPACE
    '\u{200C}', // ZERO WIDTH NON-JOINER
    '\u{200D}', // ZERO WIDTH JOINER
    '\u{2060}', // WORD JOINER
    '\u{FEFF}', // ZERO WIDTH NO-BREAK SPACE (byte order mark)
];
9
10#[derive(Debug, PartialEq, Eq)]
11pub struct UnicodeFinding {
12    pub kind: UnicodeKind,
13    pub codepoint: char,
14}
15
/// Category of a suspicious Unicode property reported by the scanner.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeKind {
    /// A bidirectional embedding, override or isolate control character.
    BidiOverride,
    /// A zero-width character that is invisible when rendered.
    InvisibleSeparator,
    /// The input was not already in Unicode Normalization Form C.
    NotNfc,
}
22
23pub fn scan_unicode(input: &str) -> (String, Vec<UnicodeFinding>) {
24    let mut findings = Vec::new();
25    let nfc: String = input.nfc().collect();
26    if nfc != input {
27        findings.push(UnicodeFinding {
28            kind: UnicodeKind::NotNfc,
29            codepoint: '\u{0}',
30        });
31    }
32    for c in nfc.chars() {
33        if BIDI_OVERRIDES.contains(&c) {
34            findings.push(UnicodeFinding {
35                kind: UnicodeKind::BidiOverride,
36                codepoint: c,
37            });
38        } else if INVISIBLE_SEPARATORS.contains(&c) {
39            findings.push(UnicodeFinding {
40                kind: UnicodeKind::InvisibleSeparator,
41                codepoint: c,
42            });
43        }
44    }
45    (nfc, findings)
46}
47
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects just the finding kinds, in the order they were reported.
    fn kinds(findings: &[UnicodeFinding]) -> Vec<UnicodeKind> {
        findings.iter().map(|finding| finding.kind).collect()
    }

    #[test]
    fn clean_ascii_passes() {
        let (n, f) = scan_unicode("hello world");
        assert_eq!(n, "hello world");
        assert!(f.is_empty());
    }

    #[test]
    fn precomposed_non_ascii_passes() {
        // Already-NFC text must not be flagged just because it is non-ASCII.
        let (n, f) = scan_unicode("caf\u{00E9}");
        assert_eq!(n, "caf\u{00E9}");
        assert!(f.is_empty());
    }

    #[test]
    fn detects_rlo() {
        let (n, f) = scan_unicode("hello\u{202E}world");
        // The control character is reported, not stripped.
        assert_eq!(n, "hello\u{202E}world");
        assert_eq!(
            f,
            vec![UnicodeFinding {
                kind: UnicodeKind::BidiOverride,
                codepoint: '\u{202E}',
            }]
        );
    }

    #[test]
    fn detects_zwj() {
        let (n, f) = scan_unicode("admin\u{200D}istrator");
        assert_eq!(n, "admin\u{200D}istrator");
        assert_eq!(
            f,
            vec![UnicodeFinding {
                kind: UnicodeKind::InvisibleSeparator,
                codepoint: '\u{200D}',
            }]
        );
    }

    #[test]
    fn detects_non_nfc() {
        // Escapes keep the expectation independent of the normalization form
        // in which this source file happens to be saved.
        let nfd = "cafe\u{0301}";
        let (n, f) = scan_unicode(nfd);
        assert_eq!(kinds(&f), vec![UnicodeKind::NotNfc]);
        assert_eq!(n, "caf\u{00E9}");
    }
}