mur_common/skill/scan/
unicode.rs1use unicode_normalization::UnicodeNormalization;
2
/// Bidirectional formatting controls that the scanner reports as
/// [`UnicodeKind::BidiOverride`].
///
/// Despite the name, the list covers the embedding and isolate controls and
/// their terminators as well as the two true overrides (LRO / RLO).
const BIDI_OVERRIDES: &[char] = &[
    '\u{202A}', // LEFT-TO-RIGHT EMBEDDING (LRE)
    '\u{202B}', // RIGHT-TO-LEFT EMBEDDING (RLE)
    '\u{202C}', // POP DIRECTIONAL FORMATTING (PDF)
    '\u{202D}', // LEFT-TO-RIGHT OVERRIDE (LRO)
    '\u{202E}', // RIGHT-TO-LEFT OVERRIDE (RLO)
    '\u{2066}', // LEFT-TO-RIGHT ISOLATE (LRI)
    '\u{2067}', // RIGHT-TO-LEFT ISOLATE (RLI)
    '\u{2068}', // FIRST STRONG ISOLATE (FSI)
    '\u{2069}', // POP DIRECTIONAL ISOLATE (PDI)
];
7
/// Zero-width characters that the scanner reports as
/// [`UnicodeKind::InvisibleSeparator`].
const INVISIBLE_SEPARATORS: &[char] = &[
    '\u{200B}', // ZERO WIDTH SPACE
    '\u{200C}', // ZERO WIDTH NON-JOINER
    '\u{200D}', // ZERO WIDTH JOINER
    '\u{2060}', // WORD JOINER
    '\u{FEFF}', // ZERO WIDTH NO-BREAK SPACE (byte order mark)
];
9
10#[derive(Debug, PartialEq, Eq)]
11pub struct UnicodeFinding {
12 pub kind: UnicodeKind,
13 pub codepoint: char,
14}
15
/// Category of a [`UnicodeFinding`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeKind {
    /// The text contains a character listed in `BIDI_OVERRIDES`.
    BidiOverride,
    /// The text contains a character listed in `INVISIBLE_SEPARATORS`.
    InvisibleSeparator,
    /// The input differed from its own NFC normalization.
    NotNfc,
}
22
23pub fn scan_unicode(input: &str) -> (String, Vec<UnicodeFinding>) {
24 let mut findings = Vec::new();
25 let nfc: String = input.nfc().collect();
26 if nfc != input {
27 findings.push(UnicodeFinding {
28 kind: UnicodeKind::NotNfc,
29 codepoint: '\u{0}',
30 });
31 }
32 for c in nfc.chars() {
33 if BIDI_OVERRIDES.contains(&c) {
34 findings.push(UnicodeFinding {
35 kind: UnicodeKind::BidiOverride,
36 codepoint: c,
37 });
38 } else if INVISIBLE_SEPARATORS.contains(&c) {
39 findings.push(UnicodeFinding {
40 kind: UnicodeKind::InvisibleSeparator,
41 codepoint: c,
42 });
43 }
44 }
45 (nfc, findings)
46}
47
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when at least one finding has the `wanted` kind.
    fn has_kind(findings: &[UnicodeFinding], wanted: UnicodeKind) -> bool {
        findings.iter().any(|finding| finding.kind == wanted)
    }

    /// Plain ASCII is returned unchanged and produces no findings.
    #[test]
    fn clean_ascii_passes() {
        let (normalized, findings) = scan_unicode("hello world");
        assert_eq!(normalized, "hello world");
        assert!(findings.is_empty());
    }

    /// U+202E (right-to-left override) is reported as a bidi override.
    #[test]
    fn detects_rlo() {
        let (_, findings) = scan_unicode("hello\u{202E}world");
        assert!(has_kind(&findings, UnicodeKind::BidiOverride));
    }

    /// U+200D (zero width joiner) is reported as an invisible separator.
    #[test]
    fn detects_zwj() {
        let (_, findings) = scan_unicode("admin\u{200D}istrator");
        assert!(has_kind(&findings, UnicodeKind::InvisibleSeparator));
    }

    /// Decomposed input is flagged as not NFC and comes back composed.
    #[test]
    fn detects_non_nfc() {
        let decomposed = "cafe\u{0301}";
        let (normalized, findings) = scan_unicode(decomposed);
        assert!(has_kind(&findings, UnicodeKind::NotNfc));
        assert_eq!(normalized, "café");
    }
}