Skip to main content

simd_normalizer/
confusable.rs

1//! UTS #39 confusable skeleton mapping.
2//!
3//! Implements the skeleton algorithm from UTS #39 Section 4:
4//! `skeleton(input) = NFD(confusable_map(NFD(input)))`
5//!
6//! Two strings are confusable if they produce the same skeleton.
7
8use alloc::string::String;
9
10use crate::tables::{self, ConfusableResult};
11
12/// Apply the confusable mapping to a single character.
13///
14/// Pushes the mapped character(s) to `out`. If the character has no
15/// confusable mapping, it is pushed unchanged.
16///
17/// A 256-byte compile-time bloom filter (see
18/// [`tables::confusable_bloom_might_contain`]) gates the binary search into
19/// the mapping table. The bloom is built from the same `CONFUSABLE_MAPPINGS`
20/// list that `lookup_confusable` consults, so by construction every source
21/// codepoint hashes to a set bit — false negatives are impossible. A clear
22/// bit means the character is provably not in the table and we skip the
23/// search outright. The vast majority of codepoints in real-world text
24/// (ASCII, common Latin-1, CJK ideographs, etc.) are not confusable sources,
25/// so the bloom check eliminates most of the per-codepoint search cost.
26#[inline]
27fn confusable_map_char(c: char, out: &mut String) {
28    if !tables::confusable_bloom_might_contain(c as u32) {
29        out.push(c);
30        return;
31    }
32    match tables::lookup_confusable(c) {
33        ConfusableResult::None => out.push(c),
34        ConfusableResult::Single(mapped) => out.push(mapped),
35        ConfusableResult::Expansion { offset, length } => {
36            let data = tables::confusable_expansion_data(offset, length);
37            for &cp in data {
38                // SAFETY: expansion table contains only valid Unicode scalar values.
39                debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
40                let ch = unsafe { char::from_u32_unchecked(cp) };
41                out.push(ch);
42            }
43        },
44    }
45}
46
47/// Compute the UTS #39 skeleton of a string.
48///
49/// The skeleton algorithm: `NFD(confusable_map(NFD(input)))`, iterated
50/// to a fixed point. Iteration is required because some confusable
51/// expansions produce characters that themselves have confusable
52/// mappings; UTS #39 specifies that `skeleton(skeleton(X)) = skeleton(X)`,
53/// so we converge before returning.
54///
55/// Two strings are confusable (visually similar) if and only if
56/// `skeleton(a) == skeleton(b)`.
57pub fn skeleton(input: &str) -> String {
58    if input.is_empty() {
59        return String::new();
60    }
61
62    // Start from NFD(input); subsequent passes operate on the previous output.
63    let mut current: String = crate::nfd().normalize(input).into_owned();
64
65    // Apply confusable_map + NFD until a fixed point. A small iteration
66    // cap guards against pathological inputs; in practice 1–2 passes suffice.
67    for _ in 0..8 {
68        let mut mapped = String::with_capacity(current.len());
69        for ch in current.chars() {
70            confusable_map_char(ch, &mut mapped);
71        }
72        let next = crate::nfd().normalize(&mapped).into_owned();
73        if next == current {
74            return next;
75        }
76        current = next;
77    }
78    current
79}
80
81/// Check whether two strings are confusable (have the same skeleton).
82pub fn are_confusable(a: &str, b: &str) -> bool {
83    skeleton(a) == skeleton(b)
84}
85
86#[cfg(test)]
87mod tests {
88    use super::*;
89
90    #[test]
91    fn skeleton_empty() {
92        assert_eq!(skeleton(""), "");
93    }
94
95    #[test]
96    fn skeleton_ascii_unchanged() {
97        // Most lowercase ASCII maps to itself.
98        let s = skeleton("hello");
99        // 'h', 'e', 'l', 'l', 'o' — none are confusable prototypes typically
100        // but let's just check it doesn't panic.
101        assert!(!s.is_empty());
102    }
103
104    #[test]
105    fn confusable_latin_cyrillic_a() {
106        // Latin 'a' (U+0061) and Cyrillic 'а' (U+0430) should produce the
107        // same skeleton.
108        assert!(
109            are_confusable("a", "\u{0430}"),
110            "Latin 'a' and Cyrillic 'а' should be confusable"
111        );
112    }
113
114    #[test]
115    fn confusable_latin_cyrillic_word() {
116        // "apple" in Latin vs "аррlе" with Cyrillic а, р, р, and е
117        // The Cyrillic lookalikes: а=U+0430, р=U+0440, е=U+0435
118        let latin = "apple";
119        let mixed = "\u{0430}\u{0440}\u{0440}l\u{0435}";
120        assert!(are_confusable(latin, mixed));
121    }
122
123    #[test]
124    fn not_confusable_different_strings() {
125        assert!(!are_confusable("hello", "world"));
126    }
127
128    #[test]
129    fn confusable_identical_strings() {
130        assert!(are_confusable("test", "test"));
131    }
132
133    #[test]
134    fn skeleton_convergence() {
135        // UTS #39 requires skeleton(skeleton(X)) == skeleton(X).
136        let input = "Hel\u{0430}"; // mix of Latin and Cyrillic
137        let s1 = skeleton(input);
138        let s2 = skeleton(&s1);
139        assert_eq!(s1, s2, "skeleton must be idempotent");
140    }
141
142    #[test]
143    fn skeleton_idempotent_on_cascading_mapping() {
144        // Regression for fuzzer-found case where the confusable table maps
145        // U+1D0E (ᴎ) through multiple hops; a single NFD→map→NFD pass was
146        // not a fixed point. Skeleton must converge regardless.
147        let input = "\u{1D0E}\u{326}\u{306}";
148        let s1 = skeleton(input);
149        let s2 = skeleton(&s1);
150        assert_eq!(s1, s2, "skeleton must be idempotent for cascading maps");
151    }
152
153    #[test]
154    fn confusable_fullwidth() {
155        // Fullwidth Latin A (U+FF21) should be confusable with regular 'A'
156        // (they both map to the same skeleton after confusable mapping).
157        // Note: fullwidth is handled by NFKD, confusable mapping handles others.
158        let s1 = skeleton("A");
159        let s2 = skeleton("\u{FF21}");
160        // These may or may not be equal depending on whether confusables.txt
161        // includes the mapping. At minimum, verify no panic.
162        let _ = (s1, s2);
163    }
164}