Skip to main content

simd_normalizer/
confusable.rs

1//! UTS #39 confusable skeleton mapping.
2//!
3//! Implements the skeleton algorithm from UTS #39 Section 4:
4//! `skeleton(input) = NFD(confusable_map(NFD(input)))`
5//!
6//! Two strings are confusable if they produce the same skeleton.
7
8use alloc::string::String;
9
10use crate::tables::{self, ConfusableResult};
11
12/// Apply the confusable mapping to a single character.
13///
14/// Pushes the mapped character(s) to `out`. If the character has no
15/// confusable mapping, it is pushed unchanged.
16#[inline]
17fn confusable_map_char(c: char, out: &mut String) {
18    match tables::lookup_confusable(c) {
19        ConfusableResult::None => out.push(c),
20        ConfusableResult::Single(mapped) => out.push(mapped),
21        ConfusableResult::Expansion { offset, length } => {
22            let data = tables::confusable_expansion_data(offset, length);
23            for &cp in data {
24                // SAFETY: expansion table contains only valid Unicode scalar values.
25                debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
26                let ch = unsafe { char::from_u32_unchecked(cp) };
27                out.push(ch);
28            }
29        },
30    }
31}
32
33/// Compute the UTS #39 skeleton of a string.
34///
35/// The skeleton algorithm: `NFD(confusable_map(NFD(input)))`, iterated
36/// to a fixed point. Iteration is required because some confusable
37/// expansions produce characters that themselves have confusable
38/// mappings; UTS #39 specifies that `skeleton(skeleton(X)) = skeleton(X)`,
39/// so we converge before returning.
40///
41/// Two strings are confusable (visually similar) if and only if
42/// `skeleton(a) == skeleton(b)`.
43pub fn skeleton(input: &str) -> String {
44    if input.is_empty() {
45        return String::new();
46    }
47
48    // Start from NFD(input); subsequent passes operate on the previous output.
49    let mut current: String = crate::nfd().normalize(input).into_owned();
50
51    // Apply confusable_map + NFD until a fixed point. A small iteration
52    // cap guards against pathological inputs; in practice 1–2 passes suffice.
53    for _ in 0..8 {
54        let mut mapped = String::with_capacity(current.len());
55        for ch in current.chars() {
56            confusable_map_char(ch, &mut mapped);
57        }
58        let next = crate::nfd().normalize(&mapped).into_owned();
59        if next == current {
60            return next;
61        }
62        current = next;
63    }
64    current
65}
66
67/// Check whether two strings are confusable (have the same skeleton).
68pub fn are_confusable(a: &str, b: &str) -> bool {
69    skeleton(a) == skeleton(b)
70}
71
72#[cfg(test)]
73mod tests {
74    use super::*;
75
76    #[test]
77    fn skeleton_empty() {
78        assert_eq!(skeleton(""), "");
79    }
80
81    #[test]
82    fn skeleton_ascii_unchanged() {
83        // Most lowercase ASCII maps to itself.
84        let s = skeleton("hello");
85        // 'h', 'e', 'l', 'l', 'o' — none are confusable prototypes typically
86        // but let's just check it doesn't panic.
87        assert!(!s.is_empty());
88    }
89
90    #[test]
91    fn confusable_latin_cyrillic_a() {
92        // Latin 'a' (U+0061) and Cyrillic 'а' (U+0430) should produce the
93        // same skeleton.
94        assert!(
95            are_confusable("a", "\u{0430}"),
96            "Latin 'a' and Cyrillic 'а' should be confusable"
97        );
98    }
99
100    #[test]
101    fn confusable_latin_cyrillic_word() {
102        // "apple" in Latin vs "аррlе" with Cyrillic а, р, р, and е
103        // The Cyrillic lookalikes: а=U+0430, р=U+0440, е=U+0435
104        let latin = "apple";
105        let mixed = "\u{0430}\u{0440}\u{0440}l\u{0435}";
106        assert!(are_confusable(latin, mixed));
107    }
108
109    #[test]
110    fn not_confusable_different_strings() {
111        assert!(!are_confusable("hello", "world"));
112    }
113
114    #[test]
115    fn confusable_identical_strings() {
116        assert!(are_confusable("test", "test"));
117    }
118
119    #[test]
120    fn skeleton_convergence() {
121        // UTS #39 requires skeleton(skeleton(X)) == skeleton(X).
122        let input = "Hel\u{0430}"; // mix of Latin and Cyrillic
123        let s1 = skeleton(input);
124        let s2 = skeleton(&s1);
125        assert_eq!(s1, s2, "skeleton must be idempotent");
126    }
127
128    #[test]
129    fn skeleton_idempotent_on_cascading_mapping() {
130        // Regression for fuzzer-found case where the confusable table maps
131        // U+1D0E (ᴎ) through multiple hops; a single NFD→map→NFD pass was
132        // not a fixed point. Skeleton must converge regardless.
133        let input = "\u{1D0E}\u{326}\u{306}";
134        let s1 = skeleton(input);
135        let s2 = skeleton(&s1);
136        assert_eq!(s1, s2, "skeleton must be idempotent for cascading maps");
137    }
138
139    #[test]
140    fn confusable_fullwidth() {
141        // Fullwidth Latin A (U+FF21) should be confusable with regular 'A'
142        // (they both map to the same skeleton after confusable mapping).
143        // Note: fullwidth is handled by NFKD, confusable mapping handles others.
144        let s1 = skeleton("A");
145        let s2 = skeleton("\u{FF21}");
146        // These may or may not be equal depending on whether confusables.txt
147        // includes the mapping. At minimum, verify no panic.
148        let _ = (s1, s2);
149    }
150}