simd_normalizer/confusable.rs
1//! UTS #39 confusable skeleton mapping.
2//!
3//! Implements the skeleton algorithm from UTS #39 Section 4:
4//! `skeleton(input) = NFD(confusable_map(NFD(input)))`
5//!
6//! Two strings are confusable if they produce the same skeleton.
7
8use alloc::string::String;
9
10use crate::tables::{self, ConfusableResult};
11
12/// Apply the confusable mapping to a single character.
13///
14/// Pushes the mapped character(s) to `out`. If the character has no
15/// confusable mapping, it is pushed unchanged.
16///
17/// A 256-byte compile-time bloom filter (see
18/// [`tables::confusable_bloom_might_contain`]) gates the binary search into
19/// the mapping table. The bloom is built from the same `CONFUSABLE_MAPPINGS`
20/// list that `lookup_confusable` consults, so by construction every source
21/// codepoint hashes to a set bit — false negatives are impossible. A clear
22/// bit means the character is provably not in the table and we skip the
23/// search outright. The vast majority of codepoints in real-world text
24/// (ASCII, common Latin-1, CJK ideographs, etc.) are not confusable sources,
25/// so the bloom check eliminates most of the per-codepoint search cost.
26#[inline]
27fn confusable_map_char(c: char, out: &mut String) {
28 if !tables::confusable_bloom_might_contain(c as u32) {
29 out.push(c);
30 return;
31 }
32 match tables::lookup_confusable(c) {
33 ConfusableResult::None => out.push(c),
34 ConfusableResult::Single(mapped) => out.push(mapped),
35 ConfusableResult::Expansion { offset, length } => {
36 let data = tables::confusable_expansion_data(offset, length);
37 for &cp in data {
38 // SAFETY: expansion table contains only valid Unicode scalar values.
39 debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
40 let ch = unsafe { char::from_u32_unchecked(cp) };
41 out.push(ch);
42 }
43 },
44 }
45}
46
47/// Compute the UTS #39 skeleton of a string.
48///
49/// The skeleton algorithm: `NFD(confusable_map(NFD(input)))`, iterated
50/// to a fixed point. Iteration is required because some confusable
51/// expansions produce characters that themselves have confusable
52/// mappings; UTS #39 specifies that `skeleton(skeleton(X)) = skeleton(X)`,
53/// so we converge before returning.
54///
55/// Two strings are confusable (visually similar) if and only if
56/// `skeleton(a) == skeleton(b)`.
57pub fn skeleton(input: &str) -> String {
58 if input.is_empty() {
59 return String::new();
60 }
61
62 // Start from NFD(input); subsequent passes operate on the previous output.
63 let mut current: String = crate::nfd().normalize(input).into_owned();
64
65 // Apply confusable_map + NFD until a fixed point. A small iteration
66 // cap guards against pathological inputs; in practice 1–2 passes suffice.
67 for _ in 0..8 {
68 let mut mapped = String::with_capacity(current.len());
69 for ch in current.chars() {
70 confusable_map_char(ch, &mut mapped);
71 }
72 let next = crate::nfd().normalize(&mapped).into_owned();
73 if next == current {
74 return next;
75 }
76 current = next;
77 }
78 current
79}
80
81/// Check whether two strings are confusable (have the same skeleton).
82pub fn are_confusable(a: &str, b: &str) -> bool {
83 skeleton(a) == skeleton(b)
84}
85
86#[cfg(test)]
87mod tests {
88 use super::*;
89
90 #[test]
91 fn skeleton_empty() {
92 assert_eq!(skeleton(""), "");
93 }
94
95 #[test]
96 fn skeleton_ascii_unchanged() {
97 // Most lowercase ASCII maps to itself.
98 let s = skeleton("hello");
99 // 'h', 'e', 'l', 'l', 'o' — none are confusable prototypes typically
100 // but let's just check it doesn't panic.
101 assert!(!s.is_empty());
102 }
103
104 #[test]
105 fn confusable_latin_cyrillic_a() {
106 // Latin 'a' (U+0061) and Cyrillic 'а' (U+0430) should produce the
107 // same skeleton.
108 assert!(
109 are_confusable("a", "\u{0430}"),
110 "Latin 'a' and Cyrillic 'а' should be confusable"
111 );
112 }
113
114 #[test]
115 fn confusable_latin_cyrillic_word() {
116 // "apple" in Latin vs "аррlе" with Cyrillic а, р, р, and е
117 // The Cyrillic lookalikes: а=U+0430, р=U+0440, е=U+0435
118 let latin = "apple";
119 let mixed = "\u{0430}\u{0440}\u{0440}l\u{0435}";
120 assert!(are_confusable(latin, mixed));
121 }
122
123 #[test]
124 fn not_confusable_different_strings() {
125 assert!(!are_confusable("hello", "world"));
126 }
127
128 #[test]
129 fn confusable_identical_strings() {
130 assert!(are_confusable("test", "test"));
131 }
132
133 #[test]
134 fn skeleton_convergence() {
135 // UTS #39 requires skeleton(skeleton(X)) == skeleton(X).
136 let input = "Hel\u{0430}"; // mix of Latin and Cyrillic
137 let s1 = skeleton(input);
138 let s2 = skeleton(&s1);
139 assert_eq!(s1, s2, "skeleton must be idempotent");
140 }
141
142 #[test]
143 fn skeleton_idempotent_on_cascading_mapping() {
144 // Regression for fuzzer-found case where the confusable table maps
145 // U+1D0E (ᴎ) through multiple hops; a single NFD→map→NFD pass was
146 // not a fixed point. Skeleton must converge regardless.
147 let input = "\u{1D0E}\u{326}\u{306}";
148 let s1 = skeleton(input);
149 let s2 = skeleton(&s1);
150 assert_eq!(s1, s2, "skeleton must be idempotent for cascading maps");
151 }
152
153 #[test]
154 fn confusable_fullwidth() {
155 // Fullwidth Latin A (U+FF21) should be confusable with regular 'A'
156 // (they both map to the same skeleton after confusable mapping).
157 // Note: fullwidth is handled by NFKD, confusable mapping handles others.
158 let s1 = skeleton("A");
159 let s2 = skeleton("\u{FF21}");
160 // These may or may not be equal depending on whether confusables.txt
161 // includes the mapping. At minimum, verify no panic.
162 let _ = (s1, s2);
163 }
164}