simd_normalizer/confusable.rs
1//! UTS #39 confusable skeleton mapping.
2//!
3//! Implements the skeleton algorithm from UTS #39 Section 4:
4//! `skeleton(input) = NFD(confusable_map(NFD(input)))`
5//!
6//! Two strings are confusable if they produce the same skeleton.
7
8use alloc::string::String;
9
10use crate::tables::{self, ConfusableResult};
11
12/// Apply the confusable mapping to a single character.
13///
14/// Pushes the mapped character(s) to `out`. If the character has no
15/// confusable mapping, it is pushed unchanged.
16#[inline]
17fn confusable_map_char(c: char, out: &mut String) {
18 match tables::lookup_confusable(c) {
19 ConfusableResult::None => out.push(c),
20 ConfusableResult::Single(mapped) => out.push(mapped),
21 ConfusableResult::Expansion { offset, length } => {
22 let data = tables::confusable_expansion_data(offset, length);
23 for &cp in data {
24 // SAFETY: expansion table contains only valid Unicode scalar values.
25 debug_assert!(cp <= 0x10FFFF && !(0xD800..=0xDFFF).contains(&cp));
26 let ch = unsafe { char::from_u32_unchecked(cp) };
27 out.push(ch);
28 }
29 },
30 }
31}
32
33/// Compute the UTS #39 skeleton of a string.
34///
35/// The skeleton algorithm: `NFD(confusable_map(NFD(input)))`, iterated
36/// to a fixed point. Iteration is required because some confusable
37/// expansions produce characters that themselves have confusable
38/// mappings; UTS #39 specifies that `skeleton(skeleton(X)) = skeleton(X)`,
39/// so we converge before returning.
40///
41/// Two strings are confusable (visually similar) if and only if
42/// `skeleton(a) == skeleton(b)`.
43pub fn skeleton(input: &str) -> String {
44 if input.is_empty() {
45 return String::new();
46 }
47
48 // Start from NFD(input); subsequent passes operate on the previous output.
49 let mut current: String = crate::nfd().normalize(input).into_owned();
50
51 // Apply confusable_map + NFD until a fixed point. A small iteration
52 // cap guards against pathological inputs; in practice 1–2 passes suffice.
53 for _ in 0..8 {
54 let mut mapped = String::with_capacity(current.len());
55 for ch in current.chars() {
56 confusable_map_char(ch, &mut mapped);
57 }
58 let next = crate::nfd().normalize(&mapped).into_owned();
59 if next == current {
60 return next;
61 }
62 current = next;
63 }
64 current
65}
66
67/// Check whether two strings are confusable (have the same skeleton).
68pub fn are_confusable(a: &str, b: &str) -> bool {
69 skeleton(a) == skeleton(b)
70}
71
72#[cfg(test)]
73mod tests {
74 use super::*;
75
76 #[test]
77 fn skeleton_empty() {
78 assert_eq!(skeleton(""), "");
79 }
80
81 #[test]
82 fn skeleton_ascii_unchanged() {
83 // Most lowercase ASCII maps to itself.
84 let s = skeleton("hello");
85 // 'h', 'e', 'l', 'l', 'o' — none are confusable prototypes typically
86 // but let's just check it doesn't panic.
87 assert!(!s.is_empty());
88 }
89
90 #[test]
91 fn confusable_latin_cyrillic_a() {
92 // Latin 'a' (U+0061) and Cyrillic 'а' (U+0430) should produce the
93 // same skeleton.
94 assert!(
95 are_confusable("a", "\u{0430}"),
96 "Latin 'a' and Cyrillic 'а' should be confusable"
97 );
98 }
99
100 #[test]
101 fn confusable_latin_cyrillic_word() {
102 // "apple" in Latin vs "аррlе" with Cyrillic а, р, р, and е
103 // The Cyrillic lookalikes: а=U+0430, р=U+0440, е=U+0435
104 let latin = "apple";
105 let mixed = "\u{0430}\u{0440}\u{0440}l\u{0435}";
106 assert!(are_confusable(latin, mixed));
107 }
108
109 #[test]
110 fn not_confusable_different_strings() {
111 assert!(!are_confusable("hello", "world"));
112 }
113
114 #[test]
115 fn confusable_identical_strings() {
116 assert!(are_confusable("test", "test"));
117 }
118
119 #[test]
120 fn skeleton_convergence() {
121 // UTS #39 requires skeleton(skeleton(X)) == skeleton(X).
122 let input = "Hel\u{0430}"; // mix of Latin and Cyrillic
123 let s1 = skeleton(input);
124 let s2 = skeleton(&s1);
125 assert_eq!(s1, s2, "skeleton must be idempotent");
126 }
127
128 #[test]
129 fn skeleton_idempotent_on_cascading_mapping() {
130 // Regression for fuzzer-found case where the confusable table maps
131 // U+1D0E (ᴎ) through multiple hops; a single NFD→map→NFD pass was
132 // not a fixed point. Skeleton must converge regardless.
133 let input = "\u{1D0E}\u{326}\u{306}";
134 let s1 = skeleton(input);
135 let s2 = skeleton(&s1);
136 assert_eq!(s1, s2, "skeleton must be idempotent for cascading maps");
137 }
138
139 #[test]
140 fn confusable_fullwidth() {
141 // Fullwidth Latin A (U+FF21) should be confusable with regular 'A'
142 // (they both map to the same skeleton after confusable mapping).
143 // Note: fullwidth is handled by NFKD, confusable mapping handles others.
144 let s1 = skeleton("A");
145 let s2 = skeleton("\u{FF21}");
146 // These may or may not be equal depending on whether confusables.txt
147 // includes the mapping. At minimum, verify no panic.
148 let _ = (s1, s2);
149 }
150}