ucd_util/name.rs
1/// Normalize the given character name in place according to UAX44-LM2.
2///
3/// See: https://unicode.org/reports/tr44/#UAX44-LM2
4pub fn character_name_normalize(string: &mut String) {
5 let bytes = unsafe {
6 // SAFETY: `character_name_normalize_bytes` guarantees that
7 // `bytes[..len]` is valid UTF-8.
8 string.as_mut_vec()
9 };
10 let len = character_name_normalize_bytes(bytes).len();
11 bytes.truncate(len);
12}
13
/// Apply UAX44-LM2 normalization to a character name held as raw bytes.
///
/// The normalized name is written over the front of `slice` and returned as
/// a sub-slice of it. Spaces, underscores, medial hyphens and every non-ASCII
/// byte are removed, and ASCII capital letters are lowercased. The single
/// medial hyphen that survives is the one in `HANGUL JUNGSEONG O-E`.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM2
fn character_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // Normalized form of everything that precedes the hyphen in
    // `HANGUL JUNGSEONG O-E` (U+1180).
    const O_E_PREFIX: &[u8] = b"hanguljungseongo";

    // According to Unicode 4.8, character names consist only of Latin
    // capital letters A to Z, ASCII digits, ASCII space or ASCII hyphen.
    // Therefore, very simplistic case folding on the raw bytes is enough.
    // We do not actually know whether `slice` is all ASCII, so every
    // non-ASCII byte is dropped.
    //
    // `written` never exceeds `read`, so bytes at and after `read` are still
    // the caller's original input when they are inspected.
    let mut written = 0;
    let mut after_letter = false;
    for read in 0..slice.len() {
        let byte = slice[read];
        // SAFETY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, only ASCII bytes are ever emitted.
        let emitted = match byte {
            // Spaces and underscores are always dropped.
            b' ' | b'_' => None,
            b'-' => {
                let following = slice.get(read + 1).copied();
                // A hyphen is medial when it sits directly between two
                // ASCII letters of the input.
                let medial = after_letter
                    && following.map_or(false, |n| n.is_ascii_alphabetic());
                let keep = if medial {
                    // Medial hyphens are dropped, with one exception: the
                    // hyphen in `HANGUL JUNGSEONG O-E`. Anything after the
                    // final E is tolerated as long as it is only spaces and
                    // underscores.
                    let followed_by_e =
                        following == Some(b'E') || following == Some(b'e');
                    let tail_ignorable =
                        slice.get(read + 2..).map_or(true, |tail| {
                            tail.iter().all(|&t| t == b' ' || t == b'_')
                        });
                    followed_by_e
                        && tail_ignorable
                        && slice[..written] == O_E_PREFIX[..]
                } else {
                    true
                };
                if keep {
                    Some(byte)
                } else {
                    None
                }
            }
            // Remaining ASCII is kept, with A-Z folded to a-z.
            _ if byte.is_ascii() => Some(byte.to_ascii_lowercase()),
            // Non-ASCII bytes are dropped.
            _ => None,
        };
        if let Some(out) = emitted {
            slice[written] = out;
            written += 1;
        }
        // Tracks the raw input byte, whether or not it was emitted.
        after_letter = byte.is_ascii_alphabetic();
    }
    &mut slice[..written]
}
71
72/// Normalize the given symbolic name in place according to UAX44-LM3.
73///
74/// A "symbolic name" typically corresponds to property names and property
75/// value aliases. Note, though, that it should not be applied to property
76/// string values.
77///
78/// See: https://unicode.org/reports/tr44/#UAX44-LM2
79pub fn symbolic_name_normalize(string: &mut String) {
80 let bytes = unsafe {
81 // SAFETY: `symbolic_name_normalize_bytes` guarantees that
82 // `bytes[..len]` is valid UTF-8.
83 string.as_mut_vec()
84 };
85 let len = symbolic_name_normalize_bytes(bytes).len();
86 bytes.truncate(len);
87}
88
/// Normalize the given symbolic name in place according to UAX44-LM3.
///
/// A "symbolic name" typically corresponds to property names and property
/// value aliases. Note, though, that it should not be applied to property
/// string values.
///
/// The normalized name is written over the front of `slice` and returned as
/// a sub-slice of it. Spaces, underscores, hyphens and every non-ASCII byte
/// are removed, ASCII capital letters are lowercased, and a leading `is`
/// prefix (in any case) is ignored. Two inputs keep their prefix:
///
/// * a name that is nothing but the prefix (`is`, `is_`, `IS-`, ...)
///   normalizes to `is` rather than to the empty string;
/// * the abbreviation `isc` normalizes to `isc` rather than to `c`.
///
/// The slice returned is guaranteed to be valid UTF-8 for all possible values
/// of `slice`.
///
/// See: https://unicode.org/reports/tr44/#UAX44-LM3
fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] {
    // I couldn't find a place in the standard that specified that property
    // names/aliases had a particular structure (unlike character names), but
    // we assume that it's ASCII only and drop anything that isn't ASCII.

    // Ignore any "is" prefix, provided at least one more byte follows it.
    let starts_with_is =
        slice.len() > 2 && slice[..2].eq_ignore_ascii_case(b"is");
    let start = if starts_with_is { 2 } else { 0 };

    let mut next_write = 0;
    for i in start..slice.len() {
        // SAFETY ARGUMENT: To guarantee that the resulting slice is valid
        // UTF-8, we ensure that the slice contains only ASCII bytes. In
        // particular, we drop every non-ASCII byte from the normalized string.
        let b = slice[i];
        if b == b' ' || b == b'_' || b == b'-' || !b.is_ascii() {
            continue;
        }
        slice[next_write] = b.to_ascii_lowercase();
        next_write += 1;
    }

    // `starts_with_is` implies `slice.len() >= 3`, so both fix-ups below stay
    // in bounds.
    if starts_with_is {
        if next_write == 0 {
            // Only ignorable bytes followed the prefix, so the name is just
            // "is". A bare "is" is kept verbatim (it is too short to be
            // treated as a prefix above); do the same here so that inputs
            // differing only in ignorable bytes normalize identically.
            slice[..2].copy_from_slice(b"is");
            next_write = 2;
        } else if next_write == 1 && slice[0] == b'c' {
            // Special case: ISO_Comment has a 'isc' abbreviation. Since we
            // generally ignore 'is' prefixes, the 'isc' abbreviation gets
            // caught in the cross fire and ends up creating an alias for 'c'
            // to 'ISO_Comment', but it is actually an alias for the 'Other'
            // general category.
            slice[..3].copy_from_slice(b"isc");
            next_write = 3;
        }
    }
    &mut slice[..next_write]
}
143
#[cfg(test)]
mod tests {
    use super::{
        character_name_normalize, character_name_normalize_bytes,
        symbolic_name_normalize, symbolic_name_normalize_bytes,
    };

    /// Returns the character-name normalization of `s` as a new `String`.
    fn char_norm(s: &str) -> String {
        let mut owned = String::from(s);
        character_name_normalize(&mut owned);
        owned
    }

    /// Returns the symbolic-name normalization of `s` as a new `String`.
    fn sym_norm(s: &str) -> String {
        let mut owned = String::from(s);
        symbolic_name_normalize(&mut owned);
        owned
    }

    #[test]
    fn char_normalize() {
        // (input, expected normalized form)
        let cases = [
            ("HANGUL JUNGSEONG O-E", "hanguljungseongo-e"),
            ("HANGUL JUNGSEONG O-E _", "hanguljungseongo-e"),
            ("zero-width space", "zerowidthspace"),
            ("zerowidthspace", "zerowidthspace"),
            ("ZERO WIDTH SPACE", "zerowidthspace"),
            ("TIBETAN MARK TSA -PHRU", "tibetanmarktsa-phru"),
            ("tibetan_letter_-a", "tibetanletter-a"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(char_norm(input), expected);
        }
    }

    #[test]
    fn sym_normalize() {
        // (input, expected normalized form)
        let cases = [
            ("Line_Break", "linebreak"),
            ("Line-break", "linebreak"),
            ("linebreak", "linebreak"),
            ("BA", "ba"),
            ("ba", "ba"),
            ("Greek", "greek"),
            ("isGreek", "greek"),
            ("IS_Greek", "greek"),
            ("isc", "isc"),
            ("is c", "isc"),
            ("is_c", "isc"),
            ("IS", "is"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(sym_norm(input), expected);
        }
    }

    // The two tests below feed a byte that can never appear in valid UTF-8
    // and check that it is dropped from the output.

    #[test]
    fn valid_utf8_character() {
        let mut raw = Vec::from(&b"abc\xFFxyz"[..]);
        let normalized = character_name_normalize_bytes(&mut raw);
        assert_eq!(normalized, b"abcxyz");
    }

    #[test]
    fn valid_utf8_symbolic() {
        let mut raw = Vec::from(&b"abc\xFFxyz"[..]);
        let normalized = symbolic_name_normalize_bytes(&mut raw);
        assert_eq!(normalized, b"abcxyz");
    }
}
203}