detone/lib.rs
1// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! An iterator adapter that takes an iterator over `char` yielding a sequence of
11//! `char`s in Normalization Form C (this precondition is not checked!) and
12//! yields `char`s either such that tone marks that wouldn't otherwise fit into
13//! windows-1258 are decomposed or such that text is decomposed into orthographic
14//! units.
15//!
16//! Use cases include preprocessing before encoding Vietnamese text into
17//! windows-1258 or converting precomposed Vietnamese text into a form that looks
18//! like it was written with the (non-IME) Vietnamese keyboard layout (e.g. for
19//! machine learning training or benchmarking purposes).
20
/// Packed lookup tables that drive the tone-mark decomposition.
///
/// Each `*_key` array is sorted in ascending order (it is searched with
/// `binary_search`) and runs parallel to the `*_value` array of the same
/// name: the decomposition for `key[i]` is stored in `value[i]`.
#[repr(align(64))] // Align to cache lines
struct ToneData {
    /// Code points below U+0100 that are split only in orthographic mode.
    windows_1258_key: [u8; 16],
    /// Packed decompositions, parallel to `windows_1258_key`.
    windows_1258_value: [u8; 16],
    /// Assorted code points in U+00C3..=U+0169, stored minus 0xC3.
    middle_key: [u8; 14],
    /// Packed decompositions, parallel to `middle_key`.
    middle_value: [u8; 14],
    /// Packed decompositions for U+1EA0..=U+1EF9, indexed by code point
    /// minus 0x1EA0.
    extensions_for_vietnamese: [u16; 90],
}

// Every table entry describes how one precomposed code point is replaced by
// exactly two code points: a base letter followed by a combining tone mark.
// For example, U+1EA0 ("Ạ") becomes U+0041 ("A") + U+0323 ("Combining dot
// below").
//
// Both replacement code points are packed into a single integer: the low bits
// hold the base letter and the high bits select the tone mark. The three
// tables use three different packings so that each one stays as small as
// possible; the packing is described next to each table.
static TONE_DATA: ToneData = ToneData {
    // Keys of the table that is consulted only for orthographic
    // decomposition: these characters exist precomposed in windows-1258.
    // The value at the same index in `windows_1258_value` is the
    // decomposition.
    windows_1258_key: [
        0xC0, 0xC1, // À Á
        0xC8, 0xC9, // È É
        0xCD, // Í
        0xD3, // Ó
        0xD9, 0xDA, // Ù Ú
        0xE0, 0xE1, // à á
        0xE8, 0xE9, // è é
        0xED, // í
        0xF3, // ó
        0xF9, 0xFA, // ù ú
    ],
    // Packing: the low 7 bits are the (ASCII) base letter; the top bit is the
    // tone mark minus 0x0300, i.e.
    //
    //   top bit 0 -> U+0300 ("Combining grave accent")
    //   top bit 1 -> U+0301 ("Combining acute accent")
    //
    // So "À" is stored as b'A' (0x41) and "Á" as b'A' | 0x80 (0xC1).
    windows_1258_value: [
        b'A', b'A' | 0x80, // À Á
        b'E', b'E' | 0x80, // È É
        b'I' | 0x80, // Í
        b'O' | 0x80, // Ó
        b'U', b'U' | 0x80, // Ù Ú
        b'a', b'a' | 0x80, // à á
        b'e', b'e' | 0x80, // è é
        b'i' | 0x80, // í
        b'o' | 0x80, // ó
        b'u', b'u' | 0x80, // ù ú
    ],
    // Keys for assorted decomposable code points that lie outside
    // U+1EA0..=U+1EF9. Each key is the code point minus 0xC3, which makes
    // all of them fit into one byte. The value at the same index in
    // `middle_value` is the decomposition.
    middle_key: [
        0x00, // U+00C3 Ã
        0x09, // U+00CC Ì
        0x0F, // U+00D2 Ò
        0x12, // U+00D5 Õ
        0x1A, // U+00DD Ý
        0x20, // U+00E3 ã
        0x29, // U+00EC ì
        0x2F, // U+00F2 ò
        0x32, // U+00F5 õ
        0x3A, // U+00FD ý
        0x65, // U+0128 Ĩ
        0x66, // U+0129 ĩ
        0xA5, // U+0168 Ũ
        0xA6, // U+0169 ũ
    ],
    // Packing: the low 7 bits are the (ASCII) base letter. The tone mark is
    // chosen as follows:
    //
    //   base is "Y" or "y" -> U+0301 ("Combining acute accent"); the top bit
    //                         is ignored for these
    //   top bit 0          -> U+0300 ("Combining grave accent")
    //   top bit 1          -> U+0303 ("Combining tilde")
    //
    // So "Ã" is stored as b'A' | 0x80 (0xC1): "A" + U+0303.
    middle_value: [
        b'A' | 0x80, // Ã
        b'I', // Ì
        b'O', // Ò
        b'O' | 0x80, // Õ
        b'Y', // Ý
        b'a' | 0x80, // ã
        b'i', // ì
        b'o', // ò
        b'o' | 0x80, // õ
        b'y', // ý
        b'I' | 0x80, // Ĩ
        b'i' | 0x80, // ĩ
        b'U' | 0x80, // Ũ
        b'u' | 0x80, // ũ
    ],
    // Decompositions for the contiguous range U+1EA0..=U+1EF9, the main block
    // of precomposed Vietnamese vowels. Entries are in code point order:
    // U+1EA0 is at index 0, U+1EA1 at index 1, and so on.
    //
    // Packing: the low 10 bits are the base code point; the high 6 bits are
    // the tone mark minus 0x0300. For example "Ạ" is stored as 0x8C41:
    //
    //   0x8C41        = 0b1000_1100_0100_0001
    //   low 10 bits   = 0b00_0100_0001 = 0x041 -> U+0041 ("A")
    //   high 6 bits   = 0b10_0011      = 0x23  -> U+0323 ("Combining dot below")
    //
    // Each line below holds the uppercase entry followed by the lowercase one.
    extensions_for_vietnamese: [
        0x8C41, 0x8C61, // Ạ ạ
        0x2441, 0x2461, // Ả ả
        0x04C2, 0x04E2, // Ấ ấ
        0x00C2, 0x00E2, // Ầ ầ
        0x24C2, 0x24E2, // Ẩ ẩ
        0x0CC2, 0x0CE2, // Ẫ ẫ
        0x8CC2, 0x8CE2, // Ậ ậ
        0x0502, 0x0503, // Ắ ắ
        0x0102, 0x0103, // Ằ ằ
        0x2502, 0x2503, // Ẳ ẳ
        0x0D02, 0x0D03, // Ẵ ẵ
        0x8D02, 0x8D03, // Ặ ặ
        0x8C45, 0x8C65, // Ẹ ẹ
        0x2445, 0x2465, // Ẻ ẻ
        0x0C45, 0x0C65, // Ẽ ẽ
        0x04CA, 0x04EA, // Ế ế
        0x00CA, 0x00EA, // Ề ề
        0x24CA, 0x24EA, // Ể ể
        0x0CCA, 0x0CEA, // Ễ ễ
        0x8CCA, 0x8CEA, // Ệ ệ
        0x2449, 0x2469, // Ỉ ỉ
        0x8C49, 0x8C69, // Ị ị
        0x8C4F, 0x8C6F, // Ọ ọ
        0x244F, 0x246F, // Ỏ ỏ
        0x04D4, 0x04F4, // Ố ố
        0x00D4, 0x00F4, // Ồ ồ
        0x24D4, 0x24F4, // Ổ ổ
        0x0CD4, 0x0CF4, // Ỗ ỗ
        0x8CD4, 0x8CF4, // Ộ ộ
        0x05A0, 0x05A1, // Ớ ớ
        0x01A0, 0x01A1, // Ờ ờ
        0x25A0, 0x25A1, // Ở ở
        0x0DA0, 0x0DA1, // Ỡ ỡ
        0x8DA0, 0x8DA1, // Ợ ợ
        0x8C55, 0x8C75, // Ụ ụ
        0x2455, 0x2475, // Ủ ủ
        0x05AF, 0x05B0, // Ứ ứ
        0x01AF, 0x01B0, // Ừ ừ
        0x25AF, 0x25B0, // Ử ử
        0x0DAF, 0x0DB0, // Ữ ữ
        0x8DAF, 0x8DB0, // Ự ự
        0x0059, 0x0079, // Ỳ ỳ
        0x8C59, 0x8C79, // Ỵ ỵ
        0x2459, 0x2479, // Ỷ ỷ
        0x0C59, 0x0C79, // Ỹ ỹ
    ],
};
260
/// Widens a packed 16-bit code point from the lookup tables into a `char`.
///
/// Every table entry decodes to a value far below the surrogate range, so
/// the fallback is never taken for table data. Should a surrogate value
/// (0xD800..=0xDFFF) be passed anyway, U+FFFD REPLACEMENT CHARACTER is
/// returned instead of constructing an invalid `char`, which keeps this
/// safe function sound for every possible argument.
#[inline]
fn expand(u: u16) -> char {
    std::char::from_u32(u32::from(u)).unwrap_or('\u{FFFD}')
}
264
/// An iterator adapter yielding `char` with tone marks detached.
///
/// The adapter is `Clone` whenever the wrapped iterator is, so a partially
/// consumed decomposition can be forked like the standard iterator adapters.
#[derive(Debug, Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct DecomposeVietnamese<I> {
    /// The wrapped iterator supplying the input characters.
    delegate: I,
    /// Tone mark that still has to be yielded after the base letter that was
    /// returned last. U+0000 means that nothing is buffered.
    pending: char,
    /// Whether characters that exist precomposed in windows-1258 are
    /// decomposed as well.
    orthographic: bool,
}
272
273impl<I: Iterator<Item = char>> Iterator for DecomposeVietnamese<I> {
274 type Item = char;
275
276 #[inline]
277 fn next(&mut self) -> Option<char> {
278 if self.pending != '\u{0}' {
279 let c = self.pending;
280 self.pending = '\u{0}';
281 return Some(c);
282 }
283 if let Some(c) = self.delegate.next() {
284 let s = c as usize;
285 let minus_offset = s.wrapping_sub(0x1EA0);
286 if minus_offset < TONE_DATA.extensions_for_vietnamese.len() {
287 let val = TONE_DATA.extensions_for_vietnamese[minus_offset];
288 let base = expand(val & 0x3FF);
289 let tone = expand((val >> 10) + 0x0300);
290 self.pending = tone;
291 return Some(base);
292 }
293 if c >= '\u{C3}' && c <= '\u{0169}' {
294 let key = (s - 0xC3) as u8;
295 if let Ok(i) = TONE_DATA.middle_key.binary_search(&key) {
296 let val = TONE_DATA.middle_value[i];
297 let base = char::from(val & 0x7F);
298 let tone = if (val & 0x5F) == b'Y' {
299 // There has to be a more elegant way to handle this.
300 '\u{0301}'
301 } else if (val >> 7) == 0 {
302 '\u{0300}'
303 } else {
304 '\u{0303}'
305 };
306 self.pending = tone;
307 return Some(base);
308 }
309 }
310 if self.orthographic && c >= '\u{C0}' && c <= '\u{FA}' {
311 if let Ok(i) = TONE_DATA.windows_1258_key.binary_search(&(c as u8)) {
312 let val = TONE_DATA.windows_1258_value[i];
313 let base = char::from(val & 0x7F);
314 let tone = (val >> 7) as u16 + 0x0300;
315 self.pending = expand(tone);
316 return Some(base);
317 }
318 }
319 return Some(c);
320 }
321 None
322 }
323}
324
325/// Trait that adds a `decompose_vietnamese_tones` method to iterators
326/// over `char`.
327pub trait IterDecomposeVietnamese<I: Iterator<Item = char>> {
328 /// Assuming that `self` is an iterator yielding a sequence of
329 /// `char`s in Normalization Form C (this precondition is not
330 /// checked!), yields a sequence of `char`s with Vietnamese tone
331 /// marks less or more decomposed. Note that the output is _not_
332 /// in Unicode Normalization Form D or any Normalization Form.
333 /// Circumflex and breve are not detached from their base characters.
334 ///
335 /// If `orthographic` is `false`, tone marks are decomposed if
336 /// there is no precomposed form form the incoming character in
337 /// windows-1258. E.g. á is not decomposed, but ý is decomposed to
338 /// y followed by combining acute and ấ is decomposed to â followed
339 /// by combining acute.
340 ///
341 /// If `orthographic` is `true`, tone marks are always decomposed.
342 /// That is, even á is decomposed.
343 fn decompose_vietnamese_tones(self, orthographic: bool) -> DecomposeVietnamese<I>;
344}
345
346impl<I: Iterator<Item = char>> IterDecomposeVietnamese<I> for I {
347 /// Assuming that `self` is an iterator yielding a sequence of
348 /// `char`s in Normalization Form C (this precondition is not
349 /// checked!), yields a sequence of `char`s with Vietnamese tone
350 /// marks less or more decomposed. Note that the output is _not_
351 /// in Unicode Normalization Form D or any Normalization Form.
352 /// Circumflex and breve are not detached from their base characters.
353 ///
354 /// If `orthographic` is `false`, tone marks are decomposed if
355 /// there is no precomposed form form the incoming character in
356 /// windows-1258. E.g. á is not decomposed, but ý is decomposed to
357 /// y followed by combining acute and ấ is decomposed to â followed
358 /// by combining acute.
359 ///
360 /// If `orthographic` is `true`, tone marks are always decomposed.
361 /// That is, even á is decomposed.
362 #[inline]
363 fn decompose_vietnamese_tones(self, orthographic: bool) -> DecomposeVietnamese<I> {
364 DecomposeVietnamese {
365 delegate: self,
366 pending: '\u{0}',
367 orthographic: orthographic,
368 }
369 }
370}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `input` through the adapter and collects everything it yields.
    fn decompose(input: &str, orthographic: bool) -> String {
        input
            .chars()
            .decompose_vietnamese_tones(orthographic)
            .collect()
    }

    /// Asserts that orthographic decomposition of `nfc` yields exactly
    /// `base` followed by `tone`.
    fn check(nfc: char, base: char, tone: char) {
        let mut decompose_vietnamese = std::iter::once(nfc).decompose_vietnamese_tones(true);
        assert_eq!(decompose_vietnamese.next(), Some(base));
        assert_eq!(decompose_vietnamese.next(), Some(tone));
        assert_eq!(decompose_vietnamese.next(), None);
    }

    /// Composes every base letter with every tone mark and checks that
    /// orthographic decomposition takes the result apart again.
    #[test]
    fn test_tones() {
        let normalizer = icu_normalizer::ComposingNormalizer::new_nfc();
        let bases = [
            'A', 'a', 'Ă', 'ă', 'Â', 'â', 'E', 'e', 'Ê', 'ê', 'I', 'i', 'O', 'o', 'Ô', 'ô', 'Ơ',
            'ơ', 'U', 'u', 'Ư', 'ư', 'Y', 'y',
        ];
        let tones = ['\u{0300}', '\u{0309}', '\u{0303}', '\u{0301}', '\u{0323}'];
        for &base in bases.iter() {
            for &tone in tones.iter() {
                let nfc = normalizer
                    .normalize_iter([base, tone].iter().copied())
                    .next()
                    .unwrap();
                check(nfc, base, tone);
            }
        }
    }

    /// Without orthographic mode only characters that windows-1258 lacks in
    /// precomposed form are taken apart.
    #[test]
    fn test_non_orthographic() {
        // á exists in windows-1258 and is left alone.
        assert_eq!(decompose("\u{E1}", false), "\u{E1}");
        // ý does not: y + combining acute.
        assert_eq!(decompose("\u{FD}", false), "y\u{0301}");
        // ấ does not: â + combining acute; the circumflex stays attached.
        assert_eq!(decompose("\u{1EA5}", false), "\u{E2}\u{0301}");
    }

    /// Orthographic mode additionally splits what windows-1258 has precomposed.
    #[test]
    fn test_orthographic_splits_windows_1258_characters() {
        assert_eq!(decompose("\u{E1}", true), "a\u{0301}");
        assert_eq!(decompose("\u{C0}", true), "A\u{0300}");
    }

    /// Characters without a tone mark pass through unchanged in both modes,
    /// including U+0000, which the adapter uses internally as a marker.
    #[test]
    fn test_pass_through() {
        for &orthographic in [false, true].iter() {
            assert_eq!(decompose("", orthographic), "");
            assert_eq!(
                decompose("abc \u{C2}\u{111}\u{0}!", orthographic),
                "abc \u{C2}\u{111}\u{0}!"
            );
            // "Việt": only ệ is decomposed, to ê + combining dot below.
            assert_eq!(
                decompose("Vi\u{1EC7}t", orthographic),
                "Vi\u{EA}\u{0323}t"
            );
        }
    }
}