detone/
lib.rs

1// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! An iterator adapter that takes an iterator over `char` yielding a sequence of
11//! `char`s in Normalization Form C (this precondition is not checked!) and
12//! yields `char`s either such that tone marks that wouldn't otherwise fit into
13//! windows-1258 are decomposed or such that text is decomposed into orthographic
14//! units.
15//!
16//! Use cases include preprocessing before encoding Vietnamese text into
17//! windows-1258 or converting precomposed Vietnamese text into a form that looks
18//! like it was written with the (non-IME) Vietnamese keyboard layout (e.g. for
19//! machine learning training or benchmarking purposes).
20
/// Packed lookup tables used to split a precomposed Vietnamese letter into a
/// base letter followed by a combining tone mark.
///
/// Each `*_key` array is paired with the `*_value` array of the same name: the
/// value at a given index is the packed decomposition of the key at that index.
#[repr(align(64))] // Align to cache lines
struct ToneData {
    /// Code points (each fits in one byte) that are decomposed only in
    /// orthographic mode. Searched with `binary_search`.
    windows_1258_key: [u8; 16],
    /// Packed decompositions for `windows_1258_key`: the low 7 bits are the
    /// base letter, the top bit is the tone mark minus 0x0300.
    windows_1258_value: [u8; 16],
    /// Code points, stored minus 0xC3 so they fit in one byte, of assorted
    /// letters that are decomposed in both modes. Searched with
    /// `binary_search`.
    middle_key: [u8; 14],
    /// Packed decompositions for `middle_key`: the low 7 bits are the base
    /// letter, the top bit selects between grave (0) and tilde (1), except
    /// that a base of "Y"/"y" always takes the acute accent.
    middle_value: [u8; 14],
    /// Packed decompositions indexed by `code point - 0x1EA0`: the low 10 bits
    /// are the base letter, the high 6 bits are the tone mark minus 0x0300.
    extensions_for_vietnamese: [u16; 90],
}
29
// These arrays list the actual decomposed code point combinations that should
// replace a single, composed code point. For example, 0x1EA0 ("Ạ") decomposes
// into 0x0041 ("A") + 0x0323 ("Combining dot below"). Decompositions for
// windows-1258 are always two code points.
//
// Entries here pack information about both decomposed points into a single
// integer, where the lower bits indicate the first code point, and the upper
// bits indicate the second code point (usually a tone mark). There are three
// sets of decompositions, and each packs the two code points together
// differently to make efficient use of memory.
//
// The two `*_key` arrays are looked up with `binary_search`, so their entries
// must stay in ascending order.
static TONE_DATA: ToneData = ToneData {
    // Index for orthographic-only decompositions. This array lists the Unicode
    // code points that can be decomposed, while the `windows_1258_value` array
    // lists the actual decompositions for them at the corresponding index.
    windows_1258_key: [
        0xC0, // À
        0xC1, // Á
        0xC8, // È
        0xC9, // É
        0xCD, // Í
        0xD3, // Ó
        0xD9, // Ù
        0xDA, // Ú
        0xE0, // à
        0xE1, // á
        0xE8, // è
        0xE9, // é
        0xED, // í
        0xF3, // ó
        0xF9, // ù
        0xFA, // ú
    ],
    // Orthographic-only decompositions. Given a composed code point, find it
    // in the above array, then look for the decomposition at the corresponding
    // index in this array.
    //
    // The lower 7 bits of a value are the first replacement code point. The
    // upper bit is the second code point, offset by negative 0x0300. For
    // example, the decomposition of 0xC0 ("À") is 0x41, which represents the
    // code points 0x41 ("A") + 0x0300 ("Combining grave accent"):
    //
    //     0x41  =  0b_0100_0001
    //     First:   0b_0100_0001 = 0x41
    //                  ^^^^^^^^   (Lower 7 bits)
    //     Second:  0b_0         = 0x00 + 0x300 = 0x0300
    //                 ^           (Upper bit)
    //
    windows_1258_value: [
        0x41, // À
        0xC1, // Á
        0x45, // È
        0xC5, // É
        0xC9, // Í
        0xCF, // Ó
        0x55, // Ù
        0xD5, // Ú
        0x61, // à
        0xE1, // á
        0x65, // è
        0xE5, // é
        0xE9, // í
        0xEF, // ó
        0x75, // ù
        0xF5, // ú
    ],
    // Index for decompositions of assorted code points outside the range
    // 0x1ea0 - 0x1ef9. This array lists the Unicode code points that can be
    // decomposed (offset by negative 0xC3 so they fit in one byte). The actual
    // decomposition is at the corresponding index of the `middle_value` array.
    middle_key: [
        0x00, // Ã
        0x09, // Ì
        0x0F, // Ò
        0x12, // Õ
        0x1A, // Ý
        0x20, // ã
        0x29, // ì
        0x2F, // ò
        0x32, // õ
        0x3A, // ý
        0x65, // Ĩ
        0x66, // ĩ
        0xA5, // Ũ
        0xA6, // ũ
    ],
    // Decompositions. Given a composed code point, find it in the above array,
    // then find the decomposition at the corresponding index in this array.
    //
    // The lower 7 bits of a value are the first replacement code point. The
    // second code point is more complicated:
    //   - If the first point is 0x59 ("Y") or 0x79 ("y"), it is 0x0301
    //     ("Combining Acute Accent"). For these, ignore the upper bit.
    //   - If the upper bit is 0, it is 0x0300 ("Combining Grave Accent").
    //   - If the upper bit is 1, it is 0x0303 ("Combining Tilde")
    //
    // For example, the decomposition of 0xC3 ("Ã") is 0xC1, which is the code
    // points 0x41 ("A") + 0x0303 ("Combining tilde"):
    //
    //     0xC1  =  0b_1100_0001
    //     First:   0b_0100_0001 = 0x41
    //                  ^^^^^^^^   (Lower 7 bits)
    //     Second:  0b_1         = 0x01 -> 0x0303
    //                 ^           (Upper bit)
    //
    middle_value: [
        0xC1, // Ã
        0x49, // Ì
        0x4F, // Ò
        0xCF, // Õ
        0x59, // Ý
        0xE1, // ã
        0x69, // ì
        0x6F, // ò
        0xEF, // õ
        0x79, // ý
        0xC9, // Ĩ
        0xE9, // ĩ
        0xD5, // Ũ
        0xF5, // ũ
    ],
    // Decompositions for code points in the range 0x1ea0 - 0x1ef9 (the main
    // range of composed vowels + accents + tone marks used in Vietnamese).
    //
    // Decompositions are listed in order, so the decomposition for code point
    // 0x1ea0 is at index 0, 0x1ea1 is at index 1, etc.
    //
    // The lower 10 bits of a value are the first replacement code point. The
    // upper 6 bits are the second code point, offset by negative 0x0300. For
    // example, the decomposition of 0x1EA0 ("Ạ") is 0x8C41, which represents
    // the code points 0x41 ("A") + 0x0323 ("Combining dot below"):
    //
    //     0x8C41 =  0b_1000_1100_0100_0001
    //     First:    0b_0000_0000_0100_0001 = 0x41
    //                         ^^^^^^^^^^^^   (Lower 10 bits)
    //     Second:   0b_1000_11             = 0x23 + 0x300 = 0x0323
    //                  ^^^^^^^               (Upper 6 bits)
    //
    extensions_for_vietnamese: [
        0x8C41, // Ạ
        0x8C61, // ạ
        0x2441, // Ả
        0x2461, // ả
        0x04C2, // Ấ
        0x04E2, // ấ
        0x00C2, // Ầ
        0x00E2, // ầ
        0x24C2, // Ẩ
        0x24E2, // ẩ
        0x0CC2, // Ẫ
        0x0CE2, // ẫ
        0x8CC2, // Ậ
        0x8CE2, // ậ
        0x0502, // Ắ
        0x0503, // ắ
        0x0102, // Ằ
        0x0103, // ằ
        0x2502, // Ẳ
        0x2503, // ẳ
        0x0D02, // Ẵ
        0x0D03, // ẵ
        0x8D02, // Ặ
        0x8D03, // ặ
        0x8C45, // Ẹ
        0x8C65, // ẹ
        0x2445, // Ẻ
        0x2465, // ẻ
        0x0C45, // Ẽ
        0x0C65, // ẽ
        0x04CA, // Ế
        0x04EA, // ế
        0x00CA, // Ề
        0x00EA, // ề
        0x24CA, // Ể
        0x24EA, // ể
        0x0CCA, // Ễ
        0x0CEA, // ễ
        0x8CCA, // Ệ
        0x8CEA, // ệ
        0x2449, // Ỉ
        0x2469, // ỉ
        0x8C49, // Ị
        0x8C69, // ị
        0x8C4F, // Ọ
        0x8C6F, // ọ
        0x244F, // Ỏ
        0x246F, // ỏ
        0x04D4, // Ố
        0x04F4, // ố
        0x00D4, // Ồ
        0x00F4, // ồ
        0x24D4, // Ổ
        0x24F4, // ổ
        0x0CD4, // Ỗ
        0x0CF4, // ỗ
        0x8CD4, // Ộ
        0x8CF4, // ộ
        0x05A0, // Ớ
        0x05A1, // ớ
        0x01A0, // Ờ
        0x01A1, // ờ
        0x25A0, // Ở
        0x25A1, // ở
        0x0DA0, // Ỡ
        0x0DA1, // ỡ
        0x8DA0, // Ợ
        0x8DA1, // ợ
        0x8C55, // Ụ
        0x8C75, // ụ
        0x2455, // Ủ
        0x2475, // ủ
        0x05AF, // Ứ
        0x05B0, // ứ
        0x01AF, // Ừ
        0x01B0, // ừ
        0x25AF, // Ử
        0x25B0, // ử
        0x0DAF, // Ữ
        0x0DB0, // ữ
        0x8DAF, // Ự
        0x8DB0, // ự
        0x0059, // Ỳ
        0x0079, // ỳ
        0x8C59, // Ỵ
        0x8C79, // ỵ
        0x2459, // Ỷ
        0x2479, // ỷ
        0x0C59, // Ỹ
        0x0C79, // ỹ
    ],
};
260
/// Widens a 16-bit code point taken from the packed tables into a `char`.
///
/// Every call site in this file passes a value below 0x0400, which is always
/// a valid scalar value. The conversion is nevertheless checked so that this
/// safe function stays sound for *any* `u16`: a surrogate (0xD800..=0xDFFF)
/// yields U+FFFD instead of an invalid `char`. When the argument's range is
/// known at the call site the optimizer can drop the check.
fn expand(u: u16) -> char {
    char::from_u32(u32::from(u)).unwrap_or(char::REPLACEMENT_CHARACTER)
}
264
/// An iterator adapter yielding `char` with tone marks detached.
///
/// Values of this type are created by
/// [`IterDecomposeVietnamese::decompose_vietnamese_tones`]. Like other
/// iterator adapters it is lazy: nothing is read from the wrapped iterator
/// until `next()` is called.
#[derive(Clone, Debug)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct DecomposeVietnamese<I> {
    /// The wrapped iterator supplying the input `char`s.
    delegate: I,
    /// Tone mark split off by the previous `next()` call that still has to be
    /// yielded. U+0000 means that nothing is pending.
    pending: char,
    /// When `true`, letters that have a precomposed form in windows-1258 are
    /// decomposed as well.
    orthographic: bool,
}
272
273impl<I: Iterator<Item = char>> Iterator for DecomposeVietnamese<I> {
274    type Item = char;
275
276    #[inline]
277    fn next(&mut self) -> Option<char> {
278        if self.pending != '\u{0}' {
279            let c = self.pending;
280            self.pending = '\u{0}';
281            return Some(c);
282        }
283        if let Some(c) = self.delegate.next() {
284            let s = c as usize;
285            let minus_offset = s.wrapping_sub(0x1EA0);
286            if minus_offset < TONE_DATA.extensions_for_vietnamese.len() {
287                let val = TONE_DATA.extensions_for_vietnamese[minus_offset];
288                let base = expand(val & 0x3FF);
289                let tone = expand((val >> 10) + 0x0300);
290                self.pending = tone;
291                return Some(base);
292            }
293            if c >= '\u{C3}' && c <= '\u{0169}' {
294                let key = (s - 0xC3) as u8;
295                if let Ok(i) = TONE_DATA.middle_key.binary_search(&key) {
296                    let val = TONE_DATA.middle_value[i];
297                    let base = char::from(val & 0x7F);
298                    let tone = if (val & 0x5F) == b'Y' {
299                        // There has to be a more elegant way to handle this.
300                        '\u{0301}'
301                    } else if (val >> 7) == 0 {
302                        '\u{0300}'
303                    } else {
304                        '\u{0303}'
305                    };
306                    self.pending = tone;
307                    return Some(base);
308                }
309            }
310            if self.orthographic && c >= '\u{C0}' && c <= '\u{FA}' {
311                if let Ok(i) = TONE_DATA.windows_1258_key.binary_search(&(c as u8)) {
312                    let val = TONE_DATA.windows_1258_value[i];
313                    let base = char::from(val & 0x7F);
314                    let tone = (val >> 7) as u16 + 0x0300;
315                    self.pending = expand(tone);
316                    return Some(base);
317                }
318            }
319            return Some(c);
320        }
321        None
322    }
323}
324
325/// Trait that adds a `decompose_vietnamese_tones` method to iterators
326/// over `char`.
327pub trait IterDecomposeVietnamese<I: Iterator<Item = char>> {
328    /// Assuming that `self` is an iterator yielding a sequence of
329    /// `char`s in Normalization Form C (this precondition is not
330    /// checked!), yields a sequence of `char`s with Vietnamese tone
331    /// marks less or more decomposed. Note that the output is _not_
332    /// in Unicode Normalization Form D or any Normalization Form.
333    /// Circumflex and breve are not detached from their base characters.
334    ///
335    /// If `orthographic` is `false`, tone marks are decomposed if
336    /// there is no precomposed form form the incoming character in
337    /// windows-1258. E.g. á is not decomposed, but ý is decomposed to
338    /// y followed by combining acute and ấ is decomposed to â followed
339    /// by combining acute.
340    ///
341    /// If `orthographic` is `true`, tone marks are always decomposed.
342    /// That is, even á is decomposed.
343    fn decompose_vietnamese_tones(self, orthographic: bool) -> DecomposeVietnamese<I>;
344}
345
346impl<I: Iterator<Item = char>> IterDecomposeVietnamese<I> for I {
347    /// Assuming that `self` is an iterator yielding a sequence of
348    /// `char`s in Normalization Form C (this precondition is not
349    /// checked!), yields a sequence of `char`s with Vietnamese tone
350    /// marks less or more decomposed. Note that the output is _not_
351    /// in Unicode Normalization Form D or any Normalization Form.
352    /// Circumflex and breve are not detached from their base characters.
353    ///
354    /// If `orthographic` is `false`, tone marks are decomposed if
355    /// there is no precomposed form form the incoming character in
356    /// windows-1258. E.g. á is not decomposed, but ý is decomposed to
357    /// y followed by combining acute and ấ is decomposed to â followed
358    /// by combining acute.
359    ///
360    /// If `orthographic` is `true`, tone marks are always decomposed.
361    /// That is, even á is decomposed.
362    #[inline]
363    fn decompose_vietnamese_tones(self, orthographic: bool) -> DecomposeVietnamese<I> {
364        DecomposeVietnamese {
365            delegate: self,
366            pending: '\u{0}',
367            orthographic: orthographic,
368        }
369    }
370}
371
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that, in orthographic mode, `nfc` is decomposed into exactly
    /// `base` followed by `tone`.
    fn check(nfc: char, base: char, tone: char) {
        let mut decompose_vietnamese = std::iter::once(nfc).decompose_vietnamese_tones(true);
        assert_eq!(decompose_vietnamese.next(), Some(base));
        assert_eq!(decompose_vietnamese.next(), Some(tone));
        assert_eq!(decompose_vietnamese.next(), None);
    }

    /// Every base letter combined with every tone mark must round-trip:
    /// composing with NFC and then decomposing yields the original pair.
    #[test]
    fn test_tones() {
        let normalizer = icu_normalizer::ComposingNormalizer::new_nfc();
        let bases = [
            'A', 'a', 'Ă', 'ă', 'Â', 'â', 'E', 'e', 'Ê', 'ê', 'I', 'i', 'O', 'o', 'Ô', 'ô', 'Ơ',
            'ơ', 'U', 'u', 'Ư', 'ư', 'Y', 'y',
        ];
        let tones = ['\u{0300}', '\u{0309}', '\u{0303}', '\u{0301}', '\u{0323}'];
        for &base in bases.iter() {
            for &tone in tones.iter() {
                let nfc = normalizer
                    .normalize_iter([base, tone].iter().copied())
                    .next()
                    .unwrap();
                check(nfc, base, tone);
            }
        }
    }

    /// In non-orthographic mode only letters without a precomposed form in
    /// windows-1258 are decomposed: á stays, ý and ấ are split.
    #[test]
    fn test_windows_1258_mode() {
        let output: String = "\u{E1}\u{FD}\u{1EA5}"
            .chars()
            .decompose_vietnamese_tones(false)
            .collect();
        assert_eq!(output, "\u{E1}y\u{0301}\u{E2}\u{0301}");
    }

    /// Characters that carry no tone mark are passed through untouched in
    /// both modes, including U+0000 and an already detached combining mark.
    #[test]
    fn test_passthrough() {
        let input = "abc XYZ \u{0}\u{0301}\u{E2}\u{0103}\u{0111}";
        for &orthographic in [false, true].iter() {
            let output: String = input
                .chars()
                .decompose_vietnamese_tones(orthographic)
                .collect();
            assert_eq!(output, input);
        }
    }
}