/// Lookup tables used by `DecomposeVietnamese` to split a precomposed
/// character into a base character and a combining tone mark.
///
/// Each `*_key` array is searched with `binary_search`; the index of the match
/// selects the entry at the same position in the corresponding `*_value` array.
// NOTE(review): the 64-byte alignment is presumably meant to match a cache
// line — confirm with the original author.
#[repr(align(64))] struct ToneData {
/// Code points (each fits in a byte) that are decomposed only when the
/// iterator's `orthographic` flag is set.
windows_1258_key: [u8; 16],
/// For each `windows_1258_key` entry: the low 7 bits are the ASCII base
/// letter; the high bit selects U+0301 when set and U+0300 when clear.
windows_1258_value: [u8; 16],
/// Code points in U+00C3..=U+0169, stored as the code point minus 0xC3.
middle_key: [u8; 14],
/// For each `middle_key` entry: the low 7 bits are the ASCII base letter.
/// The tone is U+0301 when the base is `Y`/`y`; otherwise the high bit
/// selects U+0303 when set and U+0300 when clear.
middle_value: [u8; 14],
/// Indexed directly by the code point minus 0x1EA0. The low 10 bits are the
/// base character's code point; the upper 6 bits are the tone mark's offset
/// from U+0300.
extensions_for_vietnamese: [u16; 90],
}
/// The single instance of the decomposition tables.
///
/// Both key arrays are listed in ascending order, which `binary_search`
/// requires.
static TONE_DATA: ToneData = ToneData {
// U+00C0..=U+00FA entries, looked up only in orthographic mode.
windows_1258_key: [
0xC0, 0xC1, 0xC8, 0xC9, 0xCD, 0xD3, 0xD9, 0xDA, 0xE0, 0xE1, 0xE8, 0xE9, 0xED, 0xF3, 0xF9, 0xFA, ],
// Same order as the keys above, e.g. 0x41 = 'A' + U+0300, 0xC1 = 'A' + U+0301.
windows_1258_value: [
0x41, 0xC1, 0x45, 0xC5, 0xC9, 0xCF, 0x55, 0xD5, 0x61, 0xE1, 0x65, 0xE5, 0xE9, 0xEF, 0x75, 0xF5, ],
// Code point minus 0xC3, e.g. 0x00 = U+00C3 and 0xA6 = U+0169.
middle_key: [
0x00, 0x09, 0x0F, 0x12, 0x1A, 0x20, 0x29, 0x2F, 0x32, 0x3A, 0x65, 0x66, 0xA5, 0xA6, ],
// Same order as the keys above, e.g. 0xC1 = 'A' + U+0303, 0x49 = 'I' + U+0300,
// 0x59 = 'Y' + U+0301.
middle_value: [
0xC1, 0x49, 0x4F, 0xCF, 0x59, 0xE1, 0x69, 0x6F, 0xEF, 0x79, 0xC9, 0xE9, 0xD5, 0xF5, ],
// One entry per code point in U+1EA0..=U+1EF9,
// e.g. 0x8C41 = 'A' (0x041) + U+0323 (0x23 << 10).
extensions_for_vietnamese: [
0x8C41, 0x8C61, 0x2441, 0x2461, 0x04C2, 0x04E2, 0x00C2, 0x00E2, 0x24C2, 0x24E2, 0x0CC2, 0x0CE2, 0x8CC2, 0x8CE2, 0x0502, 0x0503, 0x0102, 0x0103, 0x2502, 0x2503, 0x0D02, 0x0D03, 0x8D02, 0x8D03, 0x8C45, 0x8C65, 0x2445, 0x2465, 0x0C45, 0x0C65, 0x04CA, 0x04EA, 0x00CA, 0x00EA, 0x24CA, 0x24EA, 0x0CCA, 0x0CEA, 0x8CCA, 0x8CEA, 0x2449, 0x2469, 0x8C49, 0x8C69, 0x8C4F, 0x8C6F, 0x244F, 0x246F, 0x04D4, 0x04F4, 0x00D4, 0x00F4, 0x24D4, 0x24F4, 0x0CD4, 0x0CF4, 0x8CD4, 0x8CF4, 0x05A0, 0x05A1, 0x01A0, 0x01A1, 0x25A0, 0x25A1, 0x0DA0, 0x0DA1, 0x8DA0, 0x8DA1, 0x8C55, 0x8C75, 0x2455, 0x2475, 0x05AF, 0x05B0, 0x01AF, 0x01B0, 0x25AF, 0x25B0, 0x0DAF, 0x0DB0, 0x8DAF, 0x8DB0, 0x0059, 0x0079, 0x8C59, 0x8C79, 0x2459, 0x2479, 0x0C59, 0x0C79, ],
};
/// Widens a 16-bit value to the `char` with the same scalar value.
///
/// A `u16` in the surrogate range (0xD800..=0xDFFF) is not a valid `char`.
/// Instead of constructing one unchecked, which would be undefined behaviour
/// reachable from a safe function, such input maps to U+FFFD.
fn expand(u: u16) -> char {
    char::from_u32(u32::from(u)).unwrap_or(char::REPLACEMENT_CHARACTER)
}
/// Iterator adapter that replaces certain precomposed characters with a base
/// character followed by a combining tone mark.
///
/// Created by `IterDecomposeVietnamese::decompose_vietnamese_tones`.
#[derive(Debug)]
pub struct DecomposeVietnamese<I> {
/// The wrapped source of characters.
delegate: I,
/// Tone mark to yield on the next call to `next`; `'\u{0}'` means that
/// nothing is queued.
pending: char,
/// When `true`, characters found in the `windows_1258_key` table are
/// decomposed as well.
orthographic: bool,
}
impl<I: Iterator<Item = char>> Iterator for DecomposeVietnamese<I> {
    type Item = char;

    /// Yields the next output character.
    ///
    /// A character that has a table entry is emitted as its base character,
    /// and the matching combining tone mark is queued in `pending` for the
    /// following call. Any other character is passed through unchanged.
    #[inline]
    fn next(&mut self) -> Option<char> {
        // Flush the tone mark queued by the previous call, if any.
        if self.pending != '\u{0}' {
            return Some(std::mem::replace(&mut self.pending, '\u{0}'));
        }

        let c = self.delegate.next()?;
        let s = c as usize;

        // U+1EA0..=U+1EF9: direct table lookup. For code points below 0x1EA0
        // the wrapping subtraction produces a huge index, so `get` rejects it.
        let minus_offset = s.wrapping_sub(0x1EA0);
        if let Some(&val) = TONE_DATA.extensions_for_vietnamese.get(minus_offset) {
            // Low 10 bits: base code point. Upper 6 bits: tone offset from U+0300.
            self.pending = expand((val >> 10) + 0x0300);
            return Some(expand(val & 0x3FF));
        }

        // U+00C3..=U+0169: sparse table, always consulted.
        if matches!(c, '\u{C3}'..='\u{0169}') {
            // The range check above bounds the difference by 0xA6, so it fits in a byte.
            let key = (s - 0xC3) as u8;
            if let Ok(i) = TONE_DATA.middle_key.binary_search(&key) {
                let val = TONE_DATA.middle_value[i];
                // Masking with 0x5F folds ASCII case and drops the flag bit.
                self.pending = if (val & 0x5F) == b'Y' {
                    '\u{0301}'
                } else if (val >> 7) == 0 {
                    '\u{0300}'
                } else {
                    '\u{0303}'
                };
                return Some(char::from(val & 0x7F));
            }
        }

        // U+00C0..=U+00FA: sparse table, consulted only in orthographic mode.
        if self.orthographic && matches!(c, '\u{C0}'..='\u{FA}') {
            // The range check above bounds `c` by 0xFA, so the cast is lossless.
            if let Ok(i) = TONE_DATA.windows_1258_key.binary_search(&(c as u8)) {
                let val = TONE_DATA.windows_1258_value[i];
                // High bit clear: U+0300. High bit set: U+0301.
                self.pending = if (val >> 7) == 0 {
                    '\u{0300}'
                } else {
                    '\u{0301}'
                };
                return Some(char::from(val & 0x7F));
            }
        }

        Some(c)
    }

    /// Reports bounds derived from the delegate: every input character yields
    /// one or two output characters, plus one more if a tone mark is queued.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let queued = usize::from(self.pending != '\u{0}');
        let (lower, upper) = self.delegate.size_hint();
        (
            lower.saturating_add(queued),
            upper
                .and_then(|n| n.checked_mul(2))
                .and_then(|n| n.checked_add(queued)),
        )
    }
}
/// Extension trait that adds `decompose_vietnamese_tones` to iterators over
/// `char`.
pub trait IterDecomposeVietnamese<I: Iterator<Item = char>> {
/// Wraps `self` in a `DecomposeVietnamese` adapter.
///
/// `orthographic` is stored in the adapter; when `true`, the adapter also
/// decomposes the characters listed in its `windows_1258_key` table.
fn decompose_vietnamese_tones(self, orthographic: bool) -> DecomposeVietnamese<I>;
}
impl<I> IterDecomposeVietnamese<I> for I
where
    I: Iterator<Item = char>,
{
    /// Builds the adapter around `self` with no tone mark queued.
    #[inline]
    fn decompose_vietnamese_tones(self, orthographic: bool) -> DecomposeVietnamese<I> {
        // `'\u{0}'` is the adapter's "nothing pending" sentinel.
        let pending = '\u{0}';
        DecomposeVietnamese {
            delegate: self,
            pending,
            orthographic,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `nfc`, decomposed in orthographic mode, yields exactly
    /// `base` followed by `tone` and nothing else.
    fn check(nfc: char, base: char, tone: char) {
        let decomposed: Vec<char> = std::iter::once(nfc)
            .decompose_vietnamese_tones(true)
            .collect();
        assert_eq!(decomposed, vec![base, tone]);
    }

    /// Composes every base/tone pair with the NFC normalizer and checks that
    /// the adapter splits the result back into the same pair.
    #[test]
    fn test_tones() {
        let tones = ['\u{0300}', '\u{0309}', '\u{0303}', '\u{0301}', '\u{0323}'];
        let bases = [
            'A', 'a', 'Ă', 'ă', 'Â', 'â', 'E', 'e', 'Ê', 'ê', 'I', 'i', 'O', 'o', 'Ô', 'ô', 'Ơ',
            'ơ', 'U', 'u', 'Ư', 'ư', 'Y', 'y',
        ];
        let normalizer = icu_normalizer::ComposingNormalizer::new_nfc();
        for base in bases.iter().copied() {
            for tone in tones.iter().copied() {
                let pair = [base, tone];
                let composed = normalizer.normalize_iter(pair.iter().copied()).next();
                check(composed.unwrap(), base, tone);
            }
        }
    }
}