unicode_shaper/shape/
buginese.rs

1// https://www.unicode.org/charts/PDF/U1A00.pdf
2// https://r12a.github.io/scripts/bugi/bug.html
3
4use crate::shape::*;
5use alloc::vec::Vec;
6
/// Returns `true` when the UTF-16 code unit `c` lies inside the Buginese
/// block, i.e. U+1A00 through U+1A1F with both ends included.
pub fn is_buginese(c: &u16) -> bool {
    matches!(*c, 0x1A00..=0x1A1F)
}
12
/// Shaping class assigned to a single UTF-16 code unit of a Buginese run.
///
/// The enum carries no data, so it is `Copy` and `Eq` in addition to the
/// traits it already had; existing `.clone()` / `==` uses keep working.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum MType {
    /// Consonants (1A00–1A16)
    C,
    /// Generic base characters (00A0, 00D7, 2012–2015, 2022, 25CC, 25FB–25FE)
    GB,
    /// Joiners: 200D ZWJ (Zero Width Joiner) and 034F CGJ (Combining Grapheme Joiner)
    J,
    /// Any other code unit met in a Buginese run (SCRIPT_COMMON and everything unlisted)
    O,
    /// Reserved characters from the Buginese block (1A1C, 1A1D)
    R,
    /// Symbols (1A1E, 1A1F, A9CF)
    S,
    /// Above-base dependent vowel (1A17, 1A1B)
    VAbv,
    /// Below-base dependent vowel (1A18)
    VBlw,
    /// Pre-base dependent vowel (1A19)
    VPre,
    /// Post-base dependent vowel (1A1A)
    VPst,
    /// Variation selectors (FE00–FE0F)
    VS,
    /// Word joiner (2060)
    WJ,
    /// Non-joiner (200C, Zero Width Non-Joiner)
    NJ,
    /// Whitespace. NOTE(review): the original note lists 0020, 0009, 000A,
    /// 000D, 000C, 0085, 3000 and 200B, but the actual set is whatever the
    /// externally defined `WHITESPACE` pattern covers — confirm there.
    WS,
}
30impl MType {
31    // note: isSpecialSequence is for K, if true, '103A, 1039' come after c
32    fn from_u16(c: &u16) -> MType {
33        match c {
34            // Consonants (1A00-1A16)
35            0x1A00..=0x1A16 => MType::C,
36            // Generic base characters (00A0, 00D7, 2012–2015, 2022, 25CC, 25FB–25FE)
37            0x00A0 | 0x00D7 | 0x2012..=0x2015 | 0x2022 | 0x25CC | 0x25FB..=0x25FE => MType::GB,
38            // Joiners (200C, 200D)
39            0x200D | 0x034F => MType::J,
40            // Reserved characters from the Buginese block (1A1C, 1A1D)
41            0x1A1C | 0x1A1D => MType::R,
42            // Symbols (1A1E, 1A1F, A9CF)
43            0x1A1E | 0x1A1F | 0xA9CF => MType::S,
44            // Above base dependent vowel (1A17, 1A1B)
45            0x1A17 | 0x1A1B => MType::VAbv,
46            // Below base dependent vowel (1A18)
47            0x1A18 => MType::VBlw,
48            // Pre base dependent vowel (1A19)
49            0x1A19 => MType::VPre,
50            // Post base dependent vowel (1A1A)
51            0x1A1A => MType::VPst,
52            // Variation selectors (FE00–FE0F)
53            0xFE00..=0xFE0F => MType::VS,
54            // Word joiner (2060)
55            0x2060 => MType::WJ,
56            // Non-joiner (200C) [Zero Width Non-Joiner]
57            0x200C => MType::NJ,
58            // Whitespace (0020, 0009, 000A, 000D, 000C, 0085, 3000, 200B)
59            &WHITESPACE => MType::WS,
60            _ => MType::O,
61        }
62    }
63
64    fn is_same(&self, other: &u16) -> bool {
65        let other_type = MType::from_u16(other);
66        *self == other_type || other_type == MType::J
67    }
68}
69
70#[derive(Clone)]
71struct Definition<'a> {
72    // cluster definition
73    m_type: MType,
74    // reference to a slice of an original string
75    code: &'a [u16],
76}
77impl<'a> Definition<'a> {
78    fn new(m_type: MType, code: &'a [u16]) -> Self {
79        Self { m_type, code }
80    }
81
82    fn build_definition(input: &[u16]) -> Vec<Definition> {
83        let mut clusters = Vec::new();
84
85        let mut idx: usize = 0;
86        while idx < input.len() {
87            let start_idx = idx;
88            let code = &input[idx];
89            let m_type = MType::from_u16(code);
90            let mut end_idx = idx + 1;
91            // while we find joiners and the same type, continue
92            while end_idx < input.len() && MType::is_same(&m_type, &input[end_idx]) {
93                end_idx += 1;
94            }
95            clusters.push(Definition::new(m_type, &input[start_idx..end_idx]));
96            idx = end_idx;
97        }
98
99        clusters
100    }
101}
102
103struct Cluster<'a> {
104    pub defs: Vec<Definition<'a>>,
105    pub whitespace: Option<&'a u16>,
106}
107impl<'a> Cluster<'a> {
108    fn new(defs: Vec<Definition<'a>>, whitespace: Option<&'a u16>) -> Self {
109        Self { defs, whitespace }
110    }
111
112    fn build_clusters(defs: &'a [Definition<'a>]) -> Vec<Cluster<'a>> {
113        let mut clusters = Vec::new();
114
115        let mut def_idx = 0;
116        for idx in 0..defs.len() {
117            if defs[idx].m_type == MType::WS || defs[idx].m_type == MType::NJ {
118                clusters.push(Cluster::new(defs[def_idx..idx].to_vec(), Some(&defs[idx].code[0])));
119                def_idx = idx + 1;
120            }
121        }
122        // store last
123        if def_idx < defs.len() {
124            clusters.push(Cluster::new(defs[def_idx..].to_vec(), None));
125        }
126
127        clusters
128    }
129
130    /// Once the Buginese shaping engine has analyzed the run into
131    /// clusters as described above, it performs any required reordering.
132    /// Pre-base vowels (VPre) are reordered to the start of the syllable
133    /// cluster. A sequence of multiple pre-base vowels is permitted.
134    /// Such sequences are moved as a block to the beginning of the cluster.
135    /// In the following example, the run of code points represents a
136    /// single cluster.
137    fn get_sorted(&mut self) -> Vec<u16> {
138        // sort
139        let mut idx: usize = 0;
140        while idx < self.defs.len() {
141            match self.defs[idx].m_type {
142                MType::VPre => {
143                    // Pre-base vowels (VPre) are reordered to the start of the syllable cluster.
144                    let v_pre = self.defs.remove(idx);
145                    self.defs.insert(0, v_pre);
146                }
147                MType::VAbv | MType::VBlw | MType::VPst => {
148                    // // always put the head position consonant infront of the head letter
149                    // let mut head_idx = idx;
150                    // while head_idx > 0 && self.defs[head_idx].m_type != MType::C && self.defs[head_idx].m_type != MType::GB { head_idx -= 1; }
151                    // let vowel_sign = self.defs.remove(idx);
152                    // self.defs.insert(head_idx, vowel_sign);
153                }
154                _ => {}
155            }
156            idx += 1;
157        }
158
159        // store
160        let mut reordered = Vec::with_capacity(self.defs.len());
161        for def in &self.defs {
162            reordered.extend_from_slice(def.code)
163        }
164
165        reordered
166    }
167}
168
169/// Shape/Reordering characters
170/// The shaping engine inserts a placeholder glyph (U+25CC) wherever
171/// combining marks occur without a valid base. The character U+25CC
172/// belongs to the class of generic bases (GB). Well-formed Buginese
173/// character clusters are defined as follows:
174///
175/// Cases:
176/// 1) Simple non-compounding cluster: < S | Rsv | WS | O | J | WJ >
177/// 2) Clusters:                       < C | GB > [VS] (VPre)* (VAbv)* (VBlv)* (VPst)* [J]
178///
179/// Ex. ᨔᨗᨔᨗᨊᨗᨊ
180pub fn shape_buginese(input: &mut [u16]) {
181    let mut res: Vec<u16> = Vec::with_capacity(input.len());
182    // Step 1: Convert input to clusters
183    let defs = Definition::build_definition(input);
184    // Step 2: Split clusters by WS (white space)
185    let mut clusters_sets = Cluster::build_clusters(&defs);
186    // Step 3: Reorder the clusters and add them to result
187    clusters_sets.iter_mut().for_each(|c| {
188        res.append(&mut c.get_sorted());
189        // append whitespace of cluster if it exists
190        if let Some(ws) = c.whitespace {
191            res.push(*ws);
192        }
193    });
194
195    // now map the result to the original input
196    input.copy_from_slice(&res[..input.len()]);
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Shapes a copy of `units` and hands back the reordered copy.
    fn shaped(units: &[u16]) -> Vec<u16> {
        let mut buffer = units.to_vec();
        shape_buginese(&mut buffer);
        buffer
    }

    /// Encodes `text` as UTF-16 and shapes it.
    fn shaped_text(text: &str) -> Vec<u16> {
        let units: Vec<u16> = text.encode_utf16().collect();
        shaped(&units)
    }

    // A consonant followed by pre-base vowels (with joiners) and an
    // above-base vowel: the pre-base block moves in front of the consonant.
    #[test]
    fn buginese_test() {
        let expected: [u16; 6] = [0x1A19, 0x034F, 0x1A19, 0x034F, 0x1A00, 0x1A17];
        let result = shaped(&[0x1A00, 0x1A19, 0x034F, 0x1A19, 0x034F, 0x1A17]);
        assert_eq!(result, expected);
    }

    // Only consonants and above-base vowels: nothing moves.
    #[test]
    fn buginese_2_test() {
        let expected: [u16; 7] = [0x1A14, 0x1A17, 0x1A14, 0x1A17, 0x1A0A, 0x1A17, 0x1A0A];
        let result = shaped_text("ᨔᨗᨔᨗᨊᨗᨊ");
        assert_eq!(result, expected);
    }

    // Two words separated by a space: the space is kept and nothing moves.
    #[test]
    fn buginese_3_test() {
        let expected: [u16; 9] = [
            0x1A11, 0x1A17, 0x0020, 0x1A0D, 0x1A0D, 0x1A17, 0x1A15, 0x1A02, 0x1A17,
        ];
        let result = shaped_text("ᨑᨗ ᨍᨍᨗᨕᨂᨗ");
        assert_eq!(result, expected);
    }
}
237}