unicode_shaper/shape/buginese.rs
1// https://www.unicode.org/charts/PDF/U1A00.pdf
2// https://r12a.github.io/scripts/bugi/bug.html
3
4use crate::shape::*;
5use alloc::vec::Vec;
6
/// Returns `true` when the UTF-16 code unit `c` falls inside the main
/// Buginese block, U+1A00 through U+1A1F (both ends included).
pub fn is_buginese(c: &u16) -> bool {
    (0x1A00..=0x1A1F).contains(c)
}
12
/// Shaping class assigned to a single UTF-16 code unit of a Buginese run.
#[derive(Debug, Clone, PartialEq)]
enum MType {
    /// Consonants (1A00–1A16)
    C,
    /// Generic base characters (00A0, 00D7, 2012–2015, 2022, 25CC, 25FB–25FE)
    GB,
    /// Joiners: 200D ZWJ (Zero Width Joiner) and 034F CGJ (Combining Grapheme Joiner)
    J,
    /// Anything not covered by another class (SCRIPT_COMMON characters in a Buginese run)
    O,
    /// Reserved characters from the Buginese block (1A1C, 1A1D)
    R,
    /// Symbols (1A1E, 1A1F, A9CF)
    S,
    /// Above-base dependent vowels (1A17, 1A1B)
    VAbv,
    /// Below-base dependent vowel (1A18)
    VBlw,
    /// Pre-base dependent vowel (1A19)
    VPre,
    /// Post-base dependent vowel (1A1A)
    VPst,
    /// Variation selectors (FE00–FE0F)
    VS,
    /// Word joiner (2060)
    WJ,
    /// Non-joiner: 200C ZWNJ (Zero Width Non-Joiner)
    NJ,
    /// Whitespace, as identified by the `WHITESPACE` constant
    WS,
}

impl MType {
    /// Classifies one UTF-16 code unit.
    ///
    /// Every arm ahead of the `WHITESPACE` arm covers a disjoint set of
    /// literals, so their relative order does not matter. The `WHITESPACE`
    /// arm is deliberately kept after all of them and right before the
    /// `O` fallback.
    fn from_u16(c: &u16) -> MType {
        match *c {
            // --- Buginese block -------------------------------------------
            // Consonants
            0x1A00..=0x1A16 => Self::C,
            // Above-base dependent vowels
            0x1A17 | 0x1A1B => Self::VAbv,
            // Below-base dependent vowel
            0x1A18 => Self::VBlw,
            // Pre-base dependent vowel
            0x1A19 => Self::VPre,
            // Post-base dependent vowel
            0x1A1A => Self::VPst,
            // Reserved code points
            0x1A1C | 0x1A1D => Self::R,
            // Symbols (A9CF lives outside the Buginese block)
            0x1A1E | 0x1A1F | 0xA9CF => Self::S,
            // --- Characters shared with other scripts ---------------------
            // Generic bases
            0x00A0 | 0x00D7 | 0x2012..=0x2015 | 0x2022 | 0x25CC | 0x25FB..=0x25FE => Self::GB,
            // Joiners: ZWJ and CGJ (ZWNJ is classified separately below)
            0x200D | 0x034F => Self::J,
            // Zero Width Non-Joiner
            0x200C => Self::NJ,
            // Word joiner
            0x2060 => Self::WJ,
            // Variation selectors
            0xFE00..=0xFE0F => Self::VS,
            // Whitespace, matched through the `WHITESPACE` constant brought in
            // by `crate::shape::*`.
            // NOTE(review): earlier comments listed 0020, 0009, 000A, 000D,
            // 000C, 0085, 3000 and 200B here, but a single constant pattern
            // matches exactly one value — confirm against the definition of
            // `WHITESPACE`.
            WHITESPACE => Self::WS,
            _ => Self::O,
        }
    }

    /// Tells whether `other` may be appended to a run of class `self`:
    /// either it classifies to the same class, or it is a joiner (`J`),
    /// which attaches to whatever precedes it.
    fn is_same(&self, other: &u16) -> bool {
        let candidate = MType::from_u16(other);
        matches!(candidate, MType::J) || candidate == *self
    }
}
69
70#[derive(Clone)]
71struct Definition<'a> {
72 // cluster definition
73 m_type: MType,
74 // reference to a slice of an original string
75 code: &'a [u16],
76}
77impl<'a> Definition<'a> {
78 fn new(m_type: MType, code: &'a [u16]) -> Self {
79 Self { m_type, code }
80 }
81
82 fn build_definition(input: &[u16]) -> Vec<Definition> {
83 let mut clusters = Vec::new();
84
85 let mut idx: usize = 0;
86 while idx < input.len() {
87 let start_idx = idx;
88 let code = &input[idx];
89 let m_type = MType::from_u16(code);
90 let mut end_idx = idx + 1;
91 // while we find joiners and the same type, continue
92 while end_idx < input.len() && MType::is_same(&m_type, &input[end_idx]) {
93 end_idx += 1;
94 }
95 clusters.push(Definition::new(m_type, &input[start_idx..end_idx]));
96 idx = end_idx;
97 }
98
99 clusters
100 }
101}
102
103struct Cluster<'a> {
104 pub defs: Vec<Definition<'a>>,
105 pub whitespace: Option<&'a u16>,
106}
107impl<'a> Cluster<'a> {
108 fn new(defs: Vec<Definition<'a>>, whitespace: Option<&'a u16>) -> Self {
109 Self { defs, whitespace }
110 }
111
112 fn build_clusters(defs: &'a [Definition<'a>]) -> Vec<Cluster<'a>> {
113 let mut clusters = Vec::new();
114
115 let mut def_idx = 0;
116 for idx in 0..defs.len() {
117 if defs[idx].m_type == MType::WS || defs[idx].m_type == MType::NJ {
118 clusters.push(Cluster::new(defs[def_idx..idx].to_vec(), Some(&defs[idx].code[0])));
119 def_idx = idx + 1;
120 }
121 }
122 // store last
123 if def_idx < defs.len() {
124 clusters.push(Cluster::new(defs[def_idx..].to_vec(), None));
125 }
126
127 clusters
128 }
129
130 /// Once the Buginese shaping engine has analyzed the run into
131 /// clusters as described above, it performs any required reordering.
132 /// Pre-base vowels (VPre) are reordered to the start of the syllable
133 /// cluster. A sequence of multiple pre-base vowels is permitted.
134 /// Such sequences are moved as a block to the beginning of the cluster.
135 /// In the following example, the run of code points represents a
136 /// single cluster.
137 fn get_sorted(&mut self) -> Vec<u16> {
138 // sort
139 let mut idx: usize = 0;
140 while idx < self.defs.len() {
141 match self.defs[idx].m_type {
142 MType::VPre => {
143 // Pre-base vowels (VPre) are reordered to the start of the syllable cluster.
144 let v_pre = self.defs.remove(idx);
145 self.defs.insert(0, v_pre);
146 }
147 MType::VAbv | MType::VBlw | MType::VPst => {
148 // // always put the head position consonant infront of the head letter
149 // let mut head_idx = idx;
150 // while head_idx > 0 && self.defs[head_idx].m_type != MType::C && self.defs[head_idx].m_type != MType::GB { head_idx -= 1; }
151 // let vowel_sign = self.defs.remove(idx);
152 // self.defs.insert(head_idx, vowel_sign);
153 }
154 _ => {}
155 }
156 idx += 1;
157 }
158
159 // store
160 let mut reordered = Vec::with_capacity(self.defs.len());
161 for def in &self.defs {
162 reordered.extend_from_slice(def.code)
163 }
164
165 reordered
166 }
167}
168
169/// Shape/Reordering characters
170/// The shaping engine inserts a placeholder glyph (U+25CC) wherever
171/// combining marks occur without a valid base. The character U+25CC
172/// belongs to the class of generic bases (GB). Well-formed Buginese
173/// character clusters are defined as follows:
174///
175/// Cases:
176/// 1) Simple non-compounding cluster: < S | Rsv | WS | O | J | WJ >
177/// 2) Clusters: < C | GB > [VS] (VPre)* (VAbv)* (VBlv)* (VPst)* [J]
178///
179/// Ex. ᨔᨗᨔᨗᨊᨗᨊ
180pub fn shape_buginese(input: &mut [u16]) {
181 let mut res: Vec<u16> = Vec::with_capacity(input.len());
182 // Step 1: Convert input to clusters
183 let defs = Definition::build_definition(input);
184 // Step 2: Split clusters by WS (white space)
185 let mut clusters_sets = Cluster::build_clusters(&defs);
186 // Step 3: Reorder the clusters and add them to result
187 clusters_sets.iter_mut().for_each(|c| {
188 res.append(&mut c.get_sorted());
189 // append whitespace of cluster if it exists
190 if let Some(ws) = c.whitespace {
191 res.push(*ws);
192 }
193 });
194
195 // now map the result to the original input
196 input.copy_from_slice(&res[..input.len()]);
197}
198
#[cfg(test)]
mod tests {
    use super::*;

    /// Shapes a copy of `units` and hands back the shaped copy.
    fn shaped(units: &[u16]) -> Vec<u16> {
        let mut buffer = units.to_vec();
        shape_buginese(&mut buffer);
        buffer
    }

    /// Encodes `text` as UTF-16 and shapes it.
    fn shaped_text(text: &str) -> Vec<u16> {
        let units: Vec<u16> = text.encode_utf16().collect();
        shaped(&units)
    }

    // A block of pre-base vowels (with joiners) moves in front of the consonant.
    #[test]
    fn buginese_test() {
        let input: &[u16] = &[0x1A00, 0x1A19, 0x034F, 0x1A19, 0x034F, 0x1A17];
        let expected: &[u16] = &[0x1A19, 0x034F, 0x1A19, 0x034F, 0x1A00, 0x1A17];
        assert_eq!(shaped(input), expected);
    }

    // Text without pre-base vowels comes back unchanged.
    #[test]
    fn buginese_2_test() {
        let expected: &[u16] = &[0x1A14, 0x1A17, 0x1A14, 0x1A17, 0x1A0A, 0x1A17, 0x1A0A];
        assert_eq!(shaped_text("ᨔᨗᨔᨗᨊᨗᨊ"), expected);
    }

    // A space splits the text into two clusters and is preserved in place.
    #[test]
    fn buginese_3_test() {
        let expected: &[u16] = &[
            0x1A11, 0x1A17, 0x0020, 0x1A0D, 0x1A0D, 0x1A17, 0x1A15, 0x1A02, 0x1A17,
        ];
        assert_eq!(shaped_text("ᨑᨗ ᨍᨍᨗᨕᨂᨗ"), expected);
    }
}
237}