seshat/unicode/
normalization.rs1use crate::unicode::CodePoint;
2use crate::unicode::Ucd;
3use crate::unicode::props::Gc;
4use crate::unicode::props::Dt;
5
6pub(crate) fn starter(cp: u32) -> bool {
7 let cp = CodePoint::new(cp).unwrap();
8 let cp_ccc = cp.ccc();
9 let cp_gc = cp.gc();
10
11 if cp_gc == Gc::Mn {
12 if cp_ccc as u8 == 0 {
13 return true;
14 } else if cp_ccc as u8 > 0 {
15 return false;
16 }
17 }
18
19 if cp_gc == Gc::Mc {
20 if cp_ccc as u8 == 0 {
21 return true;
22 } else if cp_ccc as u8 > 0 {
23 return false;
24 }
25 }
26
27 if cp_gc == Gc::Me {
28 if cp_ccc as u8 == 0 {
29 return true;
30 }
31 }
32
33 if cp_ccc as u8 == 0 {
34 return true;
35 }
36
37 false
38}
39
40pub(super) fn reorderable_pair(pair: (u32, u32)) -> bool {
41 let a = CodePoint::new(pair.0).unwrap();
42 let b = CodePoint::new(pair.1).unwrap();
43 a.ccc() as u8 > b.ccc() as u8 && b.ccc() as u8 > 0
44}
45
46pub(super) fn canonical_ordering(sequence: &mut Vec<char>) {
47 if sequence.len() == 0 {
49 return ();
50 }
51
52 let mut last_idx = sequence.len() - 1;
53 while last_idx > 0 {
54 for i in 0..=last_idx {
55 if i + 1 == last_idx + 1 {
56 continue;
57 }
58 if reorderable_pair((sequence[i] as u32, sequence[i + 1] as u32)) {
59 sequence.swap(i, i + 1);
60 }
61 }
62 last_idx -= 1;
63 }
64}
65
66pub(super) fn compatibility_decomposition(s: &Vec<char>) -> Vec<char> {
67 let mut count = 0;
68 let mut decomposed = vec![];
69 for ch in s.iter() {
70 if ch.dm() == "" {
71 decomposed.push(*ch);
72 } else {
73 match ch.dt() {
74 Dt::None => {
75 decomposed.push(*ch);
76 }
77 _ => {
78 for decomposed_char in ch.dm().chars() {
79 decomposed.push(decomposed_char);
80 count += 1;
81 }
82 }
83 }
84 }
85 }
86 if count == 0 {
87 return decomposed;
88 }
89 compatibility_decomposition(&decomposed)
90}
91
92pub(super) fn canonical_decomposition(s: Vec<char>) -> Vec<char>{
93 let mut count = 0;
94 let mut decomposed = vec![];
95 for ch in s.iter() {
96 if ch.dm() == "" {
97 decomposed.push(*ch);
98 } else {
99 if ch.dt() == Dt::Can {
100 for decomposed_char in ch.dm().chars() {
101 decomposed.push(decomposed_char);
102 count += 1;
103 }
104 } else {
105 decomposed.push(*ch);
106 }
107 }
108 }
109 if count == 0 {
110 return decomposed;
111 }
112 canonical_decomposition(decomposed)
113}
114
115pub(crate) fn singleton_decomposition(cp: u32) -> bool {
116 let code_point = CodePoint::new(cp).unwrap();
117 let mut self_char = String::new();
119 self_char.push(std::char::from_u32(code_point.to_u32()).unwrap());
120 if code_point.dm() == self_char {
121 return false;
122 }
123 if code_point.dm().chars().collect::<Vec<char>>().len() == 1 {
125 if code_point.dt() != Dt::Can {
126 return false;
127 }
128 return true;
129 }
130
131 false
132}
133
134pub(crate) fn non_starter_decomposition(cp: u32) -> bool {
137 let code_point = CodePoint::new(cp).unwrap();
138 let decomposed = canonical_decomposition(
139 vec![std::char::from_u32(code_point.to_u32()).unwrap()]
140 );
141 if decomposed.len() > 1 && !starter(decomposed[0] as u32) {
142 return true;
143 }
144
145 false
146}
147
148fn primary_composite(cp: u32) -> bool {
150 let code_point = CodePoint::new(cp).unwrap();
151 if code_point.dt() == Dt::Can && !code_point.comp_ex() {
152 return true;
153 }
154
155 false
156}
157
158fn blocked(sequence: &[char]) -> bool {
160 if sequence[0].ccc() as u8 != 0 {
161 return false;
162 }
163 let first_i = 0;
164 let last_i = sequence.len() - 1;
165 if last_i - 1 == first_i {
166 return false;
167 }
168 if sequence[last_i - 1].ccc() as u8 == 0
169 || sequence[last_i - 1].ccc() as u8 >= sequence[last_i].ccc() as u8
170 {
171 return true;
172 }
173
174 false
175}
176
177pub(super) fn canonical_composition(s: &mut Vec<char>) {
179 if s.len() == 1 {
180 return ();
181 }
182
183 let mut offset = 1;
184 while offset < s.len() {
185 let i = offset;
186 let mut back_i = i - 1;
187 while back_i != 0 && !starter(s[back_i] as u32) {
191 back_i -= 1;
192 }
193 let mut lc = String::new();
198 lc.push(s[back_i]);
199 lc.push(s[i]);
200
201 let mapping = crate::unicode::ucd::dm::rdm(&lc);
202 let is_primary_composite = primary_composite(mapping);
203 if (starter(s[back_i] as u32)
204 && !blocked(&s[back_i..=i]))
205 && (mapping != 0x0 && is_primary_composite)
206 {
207 s[back_i] = std::char::from_u32(mapping).unwrap();
208 s.remove(i);
209 offset -= 1;
210 }
211 offset += 1;
212 }
213}
214
215pub(crate) fn nfd(s: &str) -> Vec<char> {
216 let seq = s.chars().collect::<Vec<char>>();
217 let mut seq = canonical_decomposition(seq);
218 canonical_ordering(&mut seq);
219
220 seq
221}
222
223pub(crate) fn nfkd(s: &str) -> Vec<char> {
224 let seq = s.chars().collect::<Vec<char>>();
225 let mut seq = compatibility_decomposition(&seq);
226 canonical_ordering(&mut seq);
227
228 seq
229}
230
231pub(crate) fn nfc(s: &str) -> Vec<char> {
232 let mut seq = nfd(s);
233 canonical_composition(&mut seq);
234
235 seq
236}
237
238pub(crate) fn nfkc(s: &str) -> Vec<char> {
239 let mut seq = nfkd(s);
240 canonical_composition(&mut seq);
241
242 seq
243}
244
// Unit tests for the ordering and blocking helpers; compiled only for test
// builds.
#[cfg(test)]
mod tests {
    use super::{blocked, canonical_ordering};

    /// Combining marks after 'a' must end up sorted by combining class while
    /// the surrounding base letters stay where they are.
    #[test]
    fn test_canonical_ordering() {
        let mut s1 = vec!['a', '\u{0305}', '\u{0315}', '\u{0300}', '\u{05AE}', 'b'];
        canonical_ordering(&mut s1);
        assert_eq!(
            s1,
            vec!['a', '\u{05AE}', '\u{0305}', '\u{0300}', '\u{0315}', 'b']
        );
    }

    /// Two adjacent characters never block; a mark preceded by another mark
    /// of equal combining class does.
    #[test]
    fn test_blocked() {
        let s1 = &['A', 'B'];
        assert!(!blocked(s1));
        assert!(blocked(&['a', '\u{05AE}', '\u{0305}', '\u{0300}']));
    }
}