seshat/unicode/
normalization.rs

1use crate::unicode::CodePoint;
2use crate::unicode::Ucd;
3use crate::unicode::props::Gc;
4use crate::unicode::props::Dt;
5
6pub(crate) fn starter(cp: u32) -> bool {
7    let cp = CodePoint::new(cp).unwrap();
8    let cp_ccc = cp.ccc();
9    let cp_gc = cp.gc();
10
11    if cp_gc == Gc::Mn {
12        if cp_ccc as u8 == 0 {
13            return true;
14        } else if cp_ccc as u8 > 0 {
15            return false;
16        }
17    }
18
19    if cp_gc == Gc::Mc {
20        if cp_ccc as u8 == 0 {
21            return true;
22        } else if cp_ccc as u8 > 0 {
23            return false;
24        }
25    }
26
27    if cp_gc == Gc::Me {
28        if cp_ccc as u8 == 0 {
29            return true;
30        }
31    }
32
33    if cp_ccc as u8 == 0 {
34        return true;
35    }
36
37    false
38}
39
40pub(super) fn reorderable_pair(pair: (u32, u32)) -> bool {
41    let a = CodePoint::new(pair.0).unwrap();
42    let b = CodePoint::new(pair.1).unwrap();
43    a.ccc() as u8 > b.ccc() as u8 && b.ccc() as u8 > 0
44}
45
46pub(super) fn canonical_ordering(sequence: &mut Vec<char>) {
47    // This ordering algorithm acts like a bubble sort.
48    if sequence.len() == 0 {
49        return ();
50    }
51
52    let mut last_idx = sequence.len() - 1;
53    while last_idx > 0 {
54        for i in 0..=last_idx {
55            if i + 1 == last_idx + 1 {
56                continue;
57            }
58            if reorderable_pair((sequence[i] as u32, sequence[i + 1] as u32)) {
59                sequence.swap(i, i + 1);
60            }
61        }
62        last_idx -= 1;
63    }
64}
65
66pub(super) fn compatibility_decomposition(s: &Vec<char>) -> Vec<char> {
67    let mut count = 0;
68    let mut decomposed = vec![];
69    for ch in s.iter() {
70        if ch.dm() == "" {
71            decomposed.push(*ch);
72        } else {
73            match ch.dt() {
74                Dt::None => {
75                    decomposed.push(*ch);
76                }
77                _ => {
78                    for decomposed_char in ch.dm().chars() {
79                        decomposed.push(decomposed_char);
80                        count += 1;
81                    }
82                }
83            }
84        }
85    }
86    if count == 0 {
87        return decomposed;
88    }
89    compatibility_decomposition(&decomposed)
90}
91
92pub(super) fn canonical_decomposition(s: Vec<char>) -> Vec<char>{
93    let mut count = 0;
94    let mut decomposed = vec![];
95    for ch in s.iter() {
96        if ch.dm() == "" {
97            decomposed.push(*ch);
98        } else {
99            if ch.dt() == Dt::Can {
100                for decomposed_char in ch.dm().chars() {
101                    decomposed.push(decomposed_char);
102                    count += 1;
103                }
104            } else {
105                decomposed.push(*ch);
106            }
107        }
108    }
109    if count == 0 {
110        return decomposed;
111    }
112    canonical_decomposition(decomposed)
113}
114
115pub(crate) fn singleton_decomposition(cp: u32) -> bool {
116    let code_point = CodePoint::new(cp).unwrap();
117    // Default value (the code point itself) is not singletons.
118    let mut self_char = String::new();
119    self_char.push(std::char::from_u32(code_point.to_u32()).unwrap());
120    if code_point.dm() == self_char {
121        return false;
122    }
123    // Single character with canonical decomposition is singleton.
124    if code_point.dm().chars().collect::<Vec<char>>().len() == 1 {
125        if code_point.dt() != Dt::Can {
126            return false;
127        }
128        return true;
129    }
130
131    false
132}
133
134// D111     Non-starter decomposition: An expanding canonical decomposition which is not
135//          a starter decomposition.
136pub(crate) fn non_starter_decomposition(cp: u32) -> bool {
137    let code_point = CodePoint::new(cp).unwrap();
138    let decomposed = canonical_decomposition(
139        vec![std::char::from_u32(code_point.to_u32()).unwrap()]
140    );
141    if decomposed.len() > 1 && !starter(decomposed[0] as u32) {
142        return true;
143    }
144
145    false
146}
147
148// D114
149fn primary_composite(cp: u32) -> bool {
150    let code_point = CodePoint::new(cp).unwrap();
151    if code_point.dt() == Dt::Can && !code_point.comp_ex() {
152        return true;
153    }
154
155    false
156}
157
158// D115
159fn blocked(sequence: &[char]) -> bool {
160    if sequence[0].ccc() as u8 != 0 {
161        return false;
162    }
163    let first_i = 0;
164    let last_i = sequence.len() - 1;
165    if last_i - 1 == first_i {
166        return false;
167    }
168    if sequence[last_i - 1].ccc() as u8 == 0
169        || sequence[last_i - 1].ccc() as u8 >= sequence[last_i].ccc() as u8
170    {
171        return true;
172    }
173
174    false
175}
176
177// D117
178pub(super) fn canonical_composition(s: &mut Vec<char>) {
179    if s.len() == 1 {
180        return ();
181    }
182
183    let mut offset = 1;
184    while offset < s.len() {
185        let i = offset;
186        let mut back_i = i - 1;
187        // R1 - Seek back (left) in the coded character sequence from the
188        // character C to find the last Starter L preceding C in the
189        // character sequence.
190        while back_i != 0 && !starter(s[back_i] as u32) {
191            back_i -= 1;
192        }
193        // R2 - If there is such an L, and C is not blocked from L, and there
194        // exists a Primary Composite P which is canonically equivalent to the
195        // sequence <L, C>, then replace L by P in the sequence and delete C
196        // from the sequence.
197        let mut lc = String::new();
198        lc.push(s[back_i]);
199        lc.push(s[i]);
200
201        let mapping = crate::unicode::ucd::dm::rdm(&lc);
202        let is_primary_composite = primary_composite(mapping);
203        if (starter(s[back_i] as u32)
204            && !blocked(&s[back_i..=i]))
205            && (mapping != 0x0 && is_primary_composite)
206        {
207            s[back_i] = std::char::from_u32(mapping).unwrap();
208            s.remove(i);
209            offset -= 1;
210        }
211        offset += 1;
212    }
213}
214
215pub(crate) fn nfd(s: &str) -> Vec<char> {
216    let seq = s.chars().collect::<Vec<char>>();
217    let mut seq = canonical_decomposition(seq);
218    canonical_ordering(&mut seq);
219
220    seq
221}
222
223pub(crate) fn nfkd(s: &str) -> Vec<char> {
224    let seq = s.chars().collect::<Vec<char>>();
225    let mut seq = compatibility_decomposition(&seq);
226    canonical_ordering(&mut seq);
227
228    seq
229}
230
231pub(crate) fn nfc(s: &str) -> Vec<char> {
232    let mut seq = nfd(s);
233    canonical_composition(&mut seq);
234
235    seq
236}
237
238pub(crate) fn nfkc(s: &str) -> Vec<char> {
239    let mut seq = nfkd(s);
240    canonical_composition(&mut seq);
241
242    seq
243}
244
// Unit tests for the normalization helpers; compiled only for test builds.
#[cfg(test)]
mod tests {
    /// Combining marks after a starter are sorted by combining class, marks
    /// with equal class keep their relative order, and the starters stay put.
    #[test]
    fn test_canonical_ordering() {
        let mut s1 = vec!['a', '\u{0305}', '\u{0315}', '\u{0300}', '\u{05AE}', 'b'];
        super::canonical_ordering(&mut s1);
        assert_eq!(
            s1,
            vec!['a', '\u{05AE}', '\u{0305}', '\u{0300}', '\u{0315}', 'b']
        );
    }

    /// Adjacent characters are never blocked; a mark is blocked when the mark
    /// before it has an equal combining class.
    #[test]
    fn test_blocked() {
        let s1 = &['A', 'B'];
        assert!(!super::blocked(s1));
        assert!(super::blocked(&['a', '\u{05AE}', '\u{0305}', '\u{0300}']));
    }
}