unicode_shaper/shape/
mod.rs

1/// Arabic unicode shaping
2pub mod arabic;
3/// Buginese unicode shaping
4pub mod buginese;
5/// CJK (Chinese, Japanese, or Korean) unicode shaping
6pub mod cjk;
7mod internal;
8/// Javanese unicode shaping
9pub mod javanese;
10/// Khmer unicode shaping
11pub mod khmer;
12/// Myanmar unicode shaping
13pub mod myanmar;
14mod shared;
15/// Tamil unicode shaping
16pub mod tamil;
17/// Thai unicode shaping
18pub mod thai;
19/// Tibetan unicode shaping
20pub mod tibetan;
21
22use crate::*;
23use alloc::vec::Vec;
24pub use arabic::*;
25pub use buginese::*;
26pub use cjk::*;
27use internal::*;
28pub use javanese::*;
29pub use khmer::*;
30pub use myanmar::*;
31pub use tamil::*;
32pub use tibetan::*;
33
34/// Converts an Arabic Unicode buffer in 06xx Range into a shaped
35/// arabic Unicode buffer in FExx Range
36pub fn shape_unicode(source: &[u16], options: &u32) -> Vec<u16> {
37    let mut output = source.to_vec();
38
39    // all other shaping
40    if options & U_SHAPE_LETTERS_MASK != 0 {
41        // arabic shaping
42        output = shape_arabic(&output, options);
43        // Buginese shaping
44        shape_buginese(&mut output);
45        // Javanese shaping
46        shape_javanese(&mut output);
47        // Myanmar shaping
48        shape_myanmar(&mut output);
49        // Tamil shaping
50        shape_tamil(&mut output);
51        // Tibetan shaping
52        shape_tibetan(&mut output);
53        // khmer
54        shape_khmer(&mut output);
55    }
56
57    // if option to process bidirectional text is set, then reorder the output
58    if (options & U_SHAPE_DIRECTION_OUTPUT_BIDI) != 0 {
59        return process_bidi_text(&output);
60    }
61
62    output
63}
64
#[cfg(test)]
mod tests {
    use super::*;
    use alloc::{string::String, vec::Vec};

    /// Options shared by every test: letter shaping enabled, logical input order,
    /// and bidi reordering of the output.
    const DEFAULT_OPTIONS: u32 = (U_SHAPE_LETTERS_SHAPE & U_SHAPE_LETTERS_MASK)
        | (U_SHAPE_TEXT_DIRECTION_LOGICAL & U_SHAPE_TEXT_DIRECTION_MASK)
        | U_SHAPE_DIRECTION_OUTPUT_BIDI;

    /// Returns `input` with its `char`s in reverse order.
    fn reverse_string(input: &str) -> String {
        input.chars().rev().collect()
    }

    /// Encodes `text` as a vector of UTF-16 code units.
    fn to_utf16(text: &str) -> Vec<u16> {
        text.encode_utf16().collect()
    }

    /// Runs `shape_unicode` over `units` with `DEFAULT_OPTIONS`.
    fn shape_with_defaults(units: &[u16]) -> Vec<u16> {
        shape_unicode(units, &DEFAULT_OPTIONS)
    }

    #[test]
    fn basic_string() {
        // Plain Latin text must pass through shaping and bidi reordering untouched.
        let units = to_utf16("normal latin text");
        let shaped = shape_with_defaults(&units);
        assert_eq!(shaped.as_slice(), units.as_slice());
    }

    #[test]
    fn arabic_string() {
        // Arabic letters followed by digits; the expected text holds the shaped
        // (presentation-form) letters in display order with the digits first.
        let input = "سلام۳۹";
        let expected = "۳۹ﻡﻼﺳ";
        let input_units = to_utf16(input);
        let expected_units = to_utf16(expected);
        let shaped = shape_with_defaults(&input_units);
        assert_ne!(shaped.as_slice(), input_units.as_slice());
        assert_eq!(shaped.as_slice(), expected_units.as_slice());
    }

    #[test]
    fn hebrew_string() {
        // Pure right-to-left text: the output is expected to be the input with its
        // characters in reverse order.
        let input = "ישראל";
        let expected_units = to_utf16(&reverse_string(input));
        let shaped = shape_with_defaults(&to_utf16(input));
        assert_eq!(shaped.as_slice(), expected_units.as_slice());
    }

    // NOTE(review): the Khmer case below is disabled in the original source and the
    // reason is not recorded here — confirm why before re-enabling it.
    // #[test]
    // fn khmer_test() {
    //     // Create a Rust string
    //     let input = "ព្រ"; // 6038, 6098, 6042
    //     let expected: &[u16] = &[6098, 6042, 6038];
    //     // Encode the string as UTF-16 and obtain a slice of u16 values
    //     let input_utf16_slice: Vec<u16> = input.encode_utf16().collect();
    //     // Create a reference to the slice
    //     let input_utf16_ref: &[u16] = &input_utf16_slice;
    //     let result: &[u16] = &shape_unicode(input_utf16_ref, &DEFAULT_OPTIONS);
    //     assert_eq!(result, expected);
    // }

    #[test]
    fn hebrew_degesh_test() {
        // A Hebrew letter carrying a combining point; the expected output lists the
        // point (1468) before the letter (1489).
        let input = "בּ";
        let expected: &[u16] = &[1468, 1489];
        let shaped = shape_with_defaults(&to_utf16(input));
        assert_eq!(shaped.as_slice(), expected);
    }

    #[test]
    fn myanmar_test() {
        // Input code units (decimal):
        // 4100, 4154, 4153, 4096, 4153, 4096, 4155, 4156, 4157, 4158, 4145, 4141, 4143,
        // 4151, 4154, 4140, 4158, 4142, 4151, 4196, 4146, 4150, 4151, 4152, 4237
        let input = "င်္က္ကျြွှေို့်ာှီ့ၤဲံ့းႍ";
        let expected: &[u16] = &[
            4145, 4156, 4096, 4100, 4154, 4153, 4153, 4096, 4155, 4157, 4158, 4141, 4143, 4151,
            4154, 4140, 4158, 4142, 4151, 4196, 4146, 4150, 4151, 4152, 4237,
        ];
        let shaped = shape_with_defaults(&to_utf16(input));
        assert_eq!(shaped.as_slice(), expected);
    }

    #[test]
    fn myanmar_complex_2_test() {
        // Same kind of cluster as `myanmar_test`, given directly as code units.
        let input: &[u16] = &[
            0x1004, 0x103A, 0x1039, 0x1000, 0x1039, 0x1000, 0x103B, 0x103C, 0x103D, 0x1031, 0x1031,
            0x102D, 0x102F, 0x1036, 0x102C, 0x1036,
        ];
        let expected: &[u16] = &[
            0x1031, 0x1031, 0x103C, 0x1000, 0x1004, 0x103A, 0x1039, 0x1039, 0x1000, 0x103B, 0x103D,
            0x102D, 0x1036, 0x102F, 0x102C, 0x1036,
        ];
        let shaped = shape_with_defaults(input);
        assert_eq!(shaped.as_slice(), expected);
    }

    #[test]
    fn tibetan_test() {
        let input = "བོད་རང་སྐྱོང་ལྗོངས།";
        let expected: &[u16] = &[
            3964, 3926, 3921, 3851, 3938, 3908, 3851, 3964, 3942, 3984, 4017, 3908, 3851, 3964,
            3939, 3991, 3908, 3942, 3853,
        ];
        let shaped = shape_with_defaults(&to_utf16(input));
        assert_eq!(shaped.as_slice(), expected);
    }

    #[test]
    fn buginese_test() {
        let input = "ᨑᨗ ᨍᨍᨗᨕᨂᨗ";
        let expected: &[u16] = &[6673, 6679, 32, 6669, 6669, 6679, 6677, 6658, 6679];
        let shaped = shape_with_defaults(&to_utf16(input));
        assert_eq!(shaped.as_slice(), expected);
    }

    #[test]
    fn javanese_test() {
        let input = "ꦧꦺꦲꦏ꧀ꦠꦸꦩꦿꦥ꧀ꦲ";
        let expected: &[u16] =
            &[43450, 43431, 43442, 43407, 43456, 43424, 43448, 43433, 43455, 43429, 43456, 43442];
        let shaped = shape_with_defaults(&to_utf16(input));
        assert_eq!(shaped.as_slice(), expected);
    }
}