allsorts_subset_browser/scripts/
arabic.rs

1//! Implementation of font shaping for Arabic scripts
2//!
3//! Code herein follows the specification at:
4//! <https://github.com/n8willis/opentype-shaping-documents/blob/master/opentype-shaping-arabic-general.md>
5
6use unicode_joining_type::{get_joining_type, JoiningType};
7
8use crate::error::{ParseError, ShapingError};
9use crate::gsub::{self, FeatureMask, GlyphData, GlyphOrigin, RawGlyph};
10use crate::layout::{FeatureTableSubstitution, GDEFTable, LayoutCache, LayoutTable, GSUB};
11use crate::tag;
12use crate::unicode::mcc::{
13    modified_combining_class, sort_by_modified_combining_class, ModifiedCombiningClass,
14};
15
16#[derive(Clone)]
17struct ArabicData {
18    joining_type: JoiningType,
19    feature_tag: u32,
20}
21
22impl GlyphData for ArabicData {
23    fn merge(data1: ArabicData, _data2: ArabicData) -> ArabicData {
24        // TODO hold off for future Unicode normalisation changes
25        data1
26    }
27}
28
29// Arabic glyphs are represented as `RawGlyph` structs with `ArabicData` for its `extra_data`.
30type ArabicGlyph = RawGlyph<ArabicData>;
31
32impl ArabicGlyph {
33    fn is_transparent(&self) -> bool {
34        self.extra_data.joining_type == JoiningType::Transparent || self.multi_subst_dup()
35    }
36
37    fn is_left_joining(&self) -> bool {
38        self.extra_data.joining_type == JoiningType::LeftJoining
39            || self.extra_data.joining_type == JoiningType::DualJoining
40            || self.extra_data.joining_type == JoiningType::JoinCausing
41    }
42
43    fn is_right_joining(&self) -> bool {
44        self.extra_data.joining_type == JoiningType::RightJoining
45            || self.extra_data.joining_type == JoiningType::DualJoining
46            || self.extra_data.joining_type == JoiningType::JoinCausing
47    }
48
49    fn feature_tag(&self) -> u32 {
50        self.extra_data.feature_tag
51    }
52
53    fn set_feature_tag(&mut self, feature_tag: u32) {
54        self.extra_data.feature_tag = feature_tag
55    }
56}
57
58impl From<&RawGlyph<()>> for ArabicGlyph {
59    fn from(raw_glyph: &RawGlyph<()>) -> ArabicGlyph {
60        // Since there's no `Char` to work out the `ArabicGlyph`s joining type when the glyph's
61        // `glyph_origin` is `GlyphOrigin::Direct`, we fallback to `JoiningType::NonJoining` as
62        // the safest approach
63        let joining_type = match raw_glyph.glyph_origin {
64            GlyphOrigin::Char(c) => get_joining_type(c),
65            GlyphOrigin::Direct => JoiningType::NonJoining,
66        };
67
68        ArabicGlyph {
69            unicodes: raw_glyph.unicodes.clone(),
70            glyph_index: raw_glyph.glyph_index,
71            liga_component_pos: raw_glyph.liga_component_pos,
72            glyph_origin: raw_glyph.glyph_origin,
73            flags: raw_glyph.flags,
74            variation: raw_glyph.variation,
75            extra_data: ArabicData {
76                joining_type,
77                // For convenience, we loosely follow the spec (`2. Computing letter joining
78                // states`) here by initialising all `ArabicGlyph`s to `tag::ISOL`
79                feature_tag: tag::ISOL,
80            },
81        }
82    }
83}
84
85impl From<&ArabicGlyph> for RawGlyph<()> {
86    fn from(arabic_glyph: &ArabicGlyph) -> RawGlyph<()> {
87        RawGlyph {
88            unicodes: arabic_glyph.unicodes.clone(),
89            glyph_index: arabic_glyph.glyph_index,
90            liga_component_pos: arabic_glyph.liga_component_pos,
91            glyph_origin: arabic_glyph.glyph_origin,
92            flags: arabic_glyph.flags,
93            variation: arabic_glyph.variation,
94            extra_data: (),
95        }
96    }
97}
98
99pub fn gsub_apply_arabic(
100    gsub_cache: &LayoutCache<GSUB>,
101    gsub_table: &LayoutTable<GSUB>,
102    gdef_table: Option<&GDEFTable>,
103    script_tag: u32,
104    lang_tag: Option<u32>,
105    feature_variations: Option<&FeatureTableSubstitution<'_>>,
106    raw_glyphs: &mut Vec<RawGlyph<()>>,
107) -> Result<(), ShapingError> {
108    match gsub_table.find_script(script_tag)? {
109        Some(s) => {
110            if s.find_langsys_or_default(lang_tag)?.is_none() {
111                return Ok(());
112            }
113        }
114        None => return Ok(()),
115    }
116
117    let arabic_glyphs = &mut raw_glyphs.iter().map(ArabicGlyph::from).collect();
118
119    // 1. Compound character composition and decomposition
120
121    apply_lookups(
122        FeatureMask::CCMP,
123        gsub_cache,
124        gsub_table,
125        gdef_table,
126        script_tag,
127        lang_tag,
128        feature_variations,
129        arabic_glyphs,
130        |_, _| true,
131    )?;
132
133    // 2. Computing letter joining states
134
135    {
136        let mut previous_i = arabic_glyphs
137            .iter()
138            .position(|g| !g.is_transparent())
139            .unwrap_or(0);
140
141        for i in (previous_i + 1)..arabic_glyphs.len() {
142            if arabic_glyphs[i].is_transparent() {
143                continue;
144            }
145
146            if arabic_glyphs[previous_i].is_left_joining() && arabic_glyphs[i].is_right_joining() {
147                arabic_glyphs[i].set_feature_tag(tag::FINA);
148
149                match arabic_glyphs[previous_i].feature_tag() {
150                    tag::ISOL => arabic_glyphs[previous_i].set_feature_tag(tag::INIT),
151                    tag::FINA => arabic_glyphs[previous_i].set_feature_tag(tag::MEDI),
152                    _ => {}
153                }
154            }
155
156            previous_i = i;
157        }
158    }
159
160    // 3. Applying the stch feature
161    //
162    // TODO hold off for future generalised solution (including the Syriac Abbreviation Mark)
163
164    // 4. Applying the language-form substitution features from GSUB
165
166    const LANGUAGE_FEATURES: &[(FeatureMask, bool)] = &[
167        (FeatureMask::LOCL, true),
168        (FeatureMask::ISOL, false),
169        (FeatureMask::FINA, false),
170        (FeatureMask::MEDI, false),
171        (FeatureMask::INIT, false),
172        (FeatureMask::RLIG, true),
173        (FeatureMask::RCLT, true),
174        (FeatureMask::CALT, true),
175    ];
176
177    for &(feature_mask, is_global) in LANGUAGE_FEATURES {
178        apply_lookups(
179            feature_mask,
180            gsub_cache,
181            gsub_table,
182            gdef_table,
183            script_tag,
184            lang_tag,
185            feature_variations,
186            arabic_glyphs,
187            |g, feature_tag| is_global || g.feature_tag() == feature_tag,
188        )?;
189    }
190
191    // 5. Applying the typographic-form substitution features from GSUB
192    //
193    // Note that we skip `GSUB`'s `DLIG` and `CSWH` features as results would differ from other
194    // Arabic shapers
195
196    const TYPOGRAPHIC_FEATURES: &[FeatureMask] = &[FeatureMask::LIGA, FeatureMask::MSET];
197
198    for &feature_mask in TYPOGRAPHIC_FEATURES {
199        apply_lookups(
200            feature_mask,
201            gsub_cache,
202            gsub_table,
203            gdef_table,
204            script_tag,
205            lang_tag,
206            feature_variations,
207            arabic_glyphs,
208            |_, _| true,
209        )?;
210    }
211
212    // 6. Mark reordering
213    //
214    // Handled in the text preprocessing stage.
215
216    *raw_glyphs = arabic_glyphs.iter().map(RawGlyph::from).collect();
217
218    Ok(())
219}
220
221fn apply_lookups(
222    feature_mask: FeatureMask,
223    gsub_cache: &LayoutCache<GSUB>,
224    gsub_table: &LayoutTable<GSUB>,
225    gdef_table: Option<&GDEFTable>,
226    script_tag: u32,
227    lang_tag: Option<u32>,
228    feature_variations: Option<&FeatureTableSubstitution<'_>>,
229    arabic_glyphs: &mut Vec<ArabicGlyph>,
230    pred: impl Fn(&ArabicGlyph, u32) -> bool + Copy,
231) -> Result<(), ParseError> {
232    let index = gsub::get_lookups_cache_index(
233        gsub_cache,
234        script_tag,
235        lang_tag,
236        feature_variations,
237        feature_mask,
238    )?;
239    let lookups = &gsub_cache.cached_lookups.borrow()[index];
240
241    for &(lookup_index, feature_tag) in lookups {
242        gsub::gsub_apply_lookup(
243            gsub_cache,
244            gsub_table,
245            gdef_table,
246            lookup_index,
247            feature_tag,
248            None,
249            arabic_glyphs,
250            0,
251            arabic_glyphs.len(),
252            |g| pred(g, feature_tag),
253        )?;
254    }
255
256    Ok(())
257}
258
259/// Reorder Arabic marks per AMTRA. See: https://www.unicode.org/reports/tr53/.
260pub(super) fn reorder_marks(cs: &mut [char]) {
261    sort_by_modified_combining_class(cs);
262
263    for css in
264        cs.split_mut(|&c| modified_combining_class(c) == ModifiedCombiningClass::NotReordered)
265    {
266        reorder_marks_shadda(css);
267        reorder_marks_other_combining(css, ModifiedCombiningClass::Above);
268        reorder_marks_other_combining(css, ModifiedCombiningClass::Below);
269    }
270}
271
272fn reorder_marks_shadda(cs: &mut [char]) {
273    use std::cmp::Ordering;
274
275    // 2a. Move any Shadda characters to the beginning of S, where S is a max
276    // length substring of non-starter characters.
277    fn comparator(c1: &char, _c2: &char) -> Ordering {
278        if modified_combining_class(*c1) == ModifiedCombiningClass::CCC33 {
279            Ordering::Less
280        } else {
281            Ordering::Equal
282        }
283    }
284    cs.sort_by(comparator)
285}
286
287fn reorder_marks_other_combining(cs: &mut [char], mcc: ModifiedCombiningClass) {
288    debug_assert!(mcc == ModifiedCombiningClass::Below || mcc == ModifiedCombiningClass::Above);
289
290    // Get the start index of a possible sequence of characters with canonical
291    // combining class equal to `mcc`. (Assumes that `glyphs` is normalised to
292    // NFD.)
293    let first = cs.iter().position(|&c| modified_combining_class(c) == mcc);
294
295    if let Some(first) = first {
296        // 2b/2c. If the sequence of characters _begins_ with any MCM characters,
297        // move the sequence of such characters to the beginning of S.
298        let count = cs[first..]
299            .iter()
300            .take_while(|&&c| is_modifier_combining_mark(c))
301            .count();
302        cs[..(first + count)].rotate_right(count);
303    }
304}
305
/// Returns `true` if `ch` is one of the modifier combining marks (MCM) listed below.
///
/// See <https://www.unicode.org/reports/tr53/tr53-6.html#MCM>.
fn is_modifier_combining_mark(ch: char) -> bool {
    matches!(
        ch,
        '\u{0654}' // ARABIC HAMZA ABOVE
        | '\u{0655}' // ARABIC HAMZA BELOW
        | '\u{0658}' // ARABIC MARK NOON GHUNNA
        | '\u{06DC}' // ARABIC SMALL HIGH SEEN
        | '\u{06E3}' // ARABIC SMALL LOW SEEN
        | '\u{06E7}' // ARABIC SMALL HIGH YEH
        | '\u{06E8}' // ARABIC SMALL HIGH NOON
        | '\u{08CA}' // ARABIC SMALL HIGH FARSI YEH
        | '\u{08CB}' // ARABIC SMALL HIGH YEH BARREE WITH TWO DOTS BELOW
        | '\u{08CD}' // ARABIC SMALL HIGH ZAH
        | '\u{08CE}' // ARABIC LARGE ROUND DOT ABOVE
        | '\u{08CF}' // ARABIC LARGE ROUND DOT BELOW
        | '\u{08D3}' // ARABIC SMALL LOW WAW
        | '\u{08F3}' // ARABIC SMALL HIGH WAW
    )
}
326
#[cfg(test)]
mod tests {
    use super::*;

    // Inputs and expected outputs are taken from
    // https://www.unicode.org/reports/tr53/#Demonstrating_AMTRA, except where a comment on the
    // test says otherwise.
    mod reorder_marks {
        use super::*;

        /// Runs `reorder_marks` on a copy of `cs` and asserts that the result equals `cs_exp`.
        fn test_reorder_marks(cs: &[char], cs_exp: &[char]) {
            let mut cs_act = cs.to_vec();
            reorder_marks(&mut cs_act);
            assert_eq!(cs_exp, cs_act.as_slice());
        }

        #[test]
        fn test_artificial() {
            let cs = [
                '\u{0618}', '\u{0619}', '\u{064E}', '\u{064F}', '\u{0654}', '\u{0658}', '\u{0653}',
                '\u{0654}', '\u{0651}', '\u{0656}', '\u{0651}', '\u{065C}', '\u{0655}', '\u{0650}',
            ];
            let cs_exp = [
                '\u{0654}', '\u{0658}', '\u{0651}', '\u{0651}', '\u{0618}', '\u{064E}', '\u{0619}',
                '\u{064F}', '\u{0650}', '\u{0656}', '\u{065C}', '\u{0655}', '\u{0653}', '\u{0654}',
            ];
            test_reorder_marks(&cs, &cs_exp);
        }

        // Variant of `test_artificial` where U+0656 is replaced with U+0655
        // to test the reordering of MCM characters for the ccc = 220 group.
        #[test]
        fn test_artificial_custom() {
            let cs = [
                '\u{0618}', '\u{0619}', '\u{064E}', '\u{064F}', '\u{0654}', '\u{0658}', '\u{0653}',
                '\u{0654}', '\u{0651}', '\u{0655}', '\u{0651}', '\u{065C}', '\u{0655}', '\u{0650}',
            ];
            let cs_exp = [
                '\u{0655}', '\u{0654}', '\u{0658}', '\u{0651}', '\u{0651}', '\u{0618}', '\u{064E}',
                '\u{0619}', '\u{064F}', '\u{0650}', '\u{065C}', '\u{0655}', '\u{0653}', '\u{0654}',
            ];
            test_reorder_marks(&cs, &cs_exp);
        }

        #[test]
        fn test_example1() {
            let cs1 = ['\u{0627}', '\u{064F}', '\u{0654}'];
            let cs1_exp = ['\u{0627}', '\u{0654}', '\u{064F}'];
            test_reorder_marks(&cs1, &cs1_exp);

            let cs2 = ['\u{0627}', '\u{064F}', '\u{034F}', '\u{0654}'];
            test_reorder_marks(&cs2, &cs2);

            let cs3 = ['\u{0649}', '\u{0650}', '\u{0655}'];
            let cs3_exp = ['\u{0649}', '\u{0655}', '\u{0650}'];
            test_reorder_marks(&cs3, &cs3_exp);

            let cs4 = ['\u{0649}', '\u{0650}', '\u{034F}', '\u{0655}'];
            test_reorder_marks(&cs4, &cs4);
        }

        #[test]
        fn test_example2a() {
            let cs = ['\u{0635}', '\u{06DC}', '\u{0652}'];
            test_reorder_marks(&cs, &cs);
        }

        #[test]
        fn test_example2b() {
            let cs1 = ['\u{0647}', '\u{0652}', '\u{06DC}'];
            let cs1_exp = ['\u{0647}', '\u{06DC}', '\u{0652}'];
            test_reorder_marks(&cs1, &cs1_exp);

            let cs2 = ['\u{0647}', '\u{0652}', '\u{034F}', '\u{06DC}'];
            test_reorder_marks(&cs2, &cs2);
        }

        #[test]
        fn test_example3() {
            let cs1 = ['\u{0640}', '\u{0650}', '\u{0651}', '\u{06E7}'];
            // The expected output in https://www.unicode.org/reports/tr53/#Example3
            //
            // [U+0640, U+0650, U+06E7, U+0651]
            //
            // is incorrect, in that it fails to account for U+0651 Shadda moving to
            // the front of U+0650 Kasra, per step 2a of AMTRA.
            //
            // U+06E7 Small High Yeh should then move to the front of Shadda per step
            // 2b, resulting in:
            let cs1_exp = ['\u{0640}', '\u{06E7}', '\u{0651}', '\u{0650}'];
            test_reorder_marks(&cs1, &cs1_exp);

            let cs2 = ['\u{0640}', '\u{0650}', '\u{0651}', '\u{034F}', '\u{06E7}'];
            // As above, Shadda should move to the front of Kasra, so the expected
            // output in https://www.unicode.org/reports/tr53/#Example3
            //
            // [U+0640, U+0650, U+0651, U+034F, U+06E7]
            //
            // (i.e. no changes) is also incorrect.
            let cs2_exp = ['\u{0640}', '\u{0651}', '\u{0650}', '\u{034F}', '\u{06E7}'];
            test_reorder_marks(&cs2, &cs2_exp);
        }

        #[test]
        fn test_example4a() {
            let cs = ['\u{0640}', '\u{0652}', '\u{034F}', '\u{06E8}'];
            test_reorder_marks(&cs, &cs);
        }

        #[test]
        fn test_example4b() {
            let cs1 = ['\u{06C6}', '\u{064F}', '\u{06E8}'];
            let cs1_exp = ['\u{06C6}', '\u{06E8}', '\u{064F}'];
            test_reorder_marks(&cs1, &cs1_exp);

            let cs2 = ['\u{06C6}', '\u{064F}', '\u{034F}', '\u{06E8}'];
            test_reorder_marks(&cs2, &cs2);
        }
    }
}