oxideav_scribe/shaping/arabic.rs
1//! Arabic / Hebrew RTL contextual joining (round 7).
2//!
3//! Implements the Unicode joining-type lookup + adjacency state machine
4//! that picks one of `{Isol, Init, Medi, Fina}` for every character in
5//! a run. Downstream the chosen form selects which OpenType GSUB feature
6//! tag (`isol` / `init` / `medi` / `fina`) the shaper applies — letting
7//! a font swap "isolated alif" for "final alif", etc.
8//!
9//! ## References
10//!
11//! - Unicode Standard Annex #44 (UCD) — `ArabicShaping.txt` (joining
12//! type per codepoint).
13//! - Unicode core specification §9.2 — Arabic joining algorithm.
14//! - Microsoft OpenType Layout — Arabic shaping (the `isol` / `init` /
15//! `medi` / `fina` feature contract).
16//!
17//! No HarfBuzz / FreeType / pango / ICU layout source consulted — this
18//! is a clean-room implementation of the algorithm described in the
19//! Unicode + OpenType specs.
20//!
21//! ## Algorithm
22//!
23//! 1. For each char compute its [`JoiningClass`] via [`joining_class`].
24//! 2. Walk the run left-to-right, **skipping `T` (transparent)** chars
25//! when computing neighbours, to determine whether each non-T char
26//! can join with its left/right neighbour.
//! 3. The form is then ("left" = previous char, "right" = next char,
//!    both in logical order):
//!    - `Isol` — neither side joins
//!    - `Init` — only the right side joins (left edge of a chain)
//!    - `Medi` — both sides join (interior of a chain)
//!    - `Fina` — only the left side joins (right edge of a chain)
32//!
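//! "Joins" here means: the neighbour's joining class is in
//! `{D, L, C}` for the *left* (previous) neighbour — it can join to the
//! char that follows it — and `{D, R, C}` for the *right* (next)
//! neighbour — it can join to the char that precedes it. The char
//! itself must also permit joining on that side (`R` only towards the
//! previous char, `L` only towards the next, `D` / `C` both, `U`
//! neither). `T` chars (combining marks, etc.) are pass-through — they
//! inherit the form of the char they decorate.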
38//!
39//! Logical-order input is assumed (post-bidi). The state machine itself
40//! is direction-agnostic — RTL display order is the rasterizer's
41//! concern, not this module's.
42
43#![allow(clippy::manual_range_contains)]
44
/// Unicode joining type of a single codepoint, as produced by
/// [`joining_class`]. Variant names match the UCD `ArabicShaping.txt`
/// single-letter codes.
///
/// The UCD names `L` / `R` after the *visual* side of a right-to-left
/// line. [`compute_forms`] works in logical order instead, so there a
/// right-joining (`R`) char joins to the char *before* it and a
/// left-joining (`L`) char joins to the char *after* it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JoiningClass {
    /// Non-joining. Breaks the joining chain on both sides. Default for
    /// non-Arabic / non-Syriac codepoints, plus Arabic punctuation and
    /// most digits. ZWNJ (U+200C) is also reported as `U`.
    U,
    /// Left-joining. Joins only to the char that follows it in logical
    /// order (its visual left in RTL text). Rare in Arabic (used in some
    /// Manichaean / Phags-pa style scripts; included for completeness);
    /// the lookup table in this file has no `L` entries.
    L,
    /// Right-joining. Joins only to the char that precedes it in logical
    /// order (its visual right in RTL text). The "alif" family: alif,
    /// dal, dhal, reh, zain, waw and a few others.
    R,
    /// Dual-joining. Joins on both sides. The bulk of the Arabic
    /// alphabet — ba, ta, tha, jeem, hah, etc.
    D,
    /// Joining-causing. Forces a joining context regardless of intrinsic
    /// joinability. ZWJ (U+200D), tatweel (U+0640). Treated like `D`
    /// when a form is selected.
    C,
    /// Transparent. Combining mark / harakat — does not participate in
    /// joining; the chain skips over T chars when computing adjacency.
    T,
}
70
/// Contextual form selected for one character of a run. Each variant
/// corresponds to exactly one of the four standard Arabic OpenType
/// feature tags.
///
/// "Left" means the previous character and "right" the next character,
/// both in logical order.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JoiningForm {
    /// Isolated — joins on neither side. Selects the `isol` feature.
    Isol,
    /// Initial — joins only on the right, i.e. the character opens a
    /// chain. Selects the `init` feature.
    Init,
    /// Medial — joins on both sides, i.e. the character sits inside a
    /// chain. Selects the `medi` feature.
    Medi,
    /// Final — joins only on the left, i.e. the character closes a
    /// chain. Selects the `fina` feature.
    Fina,
}

impl JoiningForm {
    /// Returns the OpenType feature tag that selects this form's
    /// substitution: the four ASCII bytes of the tag in reading order
    /// (e.g. `b"isol"`).
    pub fn feature_tag(self) -> [u8; 4] {
        let tag: &[u8; 4] = match self {
            JoiningForm::Isol => b"isol",
            JoiningForm::Init => b"init",
            JoiningForm::Medi => b"medi",
            JoiningForm::Fina => b"fina",
        };
        *tag
    }
}
100
/// Coarse script classification used to decide which feature tag list
/// the shaper applies to a run. Only the scripts that need contextual
/// shaping are enumerated; everything else collapses to `Other`.
///
/// [`script_of`] is the single source of truth for the codepoint ranges
/// quoted on each variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Script {
    /// Arabic block + supplements + extended blocks + presentation forms
    /// (U+0600..U+06FF, U+0750..U+077F, U+0870..U+08FF, U+FB50..U+FDFF,
    /// U+FE70..U+FEFF).
    Arabic,
    /// Hebrew block + Alphabetic Presentation Forms-A Hebrew range
    /// (U+0590..U+05FF, U+FB1D..U+FB4F).
    Hebrew,
    /// Devanagari block (U+0900..U+097F). Hindi / Marathi / Sanskrit /
    /// Nepali. Round 8 added cluster-based shaping — see
    /// [`super::indic`] for the cluster machine and
    /// [`super::indic::devanagari_feature_tags`] for the
    /// substitution-feature application order.
    Devanagari,
    /// Bengali block (U+0980..U+09FF). Bengali / Assamese / Manipuri.
    /// Round 10 added cluster-based shaping — same broad shape as
    /// Devanagari (halant-driven conjuncts, reph rule for RA U+09B0,
    /// pre-base matra reorder) but Bengali has THREE pre-base matras
    /// (U+09BF / U+09C7 / U+09C8) instead of Devanagari's one.
    Bengali,
    /// Tamil block (U+0B80..U+0BFF). Tamil. Round 10 added
    /// minimal cluster-based shaping: pre-base matra reorder (U+0BC6 /
    /// U+0BC7 / U+0BC8) only — no reph (Tamil RA renders in-line),
    /// no nukta, no conjunct formation in the modern orthography.
    Tamil,
    /// Gurmukhi block (U+0A00..U+0A7F). Punjabi. Round 11 added
    /// halant-driven cluster machine: pre-base matra reorder
    /// (U+0A3F sign "i"); reph rare in modern usage (RA U+0A30 sets
    /// the flag for fonts that ship a `rphf` lookup, callers without
    /// one fall back to in-line RA rendering).
    Gurmukhi,
    /// Gujarati block (U+0A80..U+0AFF). Gujarati. Round 11 added —
    /// closest in shape to Devanagari (halant-driven conjuncts;
    /// pre-base matra U+0ABF; reph rule on RA U+0AB0).
    Gujarati,
    /// Telugu block (U+0C00..U+0C7F). Telugu. Round 11 added —
    /// reph identification on RA U+0C30 plus pre-base matra reorder
    /// for U+0C46 / U+0C47 / U+0C48 (e / ee / ai). The Telugu split
    /// vowels (U+0C46 + U+0C56) decompose to a pre-base + post-base
    /// pair under NFD; the cluster machine flags the pre-base
    /// component for reorder.
    Telugu,
    /// Kannada block (U+0C80..U+0CFF). Kannada. Round 11 added —
    /// similar shape to Telugu (reph on RA U+0CB0; pre-base matras
    /// U+0CC6 / U+0CC7 / U+0CC8) with its own codepoints + halant
    /// (U+0CCD).
    Kannada,
    /// Malayalam block (U+0D00..U+0D7F). Malayalam. Round 11 added —
    /// pre-base matras U+0D46 / U+0D47 / U+0D48 plus the chillu
    /// (half-form) characters U+0D7A..U+0D7F treated as
    /// consonants (they are NFC-stable independent codepoints in modern
    /// Malayalam orthography). No reph in modern Malayalam — chillu
    /// replaces the historic reph rendering.
    Malayalam,
    /// Oriya block (U+0B00..U+0B7F). Oriya / Odia. Round 11 added —
    /// reph identification on RA U+0B30 plus pre-base matra reorder
    /// for U+0B47 / U+0B48 / U+0B4B / U+0B4C (Oriya is unusual in that
    /// the precomposed o / au matras are themselves pre-base after
    /// canonical decomposition). Halant U+0B4D drives conjuncts.
    Oriya,
    /// Sinhala block (U+0D80..U+0DFF). Sinhala. Round 12 (Brahmic
    /// non-Indic) — closest to Indic in shape. Halant / al-lakuna
    /// U+0DCA drives conjuncts; pre-base matras U+0DD9..U+0DDB
    /// (e / ee / ai) plus the precomposed two-part vowels
    /// U+0DDC..U+0DDE (o / oo / au) reorder to the front of the
    /// cluster. No reph (Sinhala has no superscript reph rendering).
    Sinhala,
    /// Khmer block (U+1780..U+17FF). Khmer / Cambodian. Round 12 added
    /// — coeng (U+17D2) plays the role of halant and stacks subjoined
    /// consonants underneath the base; subjoined chains are commonly
    /// 2-3 deep in Pali borrowings. Pre-base matras U+17BE / U+17BF /
    /// U+17C0..U+17C5 reorder to the front of the cluster. No reph.
    Khmer,
    /// Thai block (U+0E00..U+0E7F). Thai. Round 12 added — no halant
    /// and no conjunct formation; pre-base vowels U+0E40..U+0E44
    /// already appear in storage / keyboard order BEFORE their
    /// consonant (the one Indic-family script where this is the case),
    /// so no reorder is needed — the cluster machine simply starts a
    /// new cluster at each pre-base vowel. Tone marks U+0E48..U+0E4B
    /// + signs U+0E4C..U+0E4E attach to the cluster end.
    Thai,
    /// Anything else — Latin, CJK, Cyrillic, Greek, etc.
    Other,
}

/// Detect the script of `ch` from its Unicode block.
///
/// Returns [`Script::Other`] for any codepoint not in one of the
/// explicitly-handled blocks. The Arabic arm includes Arabic Extended-B
/// (U+0870..U+089F) so that it agrees with the set of blocks
/// [`joining_class`] treats as Arabic; all ranges are disjoint, so arm
/// order carries no meaning.
pub fn script_of(ch: char) -> Script {
    match ch as u32 {
        0x0600..=0x06FF
        | 0x0750..=0x077F
        | 0x0870..=0x08FF // Extended-B (0870..089F) + Extended-A (08A0..08FF)
        | 0xFB50..=0xFDFF
        | 0xFE70..=0xFEFF => Script::Arabic,
        0x0590..=0x05FF | 0xFB1D..=0xFB4F => Script::Hebrew,
        0x0900..=0x097F => Script::Devanagari,
        0x0980..=0x09FF => Script::Bengali,
        0x0A00..=0x0A7F => Script::Gurmukhi,
        0x0A80..=0x0AFF => Script::Gujarati,
        0x0B00..=0x0B7F => Script::Oriya,
        0x0B80..=0x0BFF => Script::Tamil,
        0x0C00..=0x0C7F => Script::Telugu,
        0x0C80..=0x0CFF => Script::Kannada,
        0x0D00..=0x0D7F => Script::Malayalam,
        0x0D80..=0x0DFF => Script::Sinhala,
        0x0E00..=0x0E7F => Script::Thai,
        0x1780..=0x17FF => Script::Khmer,
        _ => Script::Other,
    }
}
242
243/// Feature tags the shaper should attempt to apply for a run of the
244/// given script. Arabic returns the four joining features; Devanagari
245/// returns the spec-mandated Indic substitution + presentation feature
246/// chain (round 8); Hebrew exposes `ccmp` so future mark-composition
247/// lookups can hook in. The shape pipeline ignores tags it doesn't
248/// know how to apply.
249pub fn feature_tags_for_run(script: Script) -> Vec<[u8; 4]> {
250 match script {
251 Script::Arabic => vec![*b"isol", *b"init", *b"medi", *b"fina"],
252 Script::Hebrew => vec![*b"ccmp"],
253 Script::Devanagari => super::indic::devanagari_feature_tags(),
254 Script::Bengali => super::indic::bengali_feature_tags(),
255 Script::Tamil => super::indic::tamil_feature_tags(),
256 Script::Gurmukhi => super::indic::gurmukhi_feature_tags(),
257 Script::Gujarati => super::indic::gujarati_feature_tags(),
258 Script::Telugu => super::indic::telugu_feature_tags(),
259 Script::Kannada => super::indic::kannada_feature_tags(),
260 Script::Malayalam => super::indic::malayalam_feature_tags(),
261 Script::Oriya => super::indic::oriya_feature_tags(),
262 Script::Sinhala => super::indic::sinhala_feature_tags(),
263 Script::Khmer => super::indic::khmer_feature_tags(),
264 Script::Thai => super::indic::thai_feature_tags(),
265 Script::Other => Vec::new(),
266 }
267}
268
269/// Look up the Unicode joining class for `ch`. The table covers the
270/// Arabic + Syriac + Arabic Supplement + Arabic Extended-A blocks plus
271/// the general-category-Mn combining marks that overlap them.
272///
273/// Codepoints outside the joining-aware blocks return [`JoiningClass::U`]
274/// — the safe "non-joining" default that breaks any chain on both
275/// sides. This matches the UCD's "no entry → U" rule.
276pub fn joining_class(ch: char) -> JoiningClass {
277 let cp = ch as u32;
278 // Fast-path: outside any joining-aware block → U.
279 let in_arabic_block = (0x0600..=0x06FF).contains(&cp)
280 || (0x0750..=0x077F).contains(&cp)
281 || (0x0870..=0x089F).contains(&cp)
282 || (0x08A0..=0x08FF).contains(&cp)
283 || (0xFB50..=0xFDFF).contains(&cp)
284 || (0xFE70..=0xFEFF).contains(&cp);
285 let in_syriac_block = (0x0700..=0x074F).contains(&cp);
286 let in_zwj_zwnj = cp == 0x200C || cp == 0x200D;
287 if !in_arabic_block && !in_syriac_block && !in_zwj_zwnj {
288 return JoiningClass::U;
289 }
290
291 // Joining-causing: ZWJ, tatweel, kashida-like.
292 if cp == 0x200D || cp == 0x0640 || cp == 0x07FA {
293 return JoiningClass::C;
294 }
295 // Non-joiner: ZWNJ explicitly *blocks* joining.
296 if cp == 0x200C {
297 return JoiningClass::U;
298 }
299
300 // Transparent: combining marks (general category Mn) within the
301 // Arabic / Syriac blocks. Round-7 enumerates the dense ranges
302 // explicitly rather than carrying the full UCD; the omitted
303 // codepoints fall through to U which is also acceptable for marks
304 // in this round (a marked char that's mistakenly U-classified
305 // becomes a chain break, which is visually conservative).
306 if is_transparent_mark(cp) {
307 return JoiningClass::T;
308 }
309
310 // Hand-curated joining-class table for the Arabic letters we
311 // actually shape. Sourced from `ArabicShaping.txt` (UCD).
312 match cp {
313 // -- Arabic letters in U+0620..U+064A ----------------------
314 // Hamza variants (R = right-joining for hamza-on-base where
315 // applicable; bare hamza U+0621 is U).
316 0x0621 => JoiningClass::U, // ARABIC LETTER HAMZA
317 0x0622 => JoiningClass::R, // ALEF WITH MADDA ABOVE
318 0x0623 => JoiningClass::R, // ALEF WITH HAMZA ABOVE
319 0x0624 => JoiningClass::R, // WAW WITH HAMZA ABOVE
320 0x0625 => JoiningClass::R, // ALEF WITH HAMZA BELOW
321 0x0626 => JoiningClass::D, // YEH WITH HAMZA ABOVE
322 0x0627 => JoiningClass::R, // ALEF
323 0x0628 => JoiningClass::D, // BEH
324 0x0629 => JoiningClass::R, // TEH MARBUTA
325 0x062A => JoiningClass::D, // TEH
326 0x062B => JoiningClass::D, // THEH
327 0x062C => JoiningClass::D, // JEEM
328 0x062D => JoiningClass::D, // HAH
329 0x062E => JoiningClass::D, // KHAH
330 0x062F => JoiningClass::R, // DAL
331 0x0630 => JoiningClass::R, // THAL
332 0x0631 => JoiningClass::R, // REH
333 0x0632 => JoiningClass::R, // ZAIN
334 0x0633 => JoiningClass::D, // SEEN
335 0x0634 => JoiningClass::D, // SHEEN
336 0x0635 => JoiningClass::D, // SAD
337 0x0636 => JoiningClass::D, // DAD
338 0x0637 => JoiningClass::D, // TAH
339 0x0638 => JoiningClass::D, // ZAH
340 0x0639 => JoiningClass::D, // AIN
341 0x063A => JoiningClass::D, // GHAIN
342 // 0x063B..0x063F are extra letter forms — mostly D.
343 0x063B..=0x063F => JoiningClass::D,
344 // 0x0640 already handled above (tatweel = C).
345 0x0641 => JoiningClass::D, // FEH
346 0x0642 => JoiningClass::D, // QAF
347 0x0643 => JoiningClass::D, // KAF
348 0x0644 => JoiningClass::D, // LAM
349 0x0645 => JoiningClass::D, // MEEM
350 0x0646 => JoiningClass::D, // NOON
351 0x0647 => JoiningClass::D, // HEH
352 0x0648 => JoiningClass::R, // WAW
353 0x0649 => JoiningClass::D, // ALEF MAKSURA (D in modern usage)
354 0x064A => JoiningClass::D, // YEH
355 // 0x064B..0x065F harakat (already classified T above).
356 // -- Extended Arabic letters U+066E..U+06D3 ---------------
357 0x066E..=0x066F => JoiningClass::D,
358 0x0671..=0x0673 => JoiningClass::R,
359 0x0674 => JoiningClass::U,
360 0x0675..=0x0677 => JoiningClass::R,
361 0x0678..=0x0687 => JoiningClass::D,
362 0x0688..=0x0699 => JoiningClass::R,
363 0x069A..=0x06A9 => JoiningClass::D,
364 0x06AA => JoiningClass::R,
365 0x06AB..=0x06BF => JoiningClass::D,
366 0x06C0 => JoiningClass::R,
367 0x06C1..=0x06C2 => JoiningClass::D,
368 0x06C3..=0x06CB => JoiningClass::R,
369 0x06CC => JoiningClass::D,
370 0x06CD => JoiningClass::R,
371 0x06CE => JoiningClass::D,
372 0x06CF => JoiningClass::R,
373 0x06D0..=0x06D1 => JoiningClass::D,
374 0x06D2..=0x06D3 => JoiningClass::R,
375 0x06D5 => JoiningClass::R,
376 // -- Arabic Supplement (U+0750..U+077F) — all D ----------
377 0x0750..=0x077F => JoiningClass::D,
378 // -- Arabic Extended-A (U+08A0..U+08B4 etc.) — mostly D --
379 0x08A0..=0x08B4 => JoiningClass::D,
380 0x08B6..=0x08BD => JoiningClass::D,
381 // Presentation forms — typically isolated by construction;
382 // returning U keeps them out of joining chains.
383 0xFB50..=0xFDFF => JoiningClass::U,
384 0xFE70..=0xFEFF => JoiningClass::U,
385 // Anything else in the joining-aware blocks → U (safe default).
386 _ => JoiningClass::U,
387 }
388}
389
/// Reports whether `cp` is one of the combining marks this module treats
/// as transparent (joining class `T`).
///
/// The set is a hand-enumerated list of dense ranges: the Arabic-block
/// marks (U+0610..U+061A, harakat U+064B..U+065F, superscript alef
/// U+0670, the Quranic annotation marks in U+06D6..U+06ED), the marks at
/// the end of Arabic Extended-A (U+08D3..U+08FF, minus U+08E2), and the
/// Syriac marks U+0711 and U+0730..U+074A.
fn is_transparent_mark(cp: u32) -> bool {
    matches!(
        cp,
        // Arabic block.
        0x0610..=0x061A
            | 0x064B..=0x065F
            | 0x0670 // ARABIC LETTER SUPERSCRIPT ALEF
            | 0x06D6..=0x06DC
            | 0x06DF..=0x06E4
            | 0x06E7..=0x06E8
            | 0x06EA..=0x06ED
            // Arabic Extended-A; U+08E2 is deliberately skipped.
            | 0x08D3..=0x08E1
            | 0x08E3..=0x08FF
            // Syriac marks.
            | 0x0711
            | 0x0730..=0x074A
    )
}
431
432/// Compute the chosen [`JoiningForm`] for every character in `chars`,
433/// applying the joining-adjacency state machine described in the module
434/// docs.
435///
436/// Inputs are assumed to be in **logical order** (post-bidi). The
437/// returned `Vec` has the same length as `chars`. T-class chars receive
438/// the same form as the most recent non-T base character so the caller
439/// can blindly index by char position.
440pub fn compute_forms(chars: &[char]) -> Vec<JoiningForm> {
441 let n = chars.len();
442 let mut forms = vec![JoiningForm::Isol; n];
443 if n == 0 {
444 return forms;
445 }
446 let classes: Vec<JoiningClass> = chars.iter().map(|&c| joining_class(c)).collect();
447
448 // Helper: index of the previous non-T char, or None.
449 let prev_non_t = |i: usize| -> Option<usize> {
450 let mut j = i;
451 while j > 0 {
452 j -= 1;
453 if classes[j] != JoiningClass::T {
454 return Some(j);
455 }
456 }
457 None
458 };
459 // Helper: index of the next non-T char, or None.
460 let next_non_t = |i: usize| -> Option<usize> {
461 let mut j = i + 1;
462 while j < n {
463 if classes[j] != JoiningClass::T {
464 return Some(j);
465 }
466 j += 1;
467 }
468 None
469 };
470
471 for i in 0..n {
472 let cls = classes[i];
473 if cls == JoiningClass::T {
474 // Resolved later — inherit from the preceding non-T base.
475 continue;
476 }
477 // "left_joins" = the previous non-T char can extend its joining
478 // chain to this char. A previous {D, L, C} can do so. Note
479 // that a previous U or R cannot — R only joins to its left
480 // neighbour (i.e. the *char before it*), not to its right.
481 let left_can_join = matches!(
482 prev_non_t(i).map(|j| classes[j]),
483 Some(JoiningClass::D) | Some(JoiningClass::L) | Some(JoiningClass::C)
484 );
485 // "right_joins" = the next non-T char can extend its chain back
486 // to this one. Next {D, R, C} can do so.
487 let right_can_join = matches!(
488 next_non_t(i).map(|j| classes[j]),
489 Some(JoiningClass::D) | Some(JoiningClass::R) | Some(JoiningClass::C)
490 );
491 // Now intersect with what *this* char allows on each side:
492 // - U: never joins → always Isol.
493 // - R: joins on the left only.
494 // - L: joins on the right only.
495 // - D: joins on both sides.
496 // - C: joins on both sides (joining-causing acts as D for
497 // the purpose of form selection).
498 let (this_left, this_right) = match cls {
499 JoiningClass::U => (false, false),
500 JoiningClass::R => (true, false),
501 JoiningClass::L => (false, true),
502 JoiningClass::D | JoiningClass::C => (true, true),
503 JoiningClass::T => unreachable!(),
504 };
505 let joins_left = left_can_join && this_left;
506 let joins_right = right_can_join && this_right;
507 forms[i] = match (joins_left, joins_right) {
508 (false, false) => JoiningForm::Isol,
509 (false, true) => JoiningForm::Init,
510 (true, true) => JoiningForm::Medi,
511 (true, false) => JoiningForm::Fina,
512 };
513 }
514
515 // Second pass: T chars inherit the form of the previous non-T base.
516 let mut last_form = JoiningForm::Isol;
517 for i in 0..n {
518 if classes[i] == JoiningClass::T {
519 forms[i] = last_form;
520 } else {
521 last_form = forms[i];
522 }
523 }
524
525 forms
526}
527
#[cfg(test)]
#[allow(non_snake_case)] // Test names keep the canonical capitalisation of
                         // UCD class letters (R / D) and codepoints
                         // (U+062x); renaming hurts readability.
mod tests {
    use super::JoiningForm::{Fina, Init, Isol, Medi};
    use super::*;

    // Codepoints used throughout, with their UCD joining class.
    const ALEF: char = '\u{0627}'; // R
    const BEH: char = '\u{0628}'; // D
    const TEH: char = '\u{062A}'; // D
    const LAM: char = '\u{0644}'; // D
    const FATHA: char = '\u{064E}'; // T
    const ZWNJ: char = '\u{200C}'; // U
    const ZWJ: char = '\u{200D}'; // C

    // -- joining_class ---------------------------------------------

    #[test]
    fn joining_class_lookup_returns_R_for_alif_U_0627() {
        // ALEF is the canonical right-joining letter.
        assert_eq!(joining_class(ALEF), JoiningClass::R);
    }

    #[test]
    fn joining_class_lookup_returns_D_for_ba_U_0628() {
        assert_eq!(joining_class(BEH), JoiningClass::D);
    }

    // -- compute_forms ---------------------------------------------

    #[test]
    fn dual_joining_letter_between_two_dual_joiners_picks_medi() {
        assert_eq!(compute_forms(&[BEH, BEH, BEH]), [Init, Medi, Fina]);
    }

    #[test]
    fn dual_joining_letter_at_start_picks_init() {
        assert_eq!(compute_forms(&[BEH, TEH]), [Init, Fina]);
    }

    #[test]
    fn right_joining_letter_at_end_picks_fina() {
        // ALEF (R) joins back to BEH, so BEH opens the chain and ALEF
        // closes it.
        assert_eq!(compute_forms(&[BEH, ALEF]), [Init, Fina]);
    }

    #[test]
    fn transparent_combining_mark_does_not_break_chain() {
        // The FATHA between the two BEHs is skipped when looking for
        // neighbours, and it repeats the form of the BEH it decorates.
        assert_eq!(compute_forms(&[BEH, FATHA, BEH]), [Init, Init, Fina]);
    }

    #[test]
    fn leading_transparent_mark_defaults_to_isol() {
        // A mark with no preceding base has nothing to inherit from.
        assert_eq!(compute_forms(&[FATHA, BEH]), [Isol, Isol]);
    }

    #[test]
    fn alef_after_lam_in_la_word_picks_fina() {
        // LAM + ALEF — the canonical "la" sequence.
        assert_eq!(compute_forms(&[LAM, ALEF]), [Init, Fina]);
    }

    #[test]
    fn isolated_letter_with_no_neighbours_picks_isol() {
        assert_eq!(compute_forms(&[BEH]), [Isol]);
    }

    #[test]
    fn right_joiner_followed_by_dual_joiner_breaks_chain() {
        // ALEF (R) cannot join the char after it, so the first BEH
        // starts a fresh chain.
        assert_eq!(compute_forms(&[ALEF, BEH, BEH]), [Isol, Init, Fina]);
    }

    #[test]
    fn space_between_letters_breaks_chain() {
        // Space is U, so nothing on either side of it can join.
        assert_eq!(compute_forms(&[BEH, ' ', BEH]), [Isol, Isol, Isol]);
    }

    #[test]
    fn zwj_pair_between_dual_joiners_keeps_chain_joined() {
        // ZWJ is joining-causing (C): each one behaves like a
        // dual-joiner, so the whole sequence forms one chain.
        assert_eq!(
            compute_forms(&[BEH, ZWJ, ZWJ, BEH]),
            [Init, Medi, Medi, Fina]
        );
    }

    #[test]
    fn zwnj_breaks_chain() {
        // ZWNJ (U) explicitly breaks the chain.
        assert_eq!(compute_forms(&[BEH, ZWNJ, BEH]), [Isol, Isol, Isol]);
    }

    #[test]
    fn empty_run_returns_empty() {
        assert!(compute_forms(&[]).is_empty());
    }

    #[test]
    fn arabic_word_alsalam_picks_expected_forms() {
        // "السلام" = ALEF LAM SEEN LAM ALEF MEEM
        // Joining classes: R D D D R D
        //   ALEF(R) Isol — nothing before it, and R never joins forward
        //   LAM(D)  Init — SEEN joins back; ALEF before cannot join forward
        //   SEEN(D) Medi — LAM on both sides
        //   LAM(D)  Medi — SEEN before, ALEF (R) after joins back
        //   ALEF(R) Fina — joins back to LAM, never forward
        //   MEEM(D) Isol — ALEF cannot join forward, nothing after
        let chars: Vec<char> = "السلام".chars().collect();
        assert_eq!(
            compute_forms(&chars),
            [Isol, Init, Medi, Medi, Fina, Isol]
        );
    }

    // -- script_of -------------------------------------------------

    #[test]
    fn script_of_arabic_alef_is_arabic() {
        assert_eq!(script_of(ALEF), Script::Arabic);
    }

    #[test]
    fn script_of_hebrew_alef_is_hebrew() {
        assert_eq!(script_of('\u{05D0}'), Script::Hebrew);
    }

    #[test]
    fn script_of_latin_a_is_other() {
        assert_eq!(script_of('A'), Script::Other);
    }

    // -- feature tags ----------------------------------------------

    #[test]
    fn feature_tags_for_arabic_includes_four_joining_features() {
        let tags = feature_tags_for_run(Script::Arabic);
        for wanted in [b"isol", b"init", b"medi", b"fina"].iter() {
            assert!(tags.contains(wanted));
        }
    }

    #[test]
    fn feature_tags_for_other_is_empty() {
        assert!(feature_tags_for_run(Script::Other).is_empty());
    }

    #[test]
    fn feature_tag_round_trips_per_form() {
        assert_eq!(Isol.feature_tag(), *b"isol");
        assert_eq!(Init.feature_tag(), *b"init");
        assert_eq!(Medi.feature_tag(), *b"medi");
        assert_eq!(Fina.feature_tag(), *b"fina");
    }
}