oxideav-scribe 0.1.5

Pure-Rust vector font shaper + layout for the oxideav framework — TrueType / OTF outline access, GSUB ligatures, GPOS kerning, mark attachment, CBDT colour bitmaps. Pixel pipeline lives in oxideav-raster.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
//! Indic complex-script shaping (round 8 / round 10).
//!
//! Devanagari (Hindi / Marathi / Sanskrit / Nepali) was the first Indic
//! script we supported (round 8, commit 67a50bc). Round 10 generalises
//! the same cluster-machine pattern to two more scripts:
//!
//! - **Bengali** (U+0980..U+09FF) — Bengali / Assamese / Manipuri.
//!   Closest to Devanagari: same halant-driven conjunct formation, same
//!   reph rule (RA U+09B0 + halant U+09CD + consonant), same family of
//!   pre-base reordering matras (U+09BF "i", U+09C7 "e", U+09C8 "ai"
//!   — Bengali is unusual in that THREE matras reorder, not just one).
//! - **Tamil** (U+0B80..U+0BFF) — minimal cluster reordering. No
//!   conjunct formation in the modern orthography (each consonant is
//!   independently spelled with its own pulli / virama U+0BCD when
//!   needed). The split-vowel matras (U+0BCA = U+0BC6 + U+0BBE,
//!   U+0BCB = U+0BC7 + U+0BBE, U+0BCC = U+0BC6 + U+0BD7) carry a
//!   pre-base component that requires reordering when present.
//!
//! Unlike Arabic — which is purely contextual joining over a
//! left-to-right glyph stream — Indic shaping is **cluster-based**:
//! input characters are grouped into orthographic syllables, then
//! re-ordered + re-shaped within each cluster according to script-
//! specific rules.
//!
//! ## Scope per round
//!
//! Round 8 covered Devanagari pre-base matra reorder + reph
//! identification (without `rphf` GSUB substitution).
//!
//! Round 10 (this round):
//! - Adds Bengali + Tamil as two more scripts under the same shape.
//! - Wires the `rphf` GSUB feature to the reph identification: when a
//!   cluster has [`ClusterFlags::has_reph`] AND the active face publishes
//!   a `rphf` lookup for the script, the leading RA glyph is rewritten
//!   to its reph form via [`oxideav_ttf::Font::gsub_apply_lookup_type_1`]
//!   and the halant is dropped. See [`crate::face_chain`] for the
//!   wiring.
//!
//! ## References
//!
//! - Unicode 15.1 Character Database (UAX #44) — `IndicSyllabicCategory.txt`
//!   and `IndicPositionalCategory.txt` (Indic syllabic categories).
//! - Unicode 15.1 Standard Annex #29 (text segmentation; grapheme
//!   cluster baseline).
//! - Microsoft OpenType Layout — *Creating and supporting OpenType
//!   fonts for the Devanagari script* (the canonical description of
//!   the cluster reorder rules + GSUB feature application order).
//! - Microsoft OpenType Layout — *Creating and supporting OpenType
//!   fonts for Indic scripts* (Bengali / Tamil / Telugu / Gujarati /
//!   Gurmukhi / Kannada / Malayalam / Oriya).
//!
//! No HarfBuzz / FreeType / pango / ICU layout source consulted. The
//! algorithms are clean-room implementations derived from the Unicode +
//! OpenType specs above plus the per-script `Shaping` informative
//! examples in the OpenType layout doc.

#![allow(clippy::manual_range_contains)]

/// Condensed Indic syllabic category used by the cluster machine.
///
/// The same enum is used across all supported Indic scripts — what
/// differs between scripts is the per-codepoint classifier
/// ([`devanagari_category`], [`bengali_category`], [`tamil_category`])
/// and the cluster / feature application rules that consume it.
///
/// The code point ranges quoted on the variants below are the principal
/// ones; the classifier functions are the authoritative source.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndicCategory {
    /// Independent consonant (base). Starts a new cluster unless the
    /// previous character is a halant. Per-script ranges:
    /// - Devanagari U+0915..U+0939, U+0958..U+095F, U+0978..U+097F.
    /// - Bengali U+0995..U+09B9 (with gaps), U+09CE, U+09DC..U+09DF
    ///   (with a gap), U+09F0..U+09F1.
    /// - Tamil U+0B95..U+0BB9 (with gaps).
    Consonant,
    /// Independent vowel — acts as a base for the cluster. Like a
    /// consonant it starts a new cluster unless preceded by a halant.
    /// Per-script ranges:
    /// - Devanagari U+0904..U+0914, U+0960..U+0961.
    /// - Bengali U+0985..U+0994 (with gaps), U+09E0..U+09E1.
    /// - Tamil U+0B85..U+0B94 (with gaps).
    Vowel,
    /// Halant / virama — suppresses the inherent vowel of the
    /// preceding consonant; when followed by another consonant it
    /// forms a conjunct (both stay in the same cluster).
    /// Per-script codepoints:
    /// - Devanagari U+094D.
    /// - Bengali U+09CD.
    /// - Tamil U+0BCD (often called "pulli" in Tamil contexts).
    Halant,
    /// Pre-base reordering matra — vowel sign that appears AFTER its
    /// base consonant in logical order but renders VISUALLY BEFORE it.
    /// The reorderer in [`reorder_cluster`] moves the first one found
    /// to the front of the cluster. Per-script codepoints:
    /// - Devanagari U+093F (sign "i").
    /// - Bengali U+09BF (sign "i"), U+09C7 (sign "e"), U+09C8
    ///   (sign "ai") — Bengali is unusual in having THREE.
    /// - Tamil U+0BC6 (sign "e"), U+0BC7 (sign "ee"), U+0BC8 (sign "ai")
    ///   — Tamil's e/ee/ai matras are pre-base rather than post-base.
    PreBaseMatra,
    /// Vowel sign / matra (other than pre-base). Stays in its logical
    /// position within the cluster.
    Matra,
    /// Nukta — combining dot-below; binds tightly to the
    /// preceding consonant (forms a "nukta'd" consonant). Per-script:
    /// - Devanagari U+093C.
    /// - Bengali U+09BC.
    /// - Tamil: no nukta in the modern orthography (no codepoint
    ///   classified `Nukta` for Tamil).
    Nukta,
    /// Anusvara / candrabindu / visarga — bindu marks that attach to
    /// the cluster end. Per-script:
    /// - Devanagari U+0900..U+0903; the Devanagari classifier also
    ///   files the stress / accent signs U+0951..U+0954 here.
    /// - Bengali U+0981..U+0983.
    /// - Tamil U+0B82, U+0B83.
    Bindu,
    /// Avagraha + danda + double-danda + the digit block + various
    /// miscellaneous symbols. Treated as cluster-breaking
    /// (each is its own cluster). Also the fallback for code points
    /// inside the Devanagari and Bengali blocks that the classifiers do
    /// not list explicitly.
    Symbol,
    /// Anything outside the script's main block. Treated as a cluster
    /// boundary — an Indic cluster never crosses the script boundary.
    /// The Tamil classifier additionally returns this for unlisted code
    /// points inside its block.
    Other,
}

/// Look up the Indic category for `ch` within the Devanagari block.
/// Codepoints outside U+0900..U+097F return [`IndicCategory::Other`].
///
/// The classification follows the Unicode `IndicSyllabicCategory.txt`
/// and `IndicPositionalCategory.txt` properties — but condensed to the
/// nine categories the cluster machine actually distinguishes.
///
/// The independent vowel letters U+0972..U+0977 (CANDRA A, OE, OOE, AW,
/// UE, UUE) are classified as [`IndicCategory::Vowel`] so that a
/// following bindu or matra stays in the same cluster as the vowel
/// instead of being split off behind a cluster-breaking symbol.
pub fn devanagari_category(ch: char) -> IndicCategory {
    let cp = ch as u32;
    // Fast-path: outside the Devanagari block entirely.
    if cp < 0x0900 || cp > 0x097F {
        return IndicCategory::Other;
    }
    match cp {
        // Bindu marks (inverted candrabindu / candrabindu / anusvara /
        // visarga).
        0x0900..=0x0903 => IndicCategory::Bindu,
        // Independent vowels (SHORT A through AU).
        0x0904..=0x0914 => IndicCategory::Vowel,
        // Consonants KA..HA.
        0x0915..=0x0939 => IndicCategory::Consonant,
        // Vowel signs OE / OOE.
        0x093A | 0x093B => IndicCategory::Matra,
        // Nukta — combining dot below.
        0x093C => IndicCategory::Nukta,
        // Avagraha (sign) — symbol; treated as cluster break.
        0x093D => IndicCategory::Symbol,
        // Post-base matra AA — the most common matra in running Hindi
        // text.
        0x093E => IndicCategory::Matra,
        // Pre-base matra "i" — the only matra this table marks for
        // reordering.
        0x093F => IndicCategory::PreBaseMatra,
        // The remaining post-base, above-base, and below-base matras.
        0x0940..=0x094C => IndicCategory::Matra,
        // Halant / virama.
        0x094D => IndicCategory::Halant,
        // Vowel signs PRISHTHAMATRA E / AW — kept as plain matras, as
        // before.
        0x094E..=0x094F => IndicCategory::Matra,
        // Stress signs (udatta / anudatta / grave / acute) — attach to
        // the cluster end like a bindu.
        0x0951..=0x0954 => IndicCategory::Bindu,
        // Additional vowel signs (CANDRA LONG E / UE / UUE).
        0x0955..=0x0957 => IndicCategory::Matra,
        // Additional nukta'd consonants (QA..YYA).
        0x0958..=0x095F => IndicCategory::Consonant,
        // Vocalic RR + LL (independent vowels).
        0x0960..=0x0961 => IndicCategory::Vowel,
        // Vowel signs vocalic L + LL.
        0x0962..=0x0963 => IndicCategory::Matra,
        // Danda + double-danda + digits U+0966..U+096F + abbreviation
        // sign U+0970 + high spacing dot U+0971. All cluster-breaking.
        0x0964..=0x0971 => IndicCategory::Symbol,
        // Independent vowel letters CANDRA A / OE / OOE / AW / UE / UUE.
        // These are bases, not symbols: marks that follow them must
        // stay in their cluster.
        0x0972..=0x0977 => IndicCategory::Vowel,
        // Extended consonants (Marwari DDA, ZHA, HEAVY YA, Sindhi
        // implosives, glottal stop).
        0x0978..=0x097F => IndicCategory::Consonant,
        // U+0950 (OM) is the only in-block code point not listed above.
        // Cluster-breaking Symbol is the conservative choice for it and
        // for anything a future edit might forget.
        _ => IndicCategory::Symbol,
    }
}

/// Classify `ch` for the Bengali cluster machine.
///
/// Characters outside the Bengali block (U+0980..U+09FF) are
/// [`IndicCategory::Other`]. Inside the block, every code point that is
/// not named by one of the arms below — unassigned slots included — is
/// the cluster-breaking [`IndicCategory::Symbol`].
///
/// Bengali specifics reflected in the table:
/// - The halant (hashanta) is U+09CD and the nukta is U+09BC.
/// - THREE vowel signs are pre-base: U+09BF "i", U+09C7 "e", U+09C8 "ai".
/// - The two-part vowel signs U+09CB "o" and U+09CC "au" are classified
///   as ordinary (non-reordering) matras. Their pre-base half is only
///   reordered when the text arrives decomposed (U+09C7 + U+09BE /
///   U+09D7); note that NFC *composes* those sequences, so callers that
///   want the reorder must supply decomposed (NFD-style) text.
pub fn bengali_category(ch: char) -> IndicCategory {
    match ch {
        // Candrabindu / anusvara / visarga.
        '\u{0981}'..='\u{0983}' => IndicCategory::Bindu,

        // Independent vowels A..AU (skipping the unassigned slots
        // U+098D, U+098E, U+0991, U+0992) plus vocalic RR / LL.
        '\u{0985}'..='\u{098C}'
        | '\u{098F}'..='\u{0990}'
        | '\u{0993}'..='\u{0994}'
        | '\u{09E0}'..='\u{09E1}' => IndicCategory::Vowel,

        // KA..HA (skipping unassigned U+09A9, U+09B1, U+09B3..U+09B5),
        // KHANDA TA, RRA / RHA, YYA, and the two Assamese RA letters.
        '\u{0995}'..='\u{09A8}'
        | '\u{09AA}'..='\u{09B0}'
        | '\u{09B2}'
        | '\u{09B6}'..='\u{09B9}'
        | '\u{09CE}'
        | '\u{09DC}'..='\u{09DD}'
        | '\u{09DF}'
        | '\u{09F0}'..='\u{09F1}' => IndicCategory::Consonant,

        // Combining dot below.
        '\u{09BC}' => IndicCategory::Nukta,

        // Hashanta.
        '\u{09CD}' => IndicCategory::Halant,

        // Vowel signs "i", "e", "ai" — rendered before the base.
        '\u{09BF}' | '\u{09C7}'..='\u{09C8}' => IndicCategory::PreBaseMatra,

        // Every other vowel sign: AA, II..vocalic RR, O / AU, the AU
        // length mark, and vocalic L / LL.
        '\u{09BE}'
        | '\u{09C0}'..='\u{09C4}'
        | '\u{09CB}'..='\u{09CC}'
        | '\u{09D7}'
        | '\u{09E2}'..='\u{09E3}' => IndicCategory::Matra,

        // Rest of the block: anji U+0980, avagraha U+09BD, digits
        // U+09E6..U+09EF, currency / fraction signs from U+09F2 on, and
        // all unassigned slots.
        '\u{0980}'..='\u{09FF}' => IndicCategory::Symbol,

        // Not a Bengali-block character at all.
        _ => IndicCategory::Other,
    }
}

/// Classify `ch` for the Tamil cluster machine.
///
/// Only code points that this table names explicitly inside the Tamil
/// block (U+0B80..U+0BFF) receive a real category. Every other
/// character — including the unlisted slots inside the block — is
/// [`IndicCategory::Other`].
///
/// Tamil specifics reflected in the table:
/// - No code point maps to [`IndicCategory::Nukta`].
/// - The pulli U+0BCD is the script's [`IndicCategory::Halant`].
/// - THREE vowel signs are pre-base: U+0BC6 "e", U+0BC7 "ee", U+0BC8 "ai".
/// - The two-part vowel signs U+0BCA / U+0BCB / U+0BCC are classified as
///   ordinary (non-reordering) matras. Their pre-base half is only
///   reordered when the text arrives decomposed (e.g. U+0BC6 + U+0BBE);
///   note that NFC *composes* those sequences, so callers that want the
///   reorder must supply decomposed (NFD-style) text.
pub fn tamil_category(ch: char) -> IndicCategory {
    match ch {
        // Anusvara U+0B82 and visarga (aytham) U+0B83.
        '\u{0B82}'..='\u{0B83}' => IndicCategory::Bindu,

        // Independent vowels A..AU, skipping the slots U+0B8B..U+0B8D
        // and U+0B91.
        '\u{0B85}'..='\u{0B8A}' | '\u{0B8E}'..='\u{0B90}' | '\u{0B92}'..='\u{0B94}' => {
            IndicCategory::Vowel
        }

        // Consonants KA..HA; the Tamil block leaves many slots of the
        // shared Indic layout empty.
        '\u{0B95}'
        | '\u{0B99}'..='\u{0B9A}'
        | '\u{0B9C}'
        | '\u{0B9E}'..='\u{0B9F}'
        | '\u{0BA3}'..='\u{0BA4}'
        | '\u{0BA8}'..='\u{0BAA}'
        | '\u{0BAE}'..='\u{0BB9}' => IndicCategory::Consonant,

        // Vowel signs E / EE / AI — rendered before the base.
        '\u{0BC6}'..='\u{0BC8}' => IndicCategory::PreBaseMatra,

        // Vowel signs AA, I / II / U / UU, the precomposed two-part
        // signs O / OO / AU, and the AU length mark.
        '\u{0BBE}'..='\u{0BC2}' | '\u{0BCA}'..='\u{0BCC}' | '\u{0BD7}' => IndicCategory::Matra,

        // Pulli / virama.
        '\u{0BCD}' => IndicCategory::Halant,

        // Digits (U+0BE6..U+0BEF) followed by the number / calendar /
        // currency signs through the end of the block.
        '\u{0BE6}'..='\u{0BFF}' => IndicCategory::Symbol,

        // Outside the block, or an unlisted slot inside it.
        _ => IndicCategory::Other,
    }
}

/// Per-cluster shaping flags computed by [`reorder_cluster_with`] (and
/// its Devanagari wrapper [`reorder_cluster`]).
///
/// The derived `Default` has both flags cleared, which is also what an
/// empty cluster produces.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ClusterFlags {
    /// True when the cluster started with `RA + halant + consonant`
    /// — the RA at position 0 should ultimately render as a "reph"
    /// superscript mark over the cluster end. Only set when the active
    /// rules have `reph_enabled`; [`TAMIL_RULES`] disables it, so Tamil
    /// never sets this. Devanagari + Bengali do.
    pub has_reph: bool,
    /// True when the cluster contained a pre-base matra at an index
    /// greater than zero and it was moved to the front of the cluster.
    pub pre_base_reordered: bool,
}

/// Split `chars` into orthographic clusters for the script described by
/// `category`.
///
/// Returns half-open `(start, end_exclusive)` pairs of **element
/// indices** into `chars` (not byte offsets). The pairs are emitted in
/// order and tile the slice without gaps; an empty slice yields an
/// empty `Vec`.
///
/// A new cluster begins at index `i` when any of the following holds:
/// - `chars[i]` is `Other` or `Symbol`;
/// - `chars[i - 1]` is `Other` or `Symbol` (so each such character ends
///   up as a single-character cluster);
/// - `chars[i]` is `Consonant` or `Vowel` and `chars[i - 1]` is NOT
///   `Halant` (a halant glues the following base into the same cluster,
///   forming a conjunct).
///
/// Every other character extends the current cluster.
///
/// Pass [`devanagari_category`], [`bengali_category`], or
/// [`tamil_category`] as `category` to select the script. The round-8
/// entry point [`cluster_boundaries`] hard-codes [`devanagari_category`].
pub fn cluster_boundaries_with(
    chars: &[char],
    category: fn(char) -> IndicCategory,
) -> Vec<(usize, usize)> {
    let mut clusters = Vec::new();
    if chars.is_empty() {
        return clusters;
    }

    // Categories that always stand alone as a one-character cluster.
    let stands_alone = |c: IndicCategory| matches!(c, IndicCategory::Other | IndicCategory::Symbol);

    let mut cluster_start = 0usize;
    // Each window is (previous char, current char); the current char
    // lives at `offset + 1` in `chars`.
    for (offset, pair) in chars.windows(2).enumerate() {
        let before = category(pair[0]);
        let here = category(pair[1]);

        let opens_new_base = matches!(here, IndicCategory::Consonant | IndicCategory::Vowel)
            && !matches!(before, IndicCategory::Halant);

        if stands_alone(here) || stands_alone(before) || opens_new_base {
            let split_at = offset + 1;
            clusters.push((cluster_start, split_at));
            cluster_start = split_at;
        }
    }

    // Whatever is left after the last split is the final cluster.
    clusters.push((cluster_start, chars.len()));
    clusters
}

/// Devanagari cluster segmenter.
///
/// Thin wrapper around [`cluster_boundaries_with`] that always passes
/// [`devanagari_category`] as the classifier — preserved for callers
/// built against the round-8 API. Returns the same
/// `(start, end_exclusive)` index pairs into `chars` that
/// [`cluster_boundaries_with`] produces.
pub fn cluster_boundaries(chars: &[char]) -> Vec<(usize, usize)> {
    cluster_boundaries_with(chars, devanagari_category)
}

/// Per-script parameters consumed by [`reorder_cluster_with`] when it
/// rewrites a single cluster from logical to visual order.
///
/// Ready-made values: [`DEVANAGARI_RULES`], [`BENGALI_RULES`],
/// [`TAMIL_RULES`].
#[derive(Debug, Clone, Copy)]
pub struct ReorderRules {
    /// Lookup function that classifies a single character.
    pub category: fn(char) -> IndicCategory,
    /// Codepoint of the script's RA letter — the only consonant that
    /// can form a reph. Devanagari U+0930, Bengali U+09B0, Tamil
    /// U+0BB0 (but Tamil sets `reph_enabled = false`, so the value is
    /// never consulted there).
    pub ra_codepoint: char,
    /// True when this script forms a reph (Devanagari, Bengali). False
    /// for scripts where RA + halant + consonant renders in-line
    /// (Tamil). When false, reph detection is skipped entirely.
    pub reph_enabled: bool,
}

/// Devanagari reorder rules: classify with [`devanagari_category`],
/// RA is U+0930, reph detection enabled. Used by [`reorder_cluster`].
pub const DEVANAGARI_RULES: ReorderRules = ReorderRules {
    category: devanagari_category,
    ra_codepoint: '\u{0930}',
    reph_enabled: true,
};

/// Bengali reorder rules: classify with [`bengali_category`], RA is
/// U+09B0, reph detection enabled.
pub const BENGALI_RULES: ReorderRules = ReorderRules {
    category: bengali_category,
    ra_codepoint: '\u{09B0}',
    reph_enabled: true,
};

/// Tamil reorder rules: classify with [`tamil_category`], RA is
/// U+0BB0. `reph_enabled = false` because Tamil RA does not form a
/// superscript reph, so [`reorder_cluster_with`] never sets
/// [`ClusterFlags::has_reph`] under these rules.
pub const TAMIL_RULES: ReorderRules = ReorderRules {
    category: tamil_category,
    ra_codepoint: '\u{0BB0}',
    reph_enabled: false,
};

/// Apply Indic cluster reordering to a single cluster using `rules`.
///
/// Returns the reordered character slice plus [`ClusterFlags`]
/// describing what was done.
///
/// Scope:
/// - **Pre-base matra** — if the cluster contains a pre-base matra
///   (any codepoint classified [`IndicCategory::PreBaseMatra`]
///   anywhere after the first consonant), move it to position 0.
///   Bengali clusters can have multiple pre-base matras in pathological
///   input; only the FIRST is moved (the others stay in place — the
///   cluster machine is tolerant rather than authoritative).
/// - **Reph detection** — if `rules.reph_enabled` is true AND the
///   cluster begins with `RA + halant + consonant`, set
///   [`ClusterFlags::has_reph`]. The actual glyph substitution is
///   wired in [`crate::face_chain`] via `Font::gsub_apply_lookup_type_1`
///   using the `rphf` feature.
pub fn reorder_cluster_with(cluster: &[char], rules: &ReorderRules) -> (Vec<char>, ClusterFlags) {
    let mut flags = ClusterFlags::default();
    if cluster.is_empty() {
        return (Vec::new(), flags);
    }
    let mut out: Vec<char> = cluster.to_vec();

    // Pre-base matra reorder. Find the FIRST pre-base matra and move
    // it to position 0.
    if let Some(matra_idx) = out
        .iter()
        .position(|&c| (rules.category)(c) == IndicCategory::PreBaseMatra)
    {
        if matra_idx > 0 {
            let matra = out.remove(matra_idx);
            out.insert(0, matra);
            flags.pre_base_reordered = true;
        }
    }

    // Reph detection. Use the original `cluster` (not `out`) so a
    // pre-base matra moved to the front doesn't mask the leading RA.
    if rules.reph_enabled
        && cluster.len() >= 3
        && cluster[0] == rules.ra_codepoint
        && (rules.category)(cluster[1]) == IndicCategory::Halant
        && (rules.category)(cluster[2]) == IndicCategory::Consonant
    {
        flags.has_reph = true;
    }

    (out, flags)
}

/// Devanagari cluster reorder.
///
/// Thin wrapper around [`reorder_cluster_with`] that always passes
/// [`DEVANAGARI_RULES`] — preserved for callers built against the
/// round-8 API. Returns the reordered characters and the
/// [`ClusterFlags`] exactly as [`reorder_cluster_with`] reports them.
pub fn reorder_cluster(cluster: &[char]) -> (Vec<char>, ClusterFlags) {
    reorder_cluster_with(cluster, &DEVANAGARI_RULES)
}

/// Devanagari OpenType GSUB feature tags, in application order.
///
/// The returned list has 15 entries in two groups: nine "substitution"
/// features (`locl`..`cjct`) that reshape clusters into conjuncts and
/// half-forms, followed by six "presentation" features
/// (`init`..`haln`) that pick contextual variants. A fresh `Vec` is
/// allocated on every call.
pub fn devanagari_feature_tags() -> Vec<[u8; 4]> {
    // Stage 1 — substitution features.
    const SUBSTITUTION: [[u8; 4]; 9] = [
        *b"locl", // localized forms
        *b"ccmp", // glyph composition / decomposition
        *b"nukt", // nukta forms
        *b"akhn", // akhand ligatures (e.g. ksha, jnya)
        *b"rphf", // reph form (RA + halant → superscript)
        *b"blwf", // below-base forms
        *b"half", // half forms
        *b"vatu", // vattu variants
        *b"cjct", // conjunct forms
    ];
    // Stage 2 — presentation features.
    const PRESENTATION: [[u8; 4]; 6] = [
        *b"init", // initial forms
        *b"pres", // pre-base substitutions
        *b"abvs", // above-base substitutions
        *b"blws", // below-base substitutions
        *b"psts", // post-base substitutions
        *b"haln", // halant forms
    ];

    SUBSTITUTION
        .iter()
        .chain(PRESENTATION.iter())
        .copied()
        .collect()
}

/// Bengali OpenType GSUB feature tags, in application order.
///
/// Returns exactly the list produced by [`devanagari_feature_tags`] —
/// this crate applies the same substitution / presentation pipeline to
/// both scripts.
///
/// NOTE(review): the OpenType Bengali shaping description may list
/// additional features (e.g. `pstf`) that the Devanagari list here does
/// not contain — confirm against the spec before relying on this being
/// complete.
pub fn bengali_feature_tags() -> Vec<[u8; 4]> {
    // Deliberately delegates: Bengali reuses the Devanagari ordering
    // one-to-one in this implementation.
    devanagari_feature_tags()
}

/// Tamil OpenType GSUB feature tags, in application order.
///
/// Compared with [`devanagari_feature_tags`] this 13-entry list drops
/// `nukt`, `rphf`, `vatu` and `cjct` (Tamil has no nukta, reph, vattu
/// or conjunct forms in this shaper) and adds the `pref` / `pstf`
/// positional-form features. A fresh `Vec` is allocated on every call.
pub fn tamil_feature_tags() -> Vec<[u8; 4]> {
    const ORDER: [[u8; 4]; 13] = [
        // Substitution stage.
        *b"locl", // localized forms
        *b"ccmp", // glyph composition / decomposition
        *b"akhn", // akhand ligatures (rare in Tamil but present in some fonts)
        *b"half", // half forms
        *b"pref", // pre-base forms
        *b"blwf", // below-base forms
        *b"pstf", // post-base forms
        // Presentation stage.
        *b"init", // initial forms
        *b"pres", // pre-base substitutions
        *b"abvs", // above-base substitutions
        *b"blws", // below-base substitutions
        *b"psts", // post-base substitutions
        *b"haln", // halant forms
    ];
    ORDER.to_vec()
}

/// OpenType script tags for the Indic scripts this module shapes.
///
/// Returns `Some((modern_tag, legacy_tag))` for Devanagari, Bengali and
/// Tamil, and `None` for every other script:
///
/// | script     | modern (v2) | legacy (v1) |
/// |------------|-------------|-------------|
/// | Devanagari | `dev2`      | `deva`      |
/// | Bengali    | `bng2`      | `beng`      |
/// | Tamil      | `tml2`      | `taml`      |
///
/// The first element is the version-2 Indic script tag and the second
/// is the original tag, so callers can try the modern tag first and
/// fall back to the legacy one.
pub fn script_indic_tags(script: super::arabic::Script) -> Option<([u8; 4], [u8; 4])> {
    let pair = match script {
        super::arabic::Script::Devanagari => (*b"dev2", *b"deva"),
        super::arabic::Script::Bengali => (*b"bng2", *b"beng"),
        super::arabic::Script::Tamil => (*b"tml2", *b"taml"),
        // Not an Indic script handled by this module.
        _ => return None,
    };
    Some(pair)
}

#[cfg(test)]
#[allow(non_snake_case)] // test names embed codepoint labels such as `U_0915` and `RA`
mod tests {
    // Unit tests for the Indic category tables, cluster segmentation,
    // cluster reordering, and the feature / script tag helpers.
    // Cluster bounds are `(start, end)` pairs used as `chars[start..end]`.
    use super::*;

    // ---------- Devanagari (round 8 baseline tests) ----------

    #[test]
    fn devanagari_category_lookup_returns_consonant_for_ka_U_0915() {
        // U+0915 DEVANAGARI LETTER KA.
        assert_eq!(devanagari_category('\u{0915}'), IndicCategory::Consonant);
    }

    #[test]
    fn devanagari_category_lookup_returns_halant_for_U_094D() {
        // U+094D DEVANAGARI SIGN VIRAMA.
        assert_eq!(devanagari_category('\u{094D}'), IndicCategory::Halant);
    }

    #[test]
    fn devanagari_category_lookup_returns_pre_base_matra_for_U_093F() {
        // U+093F DEVANAGARI VOWEL SIGN I.
        assert_eq!(devanagari_category('\u{093F}'), IndicCategory::PreBaseMatra);
    }

    #[test]
    fn devanagari_category_classifies_vowel_a_as_vowel() {
        // U+0905 DEVANAGARI LETTER A (independent vowel).
        assert_eq!(devanagari_category('\u{0905}'), IndicCategory::Vowel);
    }

    #[test]
    fn devanagari_category_classifies_anusvara_as_bindu() {
        // U+0902 DEVANAGARI SIGN ANUSVARA.
        assert_eq!(devanagari_category('\u{0902}'), IndicCategory::Bindu);
    }

    #[test]
    fn devanagari_category_classifies_nukta_as_nukta() {
        // U+093C DEVANAGARI SIGN NUKTA.
        assert_eq!(devanagari_category('\u{093C}'), IndicCategory::Nukta);
    }

    #[test]
    fn devanagari_category_classifies_post_base_matra_aa_as_matra() {
        // U+093E DEVANAGARI VOWEL SIGN AA.
        assert_eq!(devanagari_category('\u{093E}'), IndicCategory::Matra);
    }

    #[test]
    fn devanagari_category_classifies_danda_as_symbol() {
        // U+0964 DEVANAGARI DANDA.
        assert_eq!(devanagari_category('\u{0964}'), IndicCategory::Symbol);
    }

    #[test]
    fn devanagari_category_returns_other_for_latin_a() {
        // Anything outside the Devanagari table maps to `Other`.
        assert_eq!(devanagari_category('A'), IndicCategory::Other);
    }

    #[test]
    fn script_of_recognises_devanagari_block() {
        use super::super::arabic::{script_of, Script};
        // A consonant, the virama, and U+097F (last codepoint of the block).
        assert_eq!(script_of('\u{0915}'), Script::Devanagari);
        assert_eq!(script_of('\u{094D}'), Script::Devanagari);
        assert_eq!(script_of('\u{097F}'), Script::Devanagari);
    }

    #[test]
    fn script_of_still_classifies_arabic_and_latin_correctly() {
        use super::super::arabic::{script_of, Script};
        // Regression guard: U+0627 ARABIC LETTER ALEF and Latin 'A' keep
        // their classifications.
        assert_eq!(script_of('\u{0627}'), Script::Arabic);
        assert_eq!(script_of('A'), Script::Other);
    }

    #[test]
    fn pre_base_matra_reorders_before_base_consonant() {
        // KA + vowel sign I → vowel sign I + KA.
        let cluster = ['\u{0915}', '\u{093F}'];
        let (out, flags) = reorder_cluster(&cluster);
        assert_eq!(out, vec!['\u{093F}', '\u{0915}']);
        assert!(flags.pre_base_reordered);
        assert!(!flags.has_reph);
    }

    #[test]
    fn pre_base_matra_reorders_in_conjunct_cluster() {
        // KA + virama + SSA + vowel sign I: the matra moves to the very
        // front, ahead of the whole conjunct.
        let cluster = ['\u{0915}', '\u{094D}', '\u{0937}', '\u{093F}'];
        let (out, flags) = reorder_cluster(&cluster);
        assert_eq!(out, vec!['\u{093F}', '\u{0915}', '\u{094D}', '\u{0937}']);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn reph_formation_at_cluster_start_marks_RA_for_superscript() {
        // RA (U+0930) + virama + KA: codepoint order is unchanged, only
        // the `has_reph` flag is raised.
        let cluster = ['\u{0930}', '\u{094D}', '\u{0915}'];
        let (out, flags) = reorder_cluster(&cluster);
        assert_eq!(out, vec!['\u{0930}', '\u{094D}', '\u{0915}']);
        assert!(flags.has_reph);
        assert!(!flags.pre_base_reordered);
    }

    #[test]
    fn reph_with_pre_base_matra_combines_both_flags() {
        // RA + virama + KA + vowel sign I: the matra is moved in front of
        // the RA + virama pair and both flags are set.
        let cluster = ['\u{0930}', '\u{094D}', '\u{0915}', '\u{093F}'];
        let (out, flags) = reorder_cluster(&cluster);
        assert_eq!(out, vec!['\u{093F}', '\u{0930}', '\u{094D}', '\u{0915}']);
        assert!(flags.has_reph);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn cluster_without_reph_consonant_does_not_set_flag() {
        // KA + virama + SSA: the cluster does not start with RA.
        let cluster = ['\u{0915}', '\u{094D}', '\u{0937}'];
        let (_out, flags) = reorder_cluster(&cluster);
        assert!(!flags.has_reph);
    }

    #[test]
    fn cluster_boundary_starts_new_cluster_at_consonant_after_vowel() {
        // KA + vowel sign AA | KA.
        let chars = ['\u{0915}', '\u{093E}', '\u{0915}'];
        let bounds = cluster_boundaries(&chars);
        assert_eq!(bounds, vec![(0, 2), (2, 3)]);
    }

    #[test]
    fn cluster_boundary_keeps_conjunct_in_one_cluster() {
        // KA + virama + SSA stays together.
        let chars = ['\u{0915}', '\u{094D}', '\u{0937}'];
        let bounds = cluster_boundaries(&chars);
        assert_eq!(bounds, vec![(0, 3)]);
    }

    #[test]
    fn cluster_boundary_breaks_at_danda_symbol() {
        // KA | danda | KA — the danda forms a cluster of its own.
        let chars = ['\u{0915}', '\u{0964}', '\u{0915}'];
        let bounds = cluster_boundaries(&chars);
        assert_eq!(bounds, vec![(0, 1), (1, 2), (2, 3)]);
    }

    #[test]
    fn cluster_boundary_breaks_at_non_indic_codepoint() {
        // KA | space | KA — the space forms a cluster of its own.
        let chars = ['\u{0915}', ' ', '\u{0915}'];
        let bounds = cluster_boundaries(&chars);
        assert_eq!(bounds, vec![(0, 1), (1, 2), (2, 3)]);
    }

    #[test]
    fn cluster_boundary_handles_empty_input() {
        let bounds = cluster_boundaries(&[]);
        assert!(bounds.is_empty());
    }

    #[test]
    fn cluster_boundary_single_consonant_is_one_cluster() {
        let chars = ['\u{0915}'];
        let bounds = cluster_boundaries(&chars);
        assert_eq!(bounds, vec![(0, 1)]);
    }

    #[test]
    fn devanagari_feature_tags_are_in_canonical_order() {
        // Pins the first five tags and the final tag of the list.
        let tags = devanagari_feature_tags();
        assert_eq!(&tags[0], b"locl");
        assert_eq!(&tags[1], b"ccmp");
        assert_eq!(&tags[2], b"nukt");
        assert_eq!(&tags[3], b"akhn");
        assert_eq!(&tags[4], b"rphf");
        assert_eq!(tags.last(), Some(b"haln"));
    }

    #[test]
    fn empty_cluster_reorder_returns_empty() {
        // Empty input yields empty output and default (all-clear) flags.
        let (out, flags) = reorder_cluster(&[]);
        assert!(out.is_empty());
        assert_eq!(flags, ClusterFlags::default());
    }

    #[test]
    fn single_consonant_cluster_does_not_reorder() {
        let cluster = ['\u{0915}'];
        let (out, flags) = reorder_cluster(&cluster);
        assert_eq!(out, vec!['\u{0915}']);
        assert!(!flags.pre_base_reordered);
        assert!(!flags.has_reph);
    }

    #[test]
    fn two_clusters_with_pre_base_matras_each_reorder_independently() {
        // (KA + vowel sign I) twice: segmentation first, then each
        // cluster slice is reordered on its own.
        let chars = ['\u{0915}', '\u{093F}', '\u{0915}', '\u{093F}'];
        let bounds = cluster_boundaries(&chars);
        assert_eq!(bounds, vec![(0, 2), (2, 4)]);
        for (s, e) in bounds {
            let (out, flags) = reorder_cluster(&chars[s..e]);
            assert_eq!(out, vec!['\u{093F}', '\u{0915}']);
            assert!(flags.pre_base_reordered);
        }
    }

    // ---------- Bengali (round 10) ----------

    #[test]
    fn bengali_category_classifies_ka_as_consonant() {
        // U+0995 BENGALI LETTER KA.
        assert_eq!(bengali_category('\u{0995}'), IndicCategory::Consonant);
    }

    #[test]
    fn bengali_category_classifies_ra_as_consonant() {
        // U+09B0 BENGALI LETTER RA.
        assert_eq!(bengali_category('\u{09B0}'), IndicCategory::Consonant);
    }

    #[test]
    fn bengali_category_classifies_halant_as_halant() {
        // U+09CD BENGALI SIGN VIRAMA (hashanta).
        assert_eq!(bengali_category('\u{09CD}'), IndicCategory::Halant);
    }

    #[test]
    fn bengali_category_classifies_nukta_as_nukta() {
        // U+09BC BENGALI SIGN NUKTA.
        assert_eq!(bengali_category('\u{09BC}'), IndicCategory::Nukta);
    }

    #[test]
    fn bengali_category_pre_base_matras_i_e_ai() {
        // U+09BF, U+09C7, U+09C8 — ALL pre-base in Bengali.
        assert_eq!(bengali_category('\u{09BF}'), IndicCategory::PreBaseMatra);
        assert_eq!(bengali_category('\u{09C7}'), IndicCategory::PreBaseMatra);
        assert_eq!(bengali_category('\u{09C8}'), IndicCategory::PreBaseMatra);
    }

    #[test]
    fn bengali_category_classifies_aa_matra_as_matra() {
        // U+09BE BENGALI VOWEL SIGN AA — post-base matra.
        assert_eq!(bengali_category('\u{09BE}'), IndicCategory::Matra);
    }

    #[test]
    fn bengali_category_classifies_anusvara_as_bindu() {
        // U+0982 BENGALI SIGN ANUSVARA.
        assert_eq!(bengali_category('\u{0982}'), IndicCategory::Bindu);
    }

    #[test]
    fn bengali_category_classifies_independent_vowel_a_as_vowel() {
        // U+0985 BENGALI LETTER A.
        assert_eq!(bengali_category('\u{0985}'), IndicCategory::Vowel);
    }

    #[test]
    fn bengali_category_returns_other_for_devanagari_codepoint() {
        // Devanagari is OUT of the Bengali block.
        assert_eq!(bengali_category('\u{0915}'), IndicCategory::Other);
    }

    #[test]
    fn bengali_pre_base_matra_i_reorders_before_base() {
        // BENGALI KA + sign-i → sign-i + KA.
        let cluster = ['\u{0995}', '\u{09BF}'];
        let (out, flags) = reorder_cluster_with(&cluster, &BENGALI_RULES);
        assert_eq!(out, vec!['\u{09BF}', '\u{0995}']);
        assert!(flags.pre_base_reordered);
        assert!(!flags.has_reph);
    }

    #[test]
    fn bengali_pre_base_matra_e_reorders_before_base() {
        // BENGALI KA + sign-e → sign-e + KA.
        let cluster = ['\u{0995}', '\u{09C7}'];
        let (out, flags) = reorder_cluster_with(&cluster, &BENGALI_RULES);
        assert_eq!(out, vec!['\u{09C7}', '\u{0995}']);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn bengali_pre_base_matra_ai_reorders_before_base() {
        // BENGALI KA + sign-ai → sign-ai + KA.
        let cluster = ['\u{0995}', '\u{09C8}'];
        let (out, flags) = reorder_cluster_with(&cluster, &BENGALI_RULES);
        assert_eq!(out, vec!['\u{09C8}', '\u{0995}']);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn bengali_reph_formation_marks_RA_for_superscript() {
        // BENGALI RA + halant + KA: codepoint order is unchanged, the
        // `has_reph` flag is raised.
        let cluster = ['\u{09B0}', '\u{09CD}', '\u{0995}'];
        let (out, flags) = reorder_cluster_with(&cluster, &BENGALI_RULES);
        assert_eq!(out, vec!['\u{09B0}', '\u{09CD}', '\u{0995}']);
        assert!(flags.has_reph);
    }

    #[test]
    fn bengali_conjunct_keeps_in_one_cluster() {
        // BENGALI KA + halant + SSA (U+09B7) → conjunct (single cluster).
        let chars = ['\u{0995}', '\u{09CD}', '\u{09B7}'];
        let bounds = cluster_boundaries_with(&chars, bengali_category);
        assert_eq!(bounds, vec![(0, 3)]);
    }

    // ---------- Tamil (round 10) ----------

    #[test]
    fn tamil_category_classifies_ka_as_consonant() {
        // U+0B95 TAMIL LETTER KA.
        assert_eq!(tamil_category('\u{0B95}'), IndicCategory::Consonant);
    }

    #[test]
    fn tamil_category_classifies_ra_as_consonant() {
        // U+0BB0 TAMIL LETTER RA.
        assert_eq!(tamil_category('\u{0BB0}'), IndicCategory::Consonant);
    }

    #[test]
    fn tamil_category_classifies_pulli_as_halant() {
        // U+0BCD TAMIL SIGN VIRAMA (pulli).
        assert_eq!(tamil_category('\u{0BCD}'), IndicCategory::Halant);
    }

    #[test]
    fn tamil_category_pre_base_matras_e_ee_ai() {
        // U+0BC6 (e), U+0BC7 (ee), U+0BC8 (ai) — pre-base.
        assert_eq!(tamil_category('\u{0BC6}'), IndicCategory::PreBaseMatra);
        assert_eq!(tamil_category('\u{0BC7}'), IndicCategory::PreBaseMatra);
        assert_eq!(tamil_category('\u{0BC8}'), IndicCategory::PreBaseMatra);
    }

    #[test]
    fn tamil_category_classifies_aa_matra_as_matra() {
        // U+0BBE TAMIL VOWEL SIGN AA — post-base.
        assert_eq!(tamil_category('\u{0BBE}'), IndicCategory::Matra);
    }

    #[test]
    fn tamil_category_classifies_anusvara_as_bindu() {
        // U+0B82 TAMIL SIGN ANUSVARA.
        assert_eq!(tamil_category('\u{0B82}'), IndicCategory::Bindu);
    }

    #[test]
    fn tamil_category_classifies_independent_vowel_a_as_vowel() {
        // U+0B85 TAMIL LETTER A.
        assert_eq!(tamil_category('\u{0B85}'), IndicCategory::Vowel);
    }

    #[test]
    fn tamil_category_returns_other_for_devanagari_codepoint() {
        // U+0915 DEVANAGARI LETTER KA is outside the Tamil table.
        assert_eq!(tamil_category('\u{0915}'), IndicCategory::Other);
    }

    #[test]
    fn tamil_pre_base_matra_e_reorders_before_base() {
        // TAMIL KA + sign-e → sign-e + KA.
        let cluster = ['\u{0B95}', '\u{0BC6}'];
        let (out, flags) = reorder_cluster_with(&cluster, &TAMIL_RULES);
        assert_eq!(out, vec!['\u{0BC6}', '\u{0B95}']);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn tamil_RA_plus_halant_does_NOT_set_reph_flag() {
        // Tamil RA + pulli + KA — Tamil never forms a reph.
        let cluster = ['\u{0BB0}', '\u{0BCD}', '\u{0B95}'];
        let (_out, flags) = reorder_cluster_with(&cluster, &TAMIL_RULES);
        assert!(!flags.has_reph, "Tamil reph_enabled is false");
    }

    #[test]
    fn tamil_cluster_boundary_keeps_pulli_chain_in_one_cluster() {
        // KA + pulli + KA → conjunct-like cluster.
        let chars = ['\u{0B95}', '\u{0BCD}', '\u{0B95}'];
        let bounds = cluster_boundaries_with(&chars, tamil_category);
        assert_eq!(bounds, vec![(0, 3)]);
    }

    #[test]
    fn tamil_feature_tags_omit_rphf_and_cjct() {
        // The three tags the Tamil list leaves out must be absent.
        let tags = tamil_feature_tags();
        assert!(!tags.contains(b"rphf"), "Tamil has no reph feature");
        assert!(!tags.contains(b"cjct"), "Tamil has no conjunct feature");
        assert!(!tags.contains(b"vatu"), "Tamil has no vattu feature");
        // `pref` must be present in the Tamil list.
        assert!(tags.contains(b"pref"), "Tamil emits the pref feature");
    }

    #[test]
    fn bengali_feature_tags_match_devanagari_shape() {
        // Bengali delegates to the Devanagari list, so both must be equal.
        assert_eq!(bengali_feature_tags(), devanagari_feature_tags());
    }

    #[test]
    fn script_indic_tags_returns_modern_and_legacy_pair_for_devanagari() {
        use super::super::arabic::Script;
        // Pair order is (modern, legacy).
        let pair = script_indic_tags(Script::Devanagari);
        assert_eq!(pair, Some((*b"dev2", *b"deva")));
    }

    #[test]
    fn script_indic_tags_returns_pair_for_bengali_and_tamil() {
        use super::super::arabic::Script;
        assert_eq!(
            script_indic_tags(Script::Bengali),
            Some((*b"bng2", *b"beng"))
        );
        assert_eq!(script_indic_tags(Script::Tamil), Some((*b"tml2", *b"taml")));
    }

    #[test]
    fn script_indic_tags_returns_none_for_arabic_or_other() {
        use super::super::arabic::Script;
        // Scripts without an Indic tag pair yield `None`.
        assert_eq!(script_indic_tags(Script::Arabic), None);
        assert_eq!(script_indic_tags(Script::Other), None);
    }
}