#![allow(clippy::manual_range_contains)]
/// Shaping class assigned to a single code point by the per-script category
/// functions in this file.
///
/// The variant order is part of the type's shape and is kept as-is.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndicCategory {
    /// Consonant letter.
    Consonant,
    /// Independent vowel letter.
    Vowel,
    /// Virama; a consonant or vowel directly after it stays in the same
    /// cluster.
    Halant,
    /// Dependent vowel sign that cluster reordering moves to the front.
    PreBaseMatra,
    /// Any other dependent vowel sign.
    Matra,
    /// Nukta sign.
    Nukta,
    /// Anusvara and similar marks.
    Bindu,
    /// Digits, punctuation and other in-block signs; always segmented on
    /// their own.
    Symbol,
    /// Code point outside the script's block (the Tamil table also uses this
    /// for in-block code points it has no entry for).
    Other,
}
/// Classifies a code point of the Devanagari block (U+0900..=U+097F).
///
/// Every code point of the block has an explicit arm; anything outside the
/// block is reported as [`IndicCategory::Other`].
///
/// Two entries follow the Unicode Indic property data rather than the range
/// they happen to sit in:
/// - U+094E (VOWEL SIGN PRISHTHAMATRA E) is a left-side vowel sign, so it is
///   a [`IndicCategory::PreBaseMatra`] just like U+093F.
/// - U+0972..=U+0977 are independent vowel letters, so they are
///   [`IndicCategory::Vowel`] rather than symbols.
pub fn devanagari_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0900..=0x0903 => IndicCategory::Bindu,
        0x0904..=0x0914 => IndicCategory::Vowel,
        0x0915..=0x0939 => IndicCategory::Consonant,
        0x093A | 0x093B => IndicCategory::Matra,
        0x093C => IndicCategory::Nukta,
        0x093D => IndicCategory::Symbol,
        0x093E => IndicCategory::Matra,
        0x093F => IndicCategory::PreBaseMatra,
        0x0940..=0x094C => IndicCategory::Matra,
        0x094D => IndicCategory::Halant,
        // Prishthamatra E is written to the left of its base, so it has to be
        // reordered the same way as the short I sign.
        0x094E => IndicCategory::PreBaseMatra,
        0x094F => IndicCategory::Matra,
        // U+0950 (OM) is a standalone sign.
        0x0950 => IndicCategory::Symbol,
        0x0951..=0x0954 => IndicCategory::Bindu,
        0x0955..=0x0957 => IndicCategory::Matra,
        0x0958..=0x095F => IndicCategory::Consonant,
        0x0960 | 0x0961 => IndicCategory::Vowel,
        0x0962 | 0x0963 => IndicCategory::Matra,
        // Dandas, digits, abbreviation sign and high spacing dot.
        0x0964..=0x0971 => IndicCategory::Symbol,
        // Independent vowels added after the digits; as vowels they can carry
        // following marks inside one cluster.
        0x0972..=0x0977 => IndicCategory::Vowel,
        0x0978..=0x097F => IndicCategory::Consonant,
        // Not a Devanagari code point.
        _ => IndicCategory::Other,
    }
}
/// Classifies a code point of the Bengali block (U+0980..=U+09FF).
///
/// Code points outside the block yield [`IndicCategory::Other`]; code points
/// inside the block that have no dedicated entry yield
/// [`IndicCategory::Symbol`].
pub fn bengali_category(ch: char) -> IndicCategory {
    let code = u32::from(ch);
    if !(0x0980..=0x09FF).contains(&code) {
        return IndicCategory::Other;
    }

    match code {
        0x0981..=0x0983 => IndicCategory::Bindu,

        0x0985..=0x098C | 0x098F..=0x0990 | 0x0993..=0x0994 | 0x09E0..=0x09E1 => {
            IndicCategory::Vowel
        }

        0x0995..=0x09A8
        | 0x09AA..=0x09B0
        | 0x09B2
        | 0x09B6..=0x09B9
        | 0x09CE
        | 0x09DC..=0x09DD
        | 0x09DF
        | 0x09F0..=0x09F1 => IndicCategory::Consonant,

        0x09BC => IndicCategory::Nukta,
        0x09CD => IndicCategory::Halant,

        0x09BF | 0x09C7..=0x09C8 => IndicCategory::PreBaseMatra,

        0x09BE | 0x09C0..=0x09C4 | 0x09CB..=0x09CC | 0x09D7 | 0x09E2..=0x09E3 => {
            IndicCategory::Matra
        }

        // U+0980, U+09BD, the digits, U+09F2..=U+09FF and every gap between
        // the ranges above.
        _ => IndicCategory::Symbol,
    }
}
/// Classifies a code point of the Tamil block (U+0B80..=U+0BFF).
///
/// Anything without an entry below — whether it lies outside the block or in
/// a gap inside it — yields [`IndicCategory::Other`].
pub fn tamil_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0B82..=0x0B83 => IndicCategory::Bindu,

        0x0B85..=0x0B8A | 0x0B8E..=0x0B90 | 0x0B92..=0x0B94 => IndicCategory::Vowel,

        0x0B95
        | 0x0B99..=0x0B9A
        | 0x0B9C
        | 0x0B9E..=0x0B9F
        | 0x0BA3..=0x0BA4
        | 0x0BA8..=0x0BAA
        | 0x0BAE..=0x0BB9 => IndicCategory::Consonant,

        0x0BC6..=0x0BC8 => IndicCategory::PreBaseMatra,

        0x0BBE..=0x0BC2 | 0x0BCA..=0x0BCC | 0x0BD7 => IndicCategory::Matra,

        0x0BCD => IndicCategory::Halant,

        // Digits and the signs that follow them, up to the end of the block.
        0x0BE6..=0x0BFF => IndicCategory::Symbol,

        _ => IndicCategory::Other,
    }
}
/// Facts about a cluster reported alongside its reordered characters.
///
/// Both flags start out `false` (the `Default` value).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ClusterFlags {
    /// The cluster opens with the script's RA, a halant and a consonant, and
    /// the script's rules have reph enabled.
    pub has_reph: bool,
    /// A pre-base vowel sign was moved to the front of the cluster.
    pub pre_base_reordered: bool,
}
/// Splits `chars` into consecutive clusters using `category` to classify
/// each character.
///
/// Returns half-open `(start, end)` index pairs that cover the whole input
/// in order; an empty input produces an empty vector.
///
/// A new cluster begins at index `i` when any of these holds:
/// - `chars[i]` or `chars[i - 1]` is `Other` or `Symbol`, or
/// - `chars[i]` is a `Consonant` or `Vowel` and `chars[i - 1]` is not a
///   `Halant`.
pub fn cluster_boundaries_with(
    chars: &[char],
    category: fn(char) -> IndicCategory,
) -> Vec<(usize, usize)> {
    // Categories that never share a cluster with a neighbour.
    fn stands_alone(cat: IndicCategory) -> bool {
        matches!(cat, IndicCategory::Other | IndicCategory::Symbol)
    }

    let mut spans = Vec::new();
    let mut open = 0usize;

    for (offset, pair) in chars.windows(2).enumerate() {
        let left = category(pair[0]);
        let right = category(pair[1]);

        let opens_syllable = matches!(right, IndicCategory::Consonant | IndicCategory::Vowel)
            && left != IndicCategory::Halant;

        if stands_alone(right) || stands_alone(left) || opens_syllable {
            // `pair[1]` sits at index `offset + 1` of the input.
            let cut = offset + 1;
            spans.push((open, cut));
            open = cut;
        }
    }

    // Close the trailing cluster; there is none for an empty input.
    if !chars.is_empty() {
        spans.push((open, chars.len()));
    }
    spans
}
pub fn cluster_boundaries(chars: &[char]) -> Vec<(usize, usize)> {
cluster_boundaries_with(chars, devanagari_category)
}
/// Per-script parameters consumed by [`reorder_cluster_with`].
#[derive(Debug, Clone, Copy)]
pub struct ReorderRules {
    /// Classifier used for every character of the cluster.
    pub category: fn(char) -> IndicCategory,
    /// The character that must open a cluster for a reph to be reported.
    pub ra_codepoint: char,
    /// When `false`, `has_reph` is never set for this script.
    pub reph_enabled: bool,
}
/// Reordering rules for Devanagari: RA is U+0930 and reph detection is on.
pub const DEVANAGARI_RULES: ReorderRules = ReorderRules {
    reph_enabled: true,
    ra_codepoint: '\u{0930}',
    category: devanagari_category,
};
/// Reordering rules for Bengali: RA is U+09B0 and reph detection is on.
pub const BENGALI_RULES: ReorderRules = ReorderRules {
    reph_enabled: true,
    ra_codepoint: '\u{09B0}',
    category: bengali_category,
};
/// Reordering rules for Tamil: RA is U+0BB0 and reph detection is off.
pub const TAMIL_RULES: ReorderRules = ReorderRules {
    reph_enabled: false,
    ra_codepoint: '\u{0BB0}',
    category: tamil_category,
};
/// Reorders one cluster according to `rules` and reports what was found.
///
/// Returns the reordered characters together with the [`ClusterFlags`]:
/// - The first `PreBaseMatra` in the cluster, if it is not already at the
///   front, is moved to index 0 and the characters before it shift one place
///   to the right; `pre_base_reordered` is set.
/// - `has_reph` is set when `rules.reph_enabled` is true and the *input*
///   cluster starts with `rules.ra_codepoint`, then a `Halant`, then a
///   `Consonant`. The reph characters themselves are left in place.
///
/// An empty cluster yields an empty vector and default flags.
pub fn reorder_cluster_with(cluster: &[char], rules: &ReorderRules) -> (Vec<char>, ClusterFlags) {
    let classify = rules.category;
    let mut reordered = cluster.to_vec();
    let mut flags = ClusterFlags::default();

    let first_pre_base = cluster
        .iter()
        .position(|&c| classify(c) == IndicCategory::PreBaseMatra);
    match first_pre_base {
        Some(idx) if idx > 0 => {
            // Rotating the prefix by one puts the matra first and keeps the
            // relative order of everything that preceded it.
            reordered[..=idx].rotate_right(1);
            flags.pre_base_reordered = true;
        }
        _ => {}
    }

    // Reph detection looks at the cluster as it was passed in, not at the
    // reordered copy.
    flags.has_reph = rules.reph_enabled
        && match cluster {
            [ra, mark, base, ..] => {
                *ra == rules.ra_codepoint
                    && classify(*mark) == IndicCategory::Halant
                    && classify(*base) == IndicCategory::Consonant
            }
            _ => false,
        };

    (reordered, flags)
}
/// Reorders one cluster using the Devanagari rules.
///
/// Shorthand for [`reorder_cluster_with`] with [`DEVANAGARI_RULES`].
pub fn reorder_cluster(cluster: &[char]) -> (Vec<char>, ClusterFlags) {
    let rules = DEVANAGARI_RULES;
    reorder_cluster_with(cluster, &rules)
}
/// Returns the four-byte feature tags used for Devanagari, in the fixed order
/// listed here (`locl` first, `haln` last).
pub fn devanagari_feature_tags() -> Vec<[u8; 4]> {
    const ORDERED: [[u8; 4]; 15] = [
        *b"locl",
        *b"ccmp",
        *b"nukt",
        *b"akhn",
        *b"rphf",
        *b"blwf",
        *b"half",
        *b"vatu",
        *b"cjct",
        *b"init",
        *b"pres",
        *b"abvs",
        *b"blws",
        *b"psts",
        *b"haln",
    ];
    ORDERED.to_vec()
}
/// Returns the feature tags used for Bengali.
///
/// Bengali reuses the Devanagari list unchanged, so this simply delegates to
/// [`devanagari_feature_tags`].
pub fn bengali_feature_tags() -> Vec<[u8; 4]> {
    let shared: Vec<[u8; 4]> = devanagari_feature_tags();
    shared
}
/// Returns the four-byte feature tags used for Tamil, in the fixed order
/// listed here.
///
/// Compared with the Devanagari list this has no `nukt`, `rphf`, `vatu` or
/// `cjct`, and adds `pref` and `pstf`.
pub fn tamil_feature_tags() -> Vec<[u8; 4]> {
    const ORDERED: [[u8; 4]; 13] = [
        *b"locl",
        *b"ccmp",
        *b"akhn",
        *b"half",
        *b"pref",
        *b"blwf",
        *b"pstf",
        *b"init",
        *b"pres",
        *b"abvs",
        *b"blws",
        *b"psts",
        *b"haln",
    ];
    ORDERED.to_vec()
}
/// Returns the pair of four-byte script tags for an Indic script, or `None`
/// for any script other than Devanagari, Bengali or Tamil.
///
/// The first element is the tag ending in `2`, the second is the plain
/// four-letter tag.
pub fn script_indic_tags(script: super::arabic::Script) -> Option<([u8; 4], [u8; 4])> {
    let (newer, older): (&[u8; 4], &[u8; 4]) = match script {
        super::arabic::Script::Devanagari => (b"dev2", b"deva"),
        super::arabic::Script::Bengali => (b"bng2", b"beng"),
        super::arabic::Script::Tamil => (b"tml2", b"taml"),
        _ => return None,
    };
    Some((*newer, *older))
}
/// Unit tests for the category tables, cluster segmentation, reordering and
/// the tag lists.
#[cfg(test)]
#[allow(non_snake_case)]
mod tests {
    use super::*;

    // Devanagari code points that recur across the tests below.
    const DEVA_KA: char = '\u{0915}';
    const DEVA_RA: char = '\u{0930}';
    const DEVA_SSA: char = '\u{0937}';
    const DEVA_SIGN_AA: char = '\u{093E}';
    const DEVA_SIGN_I: char = '\u{093F}';
    const DEVA_VIRAMA: char = '\u{094D}';
    const DEVA_DANDA: char = '\u{0964}';

    // Bengali counterparts.
    const BENG_KA: char = '\u{0995}';
    const BENG_RA: char = '\u{09B0}';
    const BENG_SSA: char = '\u{09B7}';
    const BENG_SIGN_I: char = '\u{09BF}';
    const BENG_SIGN_E: char = '\u{09C7}';
    const BENG_SIGN_AI: char = '\u{09C8}';
    const BENG_VIRAMA: char = '\u{09CD}';

    // Tamil counterparts.
    const TAML_KA: char = '\u{0B95}';
    const TAML_RA: char = '\u{0BB0}';
    const TAML_SIGN_E: char = '\u{0BC6}';
    const TAML_PULLI: char = '\u{0BCD}';

    // ---- Devanagari category table -------------------------------------

    #[test]
    fn devanagari_category_lookup_returns_consonant_for_ka_U_0915() {
        assert_eq!(devanagari_category(DEVA_KA), IndicCategory::Consonant);
    }

    #[test]
    fn devanagari_category_lookup_returns_halant_for_U_094D() {
        assert_eq!(devanagari_category(DEVA_VIRAMA), IndicCategory::Halant);
    }

    #[test]
    fn devanagari_category_lookup_returns_pre_base_matra_for_U_093F() {
        assert_eq!(devanagari_category(DEVA_SIGN_I), IndicCategory::PreBaseMatra);
    }

    #[test]
    fn devanagari_category_classifies_vowel_a_as_vowel() {
        assert_eq!(devanagari_category('\u{0905}'), IndicCategory::Vowel);
    }

    #[test]
    fn devanagari_category_classifies_anusvara_as_bindu() {
        assert_eq!(devanagari_category('\u{0902}'), IndicCategory::Bindu);
    }

    #[test]
    fn devanagari_category_classifies_nukta_as_nukta() {
        assert_eq!(devanagari_category('\u{093C}'), IndicCategory::Nukta);
    }

    #[test]
    fn devanagari_category_classifies_post_base_matra_aa_as_matra() {
        assert_eq!(devanagari_category(DEVA_SIGN_AA), IndicCategory::Matra);
    }

    #[test]
    fn devanagari_category_classifies_danda_as_symbol() {
        assert_eq!(devanagari_category(DEVA_DANDA), IndicCategory::Symbol);
    }

    #[test]
    fn devanagari_category_returns_other_for_latin_a() {
        assert_eq!(devanagari_category('A'), IndicCategory::Other);
    }

    // ---- Script detection (lives in the sibling `arabic` module) --------

    #[test]
    fn script_of_recognises_devanagari_block() {
        use super::super::arabic::{script_of, Script};

        assert_eq!(script_of(DEVA_KA), Script::Devanagari);
        assert_eq!(script_of(DEVA_VIRAMA), Script::Devanagari);
        assert_eq!(script_of('\u{097F}'), Script::Devanagari);
    }

    #[test]
    fn script_of_still_classifies_arabic_and_latin_correctly() {
        use super::super::arabic::{script_of, Script};

        assert_eq!(script_of('\u{0627}'), Script::Arabic);
        assert_eq!(script_of('A'), Script::Other);
    }

    // ---- Devanagari reordering -------------------------------------------

    #[test]
    fn pre_base_matra_reorders_before_base_consonant() {
        let (out, flags) = reorder_cluster(&[DEVA_KA, DEVA_SIGN_I]);

        assert_eq!(out, vec![DEVA_SIGN_I, DEVA_KA]);
        assert!(flags.pre_base_reordered);
        assert!(!flags.has_reph);
    }

    #[test]
    fn pre_base_matra_reorders_in_conjunct_cluster() {
        let (out, flags) = reorder_cluster(&[DEVA_KA, DEVA_VIRAMA, DEVA_SSA, DEVA_SIGN_I]);

        assert_eq!(out, vec![DEVA_SIGN_I, DEVA_KA, DEVA_VIRAMA, DEVA_SSA]);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn reph_formation_at_cluster_start_marks_RA_for_superscript() {
        let (out, flags) = reorder_cluster(&[DEVA_RA, DEVA_VIRAMA, DEVA_KA]);

        assert_eq!(out, vec![DEVA_RA, DEVA_VIRAMA, DEVA_KA]);
        assert!(flags.has_reph);
        assert!(!flags.pre_base_reordered);
    }

    #[test]
    fn reph_with_pre_base_matra_combines_both_flags() {
        let (out, flags) = reorder_cluster(&[DEVA_RA, DEVA_VIRAMA, DEVA_KA, DEVA_SIGN_I]);

        assert_eq!(out, vec![DEVA_SIGN_I, DEVA_RA, DEVA_VIRAMA, DEVA_KA]);
        assert!(flags.has_reph);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn cluster_without_reph_consonant_does_not_set_flag() {
        let (_out, flags) = reorder_cluster(&[DEVA_KA, DEVA_VIRAMA, DEVA_SSA]);

        assert!(!flags.has_reph);
    }

    // ---- Devanagari cluster segmentation ---------------------------------

    #[test]
    fn cluster_boundary_starts_new_cluster_at_consonant_after_vowel() {
        let bounds = cluster_boundaries(&[DEVA_KA, DEVA_SIGN_AA, DEVA_KA]);

        assert_eq!(bounds, vec![(0, 2), (2, 3)]);
    }

    #[test]
    fn cluster_boundary_keeps_conjunct_in_one_cluster() {
        let bounds = cluster_boundaries(&[DEVA_KA, DEVA_VIRAMA, DEVA_SSA]);

        assert_eq!(bounds, vec![(0, 3)]);
    }

    #[test]
    fn cluster_boundary_breaks_at_danda_symbol() {
        let bounds = cluster_boundaries(&[DEVA_KA, DEVA_DANDA, DEVA_KA]);

        assert_eq!(bounds, vec![(0, 1), (1, 2), (2, 3)]);
    }

    #[test]
    fn cluster_boundary_breaks_at_non_indic_codepoint() {
        let bounds = cluster_boundaries(&[DEVA_KA, ' ', DEVA_KA]);

        assert_eq!(bounds, vec![(0, 1), (1, 2), (2, 3)]);
    }

    #[test]
    fn cluster_boundary_handles_empty_input() {
        let bounds = cluster_boundaries(&[]);

        assert!(bounds.is_empty());
    }

    #[test]
    fn cluster_boundary_single_consonant_is_one_cluster() {
        let bounds = cluster_boundaries(&[DEVA_KA]);

        assert_eq!(bounds, vec![(0, 1)]);
    }

    // ---- Feature tags and remaining Devanagari edge cases ----------------

    #[test]
    fn devanagari_feature_tags_are_in_canonical_order() {
        let tags = devanagari_feature_tags();
        let leading: [[u8; 4]; 5] = [*b"locl", *b"ccmp", *b"nukt", *b"akhn", *b"rphf"];

        assert_eq!(&tags[..5], &leading[..]);
        assert_eq!(tags.last(), Some(b"haln"));
    }

    #[test]
    fn empty_cluster_reorder_returns_empty() {
        let (out, flags) = reorder_cluster(&[]);

        assert!(out.is_empty());
        assert_eq!(flags, ClusterFlags::default());
    }

    #[test]
    fn single_consonant_cluster_does_not_reorder() {
        let (out, flags) = reorder_cluster(&[DEVA_KA]);

        assert_eq!(out, vec![DEVA_KA]);
        assert!(!flags.pre_base_reordered);
        assert!(!flags.has_reph);
    }

    #[test]
    fn two_clusters_with_pre_base_matras_each_reorder_independently() {
        let chars = [DEVA_KA, DEVA_SIGN_I, DEVA_KA, DEVA_SIGN_I];
        let bounds = cluster_boundaries(&chars);
        assert_eq!(bounds, vec![(0, 2), (2, 4)]);

        for (start, end) in bounds {
            let (out, flags) = reorder_cluster(&chars[start..end]);
            assert_eq!(out, vec![DEVA_SIGN_I, DEVA_KA]);
            assert!(flags.pre_base_reordered);
        }
    }

    // ---- Bengali ----------------------------------------------------------

    #[test]
    fn bengali_category_classifies_ka_as_consonant() {
        assert_eq!(bengali_category(BENG_KA), IndicCategory::Consonant);
    }

    #[test]
    fn bengali_category_classifies_ra_as_consonant() {
        assert_eq!(bengali_category(BENG_RA), IndicCategory::Consonant);
    }

    #[test]
    fn bengali_category_classifies_halant_as_halant() {
        assert_eq!(bengali_category(BENG_VIRAMA), IndicCategory::Halant);
    }

    #[test]
    fn bengali_category_classifies_nukta_as_nukta() {
        assert_eq!(bengali_category('\u{09BC}'), IndicCategory::Nukta);
    }

    #[test]
    fn bengali_category_pre_base_matras_i_e_ai() {
        assert_eq!(bengali_category(BENG_SIGN_I), IndicCategory::PreBaseMatra);
        assert_eq!(bengali_category(BENG_SIGN_E), IndicCategory::PreBaseMatra);
        assert_eq!(bengali_category(BENG_SIGN_AI), IndicCategory::PreBaseMatra);
    }

    #[test]
    fn bengali_category_classifies_aa_matra_as_matra() {
        assert_eq!(bengali_category('\u{09BE}'), IndicCategory::Matra);
    }

    #[test]
    fn bengali_category_classifies_anusvara_as_bindu() {
        assert_eq!(bengali_category('\u{0982}'), IndicCategory::Bindu);
    }

    #[test]
    fn bengali_category_classifies_independent_vowel_a_as_vowel() {
        assert_eq!(bengali_category('\u{0985}'), IndicCategory::Vowel);
    }

    #[test]
    fn bengali_category_returns_other_for_devanagari_codepoint() {
        assert_eq!(bengali_category(DEVA_KA), IndicCategory::Other);
    }

    #[test]
    fn bengali_pre_base_matra_i_reorders_before_base() {
        let (out, flags) = reorder_cluster_with(&[BENG_KA, BENG_SIGN_I], &BENGALI_RULES);

        assert_eq!(out, vec![BENG_SIGN_I, BENG_KA]);
        assert!(flags.pre_base_reordered);
        assert!(!flags.has_reph);
    }

    #[test]
    fn bengali_pre_base_matra_e_reorders_before_base() {
        let (out, flags) = reorder_cluster_with(&[BENG_KA, BENG_SIGN_E], &BENGALI_RULES);

        assert_eq!(out, vec![BENG_SIGN_E, BENG_KA]);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn bengali_pre_base_matra_ai_reorders_before_base() {
        let (out, flags) = reorder_cluster_with(&[BENG_KA, BENG_SIGN_AI], &BENGALI_RULES);

        assert_eq!(out, vec![BENG_SIGN_AI, BENG_KA]);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn bengali_reph_formation_marks_RA_for_superscript() {
        let (out, flags) =
            reorder_cluster_with(&[BENG_RA, BENG_VIRAMA, BENG_KA], &BENGALI_RULES);

        assert_eq!(out, vec![BENG_RA, BENG_VIRAMA, BENG_KA]);
        assert!(flags.has_reph);
    }

    #[test]
    fn bengali_conjunct_keeps_in_one_cluster() {
        let bounds =
            cluster_boundaries_with(&[BENG_KA, BENG_VIRAMA, BENG_SSA], bengali_category);

        assert_eq!(bounds, vec![(0, 3)]);
    }

    // ---- Tamil ------------------------------------------------------------

    #[test]
    fn tamil_category_classifies_ka_as_consonant() {
        assert_eq!(tamil_category(TAML_KA), IndicCategory::Consonant);
    }

    #[test]
    fn tamil_category_classifies_ra_as_consonant() {
        assert_eq!(tamil_category(TAML_RA), IndicCategory::Consonant);
    }

    #[test]
    fn tamil_category_classifies_pulli_as_halant() {
        assert_eq!(tamil_category(TAML_PULLI), IndicCategory::Halant);
    }

    #[test]
    fn tamil_category_pre_base_matras_e_ee_ai() {
        assert_eq!(tamil_category(TAML_SIGN_E), IndicCategory::PreBaseMatra);
        assert_eq!(tamil_category('\u{0BC7}'), IndicCategory::PreBaseMatra);
        assert_eq!(tamil_category('\u{0BC8}'), IndicCategory::PreBaseMatra);
    }

    #[test]
    fn tamil_category_classifies_aa_matra_as_matra() {
        assert_eq!(tamil_category('\u{0BBE}'), IndicCategory::Matra);
    }

    #[test]
    fn tamil_category_classifies_anusvara_as_bindu() {
        assert_eq!(tamil_category('\u{0B82}'), IndicCategory::Bindu);
    }

    #[test]
    fn tamil_category_classifies_independent_vowel_a_as_vowel() {
        assert_eq!(tamil_category('\u{0B85}'), IndicCategory::Vowel);
    }

    #[test]
    fn tamil_category_returns_other_for_devanagari_codepoint() {
        assert_eq!(tamil_category(DEVA_KA), IndicCategory::Other);
    }

    #[test]
    fn tamil_pre_base_matra_e_reorders_before_base() {
        let (out, flags) = reorder_cluster_with(&[TAML_KA, TAML_SIGN_E], &TAMIL_RULES);

        assert_eq!(out, vec![TAML_SIGN_E, TAML_KA]);
        assert!(flags.pre_base_reordered);
    }

    #[test]
    fn tamil_RA_plus_halant_does_NOT_set_reph_flag() {
        let (_out, flags) = reorder_cluster_with(&[TAML_RA, TAML_PULLI, TAML_KA], &TAMIL_RULES);

        assert!(!flags.has_reph, "Tamil reph_enabled is false");
    }

    #[test]
    fn tamil_cluster_boundary_keeps_pulli_chain_in_one_cluster() {
        let bounds = cluster_boundaries_with(&[TAML_KA, TAML_PULLI, TAML_KA], tamil_category);

        assert_eq!(bounds, vec![(0, 3)]);
    }

    #[test]
    fn tamil_feature_tags_omit_rphf_and_cjct() {
        let tags = tamil_feature_tags();

        assert!(!tags.contains(b"rphf"), "Tamil has no reph feature");
        assert!(!tags.contains(b"cjct"), "Tamil has no conjunct feature");
        assert!(!tags.contains(b"vatu"), "Tamil has no vattu feature");
        assert!(tags.contains(b"pref"), "Tamil emits the pref feature");
    }

    // ---- Shared tag helpers -------------------------------------------------

    #[test]
    fn bengali_feature_tags_match_devanagari_shape() {
        assert_eq!(bengali_feature_tags(), devanagari_feature_tags());
    }

    #[test]
    fn script_indic_tags_returns_modern_and_legacy_pair_for_devanagari() {
        use super::super::arabic::Script;

        let pair = script_indic_tags(Script::Devanagari);
        assert_eq!(pair, Some((*b"dev2", *b"deva")));
    }

    #[test]
    fn script_indic_tags_returns_pair_for_bengali_and_tamil() {
        use super::super::arabic::Script;

        assert_eq!(
            script_indic_tags(Script::Bengali),
            Some((*b"bng2", *b"beng"))
        );
        assert_eq!(script_indic_tags(Script::Tamil), Some((*b"tml2", *b"taml")));
    }

    #[test]
    fn script_indic_tags_returns_none_for_arabic_or_other() {
        use super::super::arabic::Script;

        assert_eq!(script_indic_tags(Script::Arabic), None);
        assert_eq!(script_indic_tags(Script::Other), None);
    }
}