#![allow(clippy::manual_range_contains)]
/// Coarse shaping category assigned to a code point by the per-script
/// `*_category` lookup functions in this module.
///
/// The categories drive cluster segmentation (`cluster_boundaries_with`) and
/// in-cluster reordering (`reorder_cluster_with`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum IndicCategory {
/// Consonant letter. Like `Vowel`, it starts a new cluster unless the
/// previous character is a `Halant`.
Consonant,
/// Vowel letter (e.g. the independent vowels of a script).
Vowel,
/// Halant (virama-like) sign: a consonant or vowel directly after it stays in
/// the same cluster, and it is the middle element of the reph pattern.
Halant,
/// Dependent vowel sign that `reorder_cluster_with` moves to the front of
/// its cluster.
PreBaseMatra,
/// Dependent vowel sign that is left in place.
Matra,
/// Nukta sign.
Nukta,
/// Anusvara-style and similar marks (the tables also place several other
/// modifier signs here).
Bindu,
/// Digits, punctuation and other stand-alone characters; in most tables also
/// the fallback for in-block code points without a dedicated arm. Always
/// separated from its neighbours by cluster boundaries.
Symbol,
/// Code point outside the queried script's block (the Tamil table also uses
/// it as its in-block fallback). Always separated from its neighbours by
/// cluster boundaries.
Other,
}
/// Classifies a character against the Devanagari block (U+0900..=U+097F).
///
/// Returns [`IndicCategory::Other`] for anything outside the block. Inside the
/// block, code points without a dedicated arm (currently only U+0950 OM) are
/// reported as [`IndicCategory::Symbol`].
pub fn devanagari_category(ch: char) -> IndicCategory {
    let cp = ch as u32;
    if !(0x0900..=0x097F).contains(&cp) {
        return IndicCategory::Other;
    }
    match cp {
        0x0900..=0x0903 => IndicCategory::Bindu,
        0x0904..=0x0914 => IndicCategory::Vowel,
        0x0915..=0x0939 => IndicCategory::Consonant,
        0x093A | 0x093B => IndicCategory::Matra,
        0x093C => IndicCategory::Nukta,
        0x093D => IndicCategory::Symbol,
        0x093E => IndicCategory::Matra,
        0x093F => IndicCategory::PreBaseMatra,
        0x0940..=0x094C => IndicCategory::Matra,
        0x094D => IndicCategory::Halant,
        // U+094E PRISHTHAMATRA E is written to the left of the base, just like
        // U+093F, so it has to take part in pre-base reordering.
        0x094E => IndicCategory::PreBaseMatra,
        0x094F => IndicCategory::Matra,
        0x0951..=0x0954 => IndicCategory::Bindu,
        0x0955..=0x0957 => IndicCategory::Matra,
        0x0958..=0x095F => IndicCategory::Consonant,
        0x0960..=0x0961 => IndicCategory::Vowel,
        0x0962..=0x0963 => IndicCategory::Matra,
        // Dandas, digits, abbreviation sign and high spacing dot.
        0x0964..=0x0971 => IndicCategory::Symbol,
        // U+0972..=U+0977 are independent vowel letters (CANDRA A, OE, OOE,
        // AW, UE, UUE); treating them as symbols would split them from any
        // following marks.
        0x0972..=0x0977 => IndicCategory::Vowel,
        0x0978..=0x097F => IndicCategory::Consonant,
        _ => IndicCategory::Symbol,
    }
}
/// Classifies a character against the Bengali block (U+0980..=U+09FF).
///
/// Arms are grouped by resulting category. Any in-block code point that is not
/// listed explicitly is a [`IndicCategory::Symbol`]; everything outside the
/// block is [`IndicCategory::Other`].
pub fn bengali_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0981..=0x0983 => IndicCategory::Bindu,
        0x0985..=0x098C | 0x098F..=0x0990 | 0x0993..=0x0994 | 0x09E0..=0x09E1 => {
            IndicCategory::Vowel
        }
        0x0995..=0x09A8
        | 0x09AA..=0x09B0
        | 0x09B2
        | 0x09B6..=0x09B9
        | 0x09CE
        | 0x09DC..=0x09DD
        | 0x09DF
        | 0x09F0..=0x09F1 => IndicCategory::Consonant,
        0x09BC => IndicCategory::Nukta,
        0x09BF | 0x09C7..=0x09C8 => IndicCategory::PreBaseMatra,
        0x09BE | 0x09C0..=0x09C4 | 0x09CB..=0x09CC | 0x09D7 | 0x09E2..=0x09E3 => {
            IndicCategory::Matra
        }
        0x09CD => IndicCategory::Halant,
        // Remaining in-block code points (digits, avagraha, currency signs,
        // gaps, ...) all share the fallback category.
        0x0980..=0x09FF => IndicCategory::Symbol,
        _ => IndicCategory::Other,
    }
}
/// Classifies a character against the Tamil block (U+0B80..=U+0BFF).
///
/// Arms are grouped by resulting category. Unlike the other script tables in
/// this file, unlisted code points *inside* the block are reported as
/// [`IndicCategory::Other`], the same value used for characters outside the
/// block, so a single catch-all arm covers both cases.
pub fn tamil_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0B82..=0x0B83 => IndicCategory::Bindu,
        0x0B85..=0x0B8A | 0x0B8E..=0x0B90 | 0x0B92..=0x0B94 => IndicCategory::Vowel,
        0x0B95
        | 0x0B99..=0x0B9A
        | 0x0B9C
        | 0x0B9E..=0x0B9F
        | 0x0BA3..=0x0BA4
        | 0x0BA8..=0x0BAA
        | 0x0BAE..=0x0BB9 => IndicCategory::Consonant,
        0x0BC6..=0x0BC8 => IndicCategory::PreBaseMatra,
        0x0BBE..=0x0BC2 | 0x0BCA..=0x0BCC | 0x0BD7 => IndicCategory::Matra,
        0x0BCD => IndicCategory::Halant,
        // Digits, numerals and the sign/currency range.
        0x0BE6..=0x0BFF => IndicCategory::Symbol,
        _ => IndicCategory::Other,
    }
}
/// Classifies a character against the Gurmukhi block (U+0A00..=U+0A7F).
///
/// Arms are grouped by resulting category. Any in-block code point that is not
/// listed explicitly is a [`IndicCategory::Symbol`]; everything outside the
/// block is [`IndicCategory::Other`].
pub fn gurmukhi_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0A01..=0x0A03 | 0x0A51 | 0x0A70..=0x0A71 | 0x0A75 => IndicCategory::Bindu,
        0x0A05..=0x0A0A | 0x0A0F..=0x0A10 | 0x0A13..=0x0A14 => IndicCategory::Vowel,
        0x0A15..=0x0A28
        | 0x0A2A..=0x0A30
        | 0x0A32..=0x0A33
        | 0x0A35..=0x0A36
        | 0x0A38..=0x0A39
        | 0x0A59..=0x0A5C
        | 0x0A5E
        | 0x0A72..=0x0A74 => IndicCategory::Consonant,
        0x0A3C => IndicCategory::Nukta,
        0x0A3F => IndicCategory::PreBaseMatra,
        0x0A3E | 0x0A40..=0x0A42 | 0x0A47..=0x0A48 | 0x0A4B..=0x0A4C => IndicCategory::Matra,
        0x0A4D => IndicCategory::Halant,
        // Digits and every other in-block code point.
        0x0A00..=0x0A7F => IndicCategory::Symbol,
        _ => IndicCategory::Other,
    }
}
/// Classifies a character against the Gujarati block (U+0A80..=U+0AFF).
///
/// Arms are grouped by resulting category. Any in-block code point that is not
/// listed explicitly is a [`IndicCategory::Symbol`]; everything outside the
/// block is [`IndicCategory::Other`].
pub fn gujarati_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0A81..=0x0A83 => IndicCategory::Bindu,
        0x0A85..=0x0A8D | 0x0A8F..=0x0A91 | 0x0A93..=0x0A94 | 0x0AE0..=0x0AE1 => {
            IndicCategory::Vowel
        }
        0x0A95..=0x0AA8 | 0x0AAA..=0x0AB0 | 0x0AB2..=0x0AB3 | 0x0AB5..=0x0AB9 | 0x0AD0 => {
            IndicCategory::Consonant
        }
        0x0ABC => IndicCategory::Nukta,
        0x0ABF => IndicCategory::PreBaseMatra,
        0x0ABE | 0x0AC0..=0x0AC5 | 0x0AC7..=0x0AC9 | 0x0ACB..=0x0ACC | 0x0AE2..=0x0AE3 => {
            IndicCategory::Matra
        }
        0x0ACD => IndicCategory::Halant,
        // Avagraha, digits and every other in-block code point.
        0x0A80..=0x0AFF => IndicCategory::Symbol,
        _ => IndicCategory::Other,
    }
}
/// Classifies a character against the Telugu block (U+0C00..=U+0C7F).
///
/// Arms are grouped by resulting category. Any in-block code point that is not
/// matched by an earlier arm is a [`IndicCategory::Symbol`]; everything
/// outside the block is [`IndicCategory::Other`].
pub fn telugu_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0C00..=0x0C04 => IndicCategory::Bindu,
        0x0C05..=0x0C0C | 0x0C0E..=0x0C10 | 0x0C12..=0x0C14 | 0x0C60..=0x0C61 => {
            IndicCategory::Vowel
        }
        0x0C15..=0x0C28 | 0x0C2A..=0x0C39 | 0x0C58..=0x0C5A => IndicCategory::Consonant,
        0x0C3C => IndicCategory::Nukta,
        0x0C46..=0x0C48 => IndicCategory::PreBaseMatra,
        0x0C3E..=0x0C44 | 0x0C4A..=0x0C4C | 0x0C55..=0x0C56 | 0x0C62..=0x0C63 => {
            IndicCategory::Matra
        }
        0x0C4D => IndicCategory::Halant,
        // Whatever is left inside the block (earlier arms take precedence).
        0x0C00..=0x0C7F => IndicCategory::Symbol,
        _ => IndicCategory::Other,
    }
}
/// Classifies a character against the Kannada block (U+0C80..=U+0CFF).
///
/// Arms are grouped by resulting category. Any in-block code point that is not
/// matched by an earlier arm is a [`IndicCategory::Symbol`]; everything
/// outside the block is [`IndicCategory::Other`].
pub fn kannada_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0C80..=0x0C83 => IndicCategory::Bindu,
        0x0C85..=0x0C8C | 0x0C8E..=0x0C90 | 0x0C92..=0x0C94 | 0x0CE0..=0x0CE1 => {
            IndicCategory::Vowel
        }
        0x0C95..=0x0CA8 | 0x0CAA..=0x0CB3 | 0x0CB5..=0x0CB9 | 0x0CDD..=0x0CDE => {
            IndicCategory::Consonant
        }
        0x0CBC => IndicCategory::Nukta,
        0x0CBF | 0x0CC6..=0x0CC8 => IndicCategory::PreBaseMatra,
        0x0CBE | 0x0CC0..=0x0CC4 | 0x0CCA..=0x0CCC | 0x0CD5..=0x0CD6 | 0x0CE2..=0x0CE3 => {
            IndicCategory::Matra
        }
        0x0CCD => IndicCategory::Halant,
        // Whatever is left inside the block (earlier arms take precedence).
        0x0C80..=0x0CFF => IndicCategory::Symbol,
        _ => IndicCategory::Other,
    }
}
/// Classifies a character against the Malayalam block (U+0D00..=U+0D7F).
///
/// Arms are grouped by resulting category. Any in-block code point that is not
/// matched by an earlier arm is a [`IndicCategory::Symbol`]; everything
/// outside the block is [`IndicCategory::Other`].
pub fn malayalam_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0D00..=0x0D03 => IndicCategory::Bindu,
        0x0D05..=0x0D0C | 0x0D0E..=0x0D10 | 0x0D12..=0x0D14 | 0x0D60..=0x0D61 => {
            IndicCategory::Vowel
        }
        // Regular consonants plus the U+0D7A..=U+0D7F range at the block's end.
        0x0D15..=0x0D3A | 0x0D7A..=0x0D7F => IndicCategory::Consonant,
        0x0D3B..=0x0D3C => IndicCategory::Nukta,
        0x0D46..=0x0D48 => IndicCategory::PreBaseMatra,
        0x0D3E..=0x0D44 | 0x0D4A..=0x0D4C | 0x0D4E | 0x0D57 | 0x0D62..=0x0D63 => {
            IndicCategory::Matra
        }
        0x0D4D => IndicCategory::Halant,
        // Whatever is left inside the block (earlier arms take precedence).
        0x0D00..=0x0D7F => IndicCategory::Symbol,
        _ => IndicCategory::Other,
    }
}
/// Classifies a character against the Oriya block (U+0B00..=U+0B7F).
///
/// Returns [`IndicCategory::Other`] for anything outside the block. Inside the
/// block, code points without a dedicated arm are reported as
/// [`IndicCategory::Symbol`].
pub fn oriya_category(ch: char) -> IndicCategory {
    let cp = ch as u32;
    if !(0x0B00..=0x0B7F).contains(&cp) {
        return IndicCategory::Other;
    }
    match cp {
        0x0B01..=0x0B03 => IndicCategory::Bindu,
        0x0B05..=0x0B0C => IndicCategory::Vowel,
        0x0B0F..=0x0B10 => IndicCategory::Vowel,
        0x0B13..=0x0B14 => IndicCategory::Vowel,
        0x0B15..=0x0B28 => IndicCategory::Consonant,
        0x0B2A..=0x0B30 => IndicCategory::Consonant,
        0x0B32..=0x0B33 => IndicCategory::Consonant,
        0x0B35..=0x0B39 => IndicCategory::Consonant,
        0x0B3C => IndicCategory::Nukta,
        0x0B3D => IndicCategory::Symbol,
        0x0B3E..=0x0B44 => IndicCategory::Matra,
        0x0B47..=0x0B48 => IndicCategory::PreBaseMatra,
        0x0B4B..=0x0B4C => IndicCategory::PreBaseMatra,
        0x0B4D => IndicCategory::Halant,
        0x0B55..=0x0B57 => IndicCategory::Matra,
        // U+0B5F is LETTER YYA, a consonant like its Bengali counterpart
        // U+09DF; it must not be folded into the vowel range that follows.
        0x0B5C..=0x0B5D | 0x0B5F => IndicCategory::Consonant,
        0x0B60..=0x0B61 => IndicCategory::Vowel,
        0x0B62..=0x0B63 => IndicCategory::Matra,
        0x0B66..=0x0B70 => IndicCategory::Symbol,
        // U+0B71 is LETTER WA, a consonant sitting between the digits/isshar
        // and the fraction signs.
        0x0B71 => IndicCategory::Consonant,
        0x0B72..=0x0B7F => IndicCategory::Symbol,
        _ => IndicCategory::Symbol,
    }
}
/// Classifies a character against the Sinhala block (U+0D80..=U+0DFF).
///
/// Returns [`IndicCategory::Other`] for anything outside the block. Inside the
/// block, code points without a dedicated arm are reported as
/// [`IndicCategory::Symbol`].
pub fn sinhala_category(ch: char) -> IndicCategory {
    let cp = ch as u32;
    if !(0x0D80..=0x0DFF).contains(&cp) {
        return IndicCategory::Other;
    }
    match cp {
        0x0D81..=0x0D83 => IndicCategory::Bindu,
        0x0D85..=0x0D96 => IndicCategory::Vowel,
        0x0D9A..=0x0DB1 => IndicCategory::Consonant,
        0x0DB3..=0x0DBB => IndicCategory::Consonant,
        0x0DBD => IndicCategory::Consonant,
        0x0DC0..=0x0DC6 => IndicCategory::Consonant,
        0x0DCA => IndicCategory::Halant,
        0x0DCF..=0x0DD6 => IndicCategory::Matra,
        // U+0DD8 (VOWEL SIGN GAETTA-PILLA) is a dependent vowel sign; without
        // its own arm it would fall through to `Symbol` and be split off from
        // its base consonant. U+0DD7 stays on the fallback.
        0x0DD8 => IndicCategory::Matra,
        0x0DD9..=0x0DDE => IndicCategory::PreBaseMatra,
        0x0DDF => IndicCategory::Matra,
        0x0DE6..=0x0DEF => IndicCategory::Symbol,
        0x0DF2..=0x0DF3 => IndicCategory::Matra,
        0x0DF4 => IndicCategory::Symbol,
        _ => IndicCategory::Symbol,
    }
}
/// Classifies a character against the Khmer block (U+1780..=U+17FF).
///
/// Arms are grouped by resulting category. Any in-block code point that is not
/// matched by an earlier arm is a [`IndicCategory::Symbol`]; everything
/// outside the block is [`IndicCategory::Other`].
pub fn khmer_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x1780..=0x17A2 | 0x17D7 => IndicCategory::Consonant,
        0x17A3..=0x17B3 => IndicCategory::Vowel,
        0x17B4..=0x17B5 | 0x17C6..=0x17C7 | 0x17C9..=0x17D1 | 0x17D3 | 0x17DC..=0x17DD => {
            IndicCategory::Bindu
        }
        0x17B6..=0x17BD | 0x17C8 => IndicCategory::Matra,
        0x17BE..=0x17C5 => IndicCategory::PreBaseMatra,
        // U+17D2 plays the halant role in this table.
        0x17D2 => IndicCategory::Halant,
        // Punctuation, digits and whatever else is left inside the block
        // (earlier arms take precedence).
        0x1780..=0x17FF => IndicCategory::Symbol,
        _ => IndicCategory::Other,
    }
}
/// Classifies a character against the Thai block (U+0E00..=U+0E7F).
///
/// Arms are grouped by resulting category. This table produces no `Halant`,
/// `Nukta` or `PreBaseMatra`. Any in-block code point that is not matched by
/// an earlier arm is a [`IndicCategory::Symbol`]; everything outside the block
/// is [`IndicCategory::Other`].
pub fn thai_category(ch: char) -> IndicCategory {
    match u32::from(ch) {
        0x0E01..=0x0E2E => IndicCategory::Consonant,
        0x0E30..=0x0E3A | 0x0E45 | 0x0E47 => IndicCategory::Matra,
        0x0E40..=0x0E44 => IndicCategory::Vowel,
        0x0E48..=0x0E4E => IndicCategory::Bindu,
        // Punctuation, currency sign, digits and every other in-block code
        // point (earlier arms take precedence).
        0x0E00..=0x0E7F => IndicCategory::Symbol,
        _ => IndicCategory::Other,
    }
}
/// Per-cluster facts reported by `reorder_cluster_with`.
///
/// Both flags default to `false`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ClusterFlags {
/// Set when the input cluster starts with the script's RA code point,
/// followed by a halant and then a consonant (and reph detection is enabled
/// for the script).
pub has_reph: bool,
/// Set when a pre-base matra was found after position 0 and moved to the
/// front of the cluster.
pub pre_base_reordered: bool,
}
/// Splits `chars` into consecutive clusters using the supplied classifier.
///
/// The result is a list of half-open `(start, end)` index ranges that together
/// cover the whole input in order; an empty input yields an empty list.
///
/// A new cluster begins at a character when any of the following holds:
/// * the character itself is `Other` or `Symbol`;
/// * the preceding character is `Other` or `Symbol`;
/// * the character is a `Consonant` or `Vowel` and the preceding character is
///   not a `Halant`.
pub fn cluster_boundaries_with(
    chars: &[char],
    category: fn(char) -> IndicCategory,
) -> Vec<(usize, usize)> {
    // Categories that never combine with a neighbour.
    let isolated = |c: IndicCategory| matches!(c, IndicCategory::Other | IndicCategory::Symbol);

    let mut spans = Vec::new();
    let mut span_start = 0usize;
    for (offset, pair) in chars.windows(2).enumerate() {
        let before = category(pair[0]);
        let here = category(pair[1]);
        // A base letter opens a new syllable unless a halant glues it to the
        // preceding one.
        let opens_syllable = matches!(here, IndicCategory::Consonant | IndicCategory::Vowel)
            && before != IndicCategory::Halant;
        if isolated(here) || isolated(before) || opens_syllable {
            // `pair[1]` lives at index `offset + 1` of `chars`.
            let split_at = offset + 1;
            spans.push((span_start, split_at));
            span_start = split_at;
        }
    }
    // Close the trailing cluster (there is none for an empty input).
    if !chars.is_empty() {
        spans.push((span_start, chars.len()));
    }
    spans
}
/// Splits `chars` into clusters using the Devanagari classifier.
///
/// Convenience wrapper around `cluster_boundaries_with` with
/// `devanagari_category`; returns the same half-open `(start, end)` ranges.
pub fn cluster_boundaries(chars: &[char]) -> Vec<(usize, usize)> {
cluster_boundaries_with(chars, devanagari_category)
}
/// Script-specific parameters consumed by `reorder_cluster_with`.
#[derive(Debug, Clone, Copy)]
pub struct ReorderRules {
/// Classifier used to look up the category of each character in a cluster.
pub category: fn(char) -> IndicCategory,
/// The script's RA letter; a cluster must start with it to count as a reph
/// candidate.
pub ra_codepoint: char,
/// Whether reph detection is performed at all for this script. When `false`,
/// `ClusterFlags::has_reph` is never set.
pub reph_enabled: bool,
}
/// Reordering rules for Devanagari: `devanagari_category` classifier, RA is
/// U+0930, reph detection enabled.
pub const DEVANAGARI_RULES: ReorderRules = ReorderRules {
category: devanagari_category,
ra_codepoint: '\u{0930}',
reph_enabled: true,
};
/// Reordering rules for Bengali: `bengali_category` classifier, RA is U+09B0,
/// reph detection enabled.
pub const BENGALI_RULES: ReorderRules = ReorderRules {
category: bengali_category,
ra_codepoint: '\u{09B0}',
reph_enabled: true,
};
/// Reordering rules for Tamil: `tamil_category` classifier, RA is U+0BB0,
/// reph detection disabled.
pub const TAMIL_RULES: ReorderRules = ReorderRules {
category: tamil_category,
ra_codepoint: '\u{0BB0}',
reph_enabled: false,
};
/// Reordering rules for Gurmukhi: `gurmukhi_category` classifier, RA is
/// U+0A30, reph detection enabled.
pub const GURMUKHI_RULES: ReorderRules = ReorderRules {
category: gurmukhi_category,
ra_codepoint: '\u{0A30}',
reph_enabled: true,
};
/// Reordering rules for Gujarati: `gujarati_category` classifier, RA is
/// U+0AB0, reph detection enabled.
pub const GUJARATI_RULES: ReorderRules = ReorderRules {
category: gujarati_category,
ra_codepoint: '\u{0AB0}',
reph_enabled: true,
};
/// Reordering rules for Telugu: `telugu_category` classifier, RA is U+0C30,
/// reph detection enabled.
pub const TELUGU_RULES: ReorderRules = ReorderRules {
category: telugu_category,
ra_codepoint: '\u{0C30}',
reph_enabled: true,
};
/// Reordering rules for Kannada: `kannada_category` classifier, RA is U+0CB0,
/// reph detection enabled.
pub const KANNADA_RULES: ReorderRules = ReorderRules {
category: kannada_category,
ra_codepoint: '\u{0CB0}',
reph_enabled: true,
};
/// Reordering rules for Malayalam: `malayalam_category` classifier, RA is
/// U+0D30, reph detection disabled.
pub const MALAYALAM_RULES: ReorderRules = ReorderRules {
category: malayalam_category,
ra_codepoint: '\u{0D30}',
reph_enabled: false,
};
/// Reordering rules for Oriya: `oriya_category` classifier, RA is U+0B30,
/// reph detection enabled.
pub const ORIYA_RULES: ReorderRules = ReorderRules {
category: oriya_category,
ra_codepoint: '\u{0B30}',
reph_enabled: true,
};
/// Reordering rules for Sinhala: `sinhala_category` classifier, RA is U+0DBB,
/// reph detection disabled.
pub const SINHALA_RULES: ReorderRules = ReorderRules {
category: sinhala_category,
ra_codepoint: '\u{0DBB}',
reph_enabled: false,
};
/// Reordering rules for Khmer: `khmer_category` classifier, RA-equivalent
/// letter is U+179A, reph detection disabled.
pub const KHMER_RULES: ReorderRules = ReorderRules {
category: khmer_category,
ra_codepoint: '\u{179A}',
reph_enabled: false,
};
/// Reordering rules for Thai: `thai_category` classifier, RA-equivalent
/// letter is U+0E23, reph detection disabled.
pub const THAI_RULES: ReorderRules = ReorderRules {
category: thai_category,
ra_codepoint: '\u{0E23}',
reph_enabled: false,
};
pub fn reorder_cluster_with(cluster: &[char], rules: &ReorderRules) -> (Vec<char>, ClusterFlags) {
let mut flags = ClusterFlags::default();
if cluster.is_empty() {
return (Vec::new(), flags);
}
let mut out: Vec<char> = cluster.to_vec();
if let Some(matra_idx) = out
.iter()
.position(|&c| (rules.category)(c) == IndicCategory::PreBaseMatra)
{
if matra_idx > 0 {
let matra = out.remove(matra_idx);
out.insert(0, matra);
flags.pre_base_reordered = true;
}
}
if rules.reph_enabled
&& cluster.len() >= 3
&& cluster[0] == rules.ra_codepoint
&& (rules.category)(cluster[1]) == IndicCategory::Halant
&& (rules.category)(cluster[2]) == IndicCategory::Consonant
{
flags.has_reph = true;
}
(out, flags)
}
/// Reorders a single cluster using the Devanagari rules.
///
/// Convenience wrapper around `reorder_cluster_with` with `DEVANAGARI_RULES`.
pub fn reorder_cluster(cluster: &[char]) -> (Vec<char>, ClusterFlags) {
reorder_cluster_with(cluster, &DEVANAGARI_RULES)
}
/// Returns the four-byte feature tags used for Devanagari, in application
/// order (`locl` first, `haln` last).
///
/// A fresh `Vec` is allocated on every call.
pub fn devanagari_feature_tags() -> Vec<[u8; 4]> {
    const TAGS: &[[u8; 4]] = &[
        *b"locl", *b"ccmp", *b"nukt", *b"akhn", *b"rphf", *b"blwf", *b"half", *b"vatu",
        *b"cjct", *b"init", *b"pres", *b"abvs", *b"blws", *b"psts", *b"haln",
    ];
    TAGS.to_vec()
}
/// Returns the feature tags used for Bengali.
///
/// Bengali shares the Devanagari list, so this simply forwards to
/// `devanagari_feature_tags`.
pub fn bengali_feature_tags() -> Vec<[u8; 4]> {
devanagari_feature_tags()
}
/// Returns the four-byte feature tags used for Tamil, in application order.
///
/// The list contains no `nukt`, `rphf`, `vatu` or `cjct` entry. A fresh `Vec`
/// is allocated on every call.
pub fn tamil_feature_tags() -> Vec<[u8; 4]> {
    const TAGS: &[[u8; 4]] = &[
        *b"locl", *b"ccmp", *b"akhn", *b"half", *b"pref", *b"blwf", *b"pstf", *b"init",
        *b"pres", *b"abvs", *b"blws", *b"psts", *b"haln",
    ];
    TAGS.to_vec()
}
/// Returns the feature tags used for Gurmukhi.
///
/// Gurmukhi shares the Devanagari list, so this simply forwards to
/// `devanagari_feature_tags`.
pub fn gurmukhi_feature_tags() -> Vec<[u8; 4]> {
devanagari_feature_tags()
}
/// Returns the feature tags used for Gujarati.
///
/// Gujarati shares the Devanagari list, so this simply forwards to
/// `devanagari_feature_tags`.
pub fn gujarati_feature_tags() -> Vec<[u8; 4]> {
devanagari_feature_tags()
}
/// Returns the four-byte feature tags used for Telugu, in application order.
///
/// A fresh `Vec` is allocated on every call.
pub fn telugu_feature_tags() -> Vec<[u8; 4]> {
    const TAGS: &[[u8; 4]] = &[
        *b"locl", *b"ccmp", *b"nukt", *b"akhn", *b"rphf", *b"pref", *b"blwf", *b"half",
        *b"abvf", *b"pstf", *b"cjct", *b"init", *b"pres", *b"abvs", *b"blws", *b"psts",
        *b"haln",
    ];
    TAGS.to_vec()
}
/// Returns the feature tags used for Kannada.
///
/// Kannada shares the Telugu list, so this simply forwards to
/// `telugu_feature_tags`.
pub fn kannada_feature_tags() -> Vec<[u8; 4]> {
telugu_feature_tags()
}
/// Returns the four-byte feature tags used for Malayalam, in application
/// order.
///
/// The list contains no `rphf` entry. A fresh `Vec` is allocated on every
/// call.
pub fn malayalam_feature_tags() -> Vec<[u8; 4]> {
    const TAGS: &[[u8; 4]] = &[
        *b"locl", *b"ccmp", *b"nukt", *b"akhn", *b"pref", *b"blwf", *b"half", *b"abvf",
        *b"pstf", *b"cjct", *b"init", *b"pres", *b"abvs", *b"blws", *b"psts", *b"haln",
    ];
    TAGS.to_vec()
}
/// Returns the four-byte feature tags used for Oriya, in application order.
///
/// A fresh `Vec` is allocated on every call.
pub fn oriya_feature_tags() -> Vec<[u8; 4]> {
    const TAGS: &[[u8; 4]] = &[
        *b"locl", *b"ccmp", *b"nukt", *b"akhn", *b"rphf", *b"pref", *b"blwf", *b"half",
        *b"vatu", *b"cjct", *b"init", *b"pres", *b"abvs", *b"blws", *b"psts", *b"haln",
    ];
    TAGS.to_vec()
}
/// Returns the four-byte feature tags used for Sinhala, in application order.
///
/// The list contains no `nukt` or `rphf` entry. A fresh `Vec` is allocated on
/// every call.
pub fn sinhala_feature_tags() -> Vec<[u8; 4]> {
    const TAGS: &[[u8; 4]] = &[
        *b"locl", *b"ccmp", *b"akhn", *b"pref", *b"blwf", *b"half", *b"pstf", *b"cjct",
        *b"init", *b"pres", *b"abvs", *b"blws", *b"psts", *b"haln",
    ];
    TAGS.to_vec()
}
/// Returns the four-byte feature tags used for Khmer, in application order.
///
/// This is the only list in the file that includes `cfar`. A fresh `Vec` is
/// allocated on every call.
pub fn khmer_feature_tags() -> Vec<[u8; 4]> {
    const TAGS: &[[u8; 4]] = &[
        *b"locl", *b"ccmp", *b"pref", *b"blwf", *b"abvf", *b"pstf", *b"cfar", *b"init",
        *b"pres", *b"abvs", *b"blws", *b"psts", *b"haln",
    ];
    TAGS.to_vec()
}
/// Returns the four-byte feature tags used for Thai, in application order.
///
/// This is the shortest list in the file: only `locl`, `ccmp` and the four
/// `pres`/`abvs`/`blws`/`psts` entries. A fresh `Vec` is allocated on every
/// call.
pub fn thai_feature_tags() -> Vec<[u8; 4]> {
    const TAGS: &[[u8; 4]] = &[
        *b"locl", *b"ccmp", *b"pres", *b"abvs", *b"blws", *b"psts",
    ];
    TAGS.to_vec()
}
/// Returns the pair of four-byte script tags associated with `script`.
///
/// For the scripts that have one, the first element is the `…2` tag (for
/// example `dev2`) and the second is the older four-letter tag (`deva`).
/// Sinhala, Khmer and Thai use the same tag in both positions. Scripts not
/// handled by this module yield `None`.
pub fn script_indic_tags(script: super::arabic::Script) -> Option<([u8; 4], [u8; 4])> {
    let (modern, legacy): (&[u8; 4], &[u8; 4]) = match script {
        super::arabic::Script::Devanagari => (b"dev2", b"deva"),
        super::arabic::Script::Bengali => (b"bng2", b"beng"),
        super::arabic::Script::Tamil => (b"tml2", b"taml"),
        super::arabic::Script::Gurmukhi => (b"gur2", b"guru"),
        super::arabic::Script::Gujarati => (b"gjr2", b"gujr"),
        super::arabic::Script::Telugu => (b"tel2", b"telu"),
        super::arabic::Script::Kannada => (b"knd2", b"knda"),
        super::arabic::Script::Malayalam => (b"mlm2", b"mlym"),
        super::arabic::Script::Oriya => (b"ory2", b"orya"),
        super::arabic::Script::Sinhala => (b"sinh", b"sinh"),
        super::arabic::Script::Khmer => (b"khmr", b"khmr"),
        super::arabic::Script::Thai => (b"thai", b"thai"),
        // Every other script has no tag pair here.
        _ => return None,
    };
    Some((*modern, *legacy))
}
#[cfg(test)]
#[allow(non_snake_case)] mod tests {
use super::*;
#[test]
fn devanagari_category_lookup_returns_consonant_for_ka_U_0915() {
assert_eq!(devanagari_category('\u{0915}'), IndicCategory::Consonant);
}
#[test]
fn devanagari_category_lookup_returns_halant_for_U_094D() {
assert_eq!(devanagari_category('\u{094D}'), IndicCategory::Halant);
}
#[test]
fn devanagari_category_lookup_returns_pre_base_matra_for_U_093F() {
assert_eq!(devanagari_category('\u{093F}'), IndicCategory::PreBaseMatra);
}
#[test]
fn devanagari_category_classifies_vowel_a_as_vowel() {
assert_eq!(devanagari_category('\u{0905}'), IndicCategory::Vowel);
}
#[test]
fn devanagari_category_classifies_anusvara_as_bindu() {
assert_eq!(devanagari_category('\u{0902}'), IndicCategory::Bindu);
}
#[test]
fn devanagari_category_classifies_nukta_as_nukta() {
assert_eq!(devanagari_category('\u{093C}'), IndicCategory::Nukta);
}
#[test]
fn devanagari_category_classifies_post_base_matra_aa_as_matra() {
assert_eq!(devanagari_category('\u{093E}'), IndicCategory::Matra);
}
#[test]
fn devanagari_category_classifies_danda_as_symbol() {
assert_eq!(devanagari_category('\u{0964}'), IndicCategory::Symbol);
}
#[test]
fn devanagari_category_returns_other_for_latin_a() {
assert_eq!(devanagari_category('A'), IndicCategory::Other);
}
#[test]
fn script_of_recognises_devanagari_block() {
use super::super::arabic::{script_of, Script};
assert_eq!(script_of('\u{0915}'), Script::Devanagari);
assert_eq!(script_of('\u{094D}'), Script::Devanagari);
assert_eq!(script_of('\u{097F}'), Script::Devanagari);
}
#[test]
fn script_of_still_classifies_arabic_and_latin_correctly() {
use super::super::arabic::{script_of, Script};
assert_eq!(script_of('\u{0627}'), Script::Arabic);
assert_eq!(script_of('A'), Script::Other);
}
#[test]
fn pre_base_matra_reorders_before_base_consonant() {
let cluster = ['\u{0915}', '\u{093F}'];
let (out, flags) = reorder_cluster(&cluster);
assert_eq!(out, vec!['\u{093F}', '\u{0915}']);
assert!(flags.pre_base_reordered);
assert!(!flags.has_reph);
}
#[test]
fn pre_base_matra_reorders_in_conjunct_cluster() {
let cluster = ['\u{0915}', '\u{094D}', '\u{0937}', '\u{093F}'];
let (out, flags) = reorder_cluster(&cluster);
assert_eq!(out, vec!['\u{093F}', '\u{0915}', '\u{094D}', '\u{0937}']);
assert!(flags.pre_base_reordered);
}
#[test]
fn reph_formation_at_cluster_start_marks_RA_for_superscript() {
let cluster = ['\u{0930}', '\u{094D}', '\u{0915}'];
let (out, flags) = reorder_cluster(&cluster);
assert_eq!(out, vec!['\u{0930}', '\u{094D}', '\u{0915}']);
assert!(flags.has_reph);
assert!(!flags.pre_base_reordered);
}
#[test]
fn reph_with_pre_base_matra_combines_both_flags() {
let cluster = ['\u{0930}', '\u{094D}', '\u{0915}', '\u{093F}'];
let (out, flags) = reorder_cluster(&cluster);
assert_eq!(out, vec!['\u{093F}', '\u{0930}', '\u{094D}', '\u{0915}']);
assert!(flags.has_reph);
assert!(flags.pre_base_reordered);
}
#[test]
fn cluster_without_reph_consonant_does_not_set_flag() {
let cluster = ['\u{0915}', '\u{094D}', '\u{0937}'];
let (_out, flags) = reorder_cluster(&cluster);
assert!(!flags.has_reph);
}
#[test]
fn cluster_boundary_starts_new_cluster_at_consonant_after_vowel() {
let chars = ['\u{0915}', '\u{093E}', '\u{0915}'];
let bounds = cluster_boundaries(&chars);
assert_eq!(bounds, vec![(0, 2), (2, 3)]);
}
#[test]
fn cluster_boundary_keeps_conjunct_in_one_cluster() {
let chars = ['\u{0915}', '\u{094D}', '\u{0937}'];
let bounds = cluster_boundaries(&chars);
assert_eq!(bounds, vec![(0, 3)]);
}
#[test]
fn cluster_boundary_breaks_at_danda_symbol() {
let chars = ['\u{0915}', '\u{0964}', '\u{0915}'];
let bounds = cluster_boundaries(&chars);
assert_eq!(bounds, vec![(0, 1), (1, 2), (2, 3)]);
}
#[test]
fn cluster_boundary_breaks_at_non_indic_codepoint() {
let chars = ['\u{0915}', ' ', '\u{0915}'];
let bounds = cluster_boundaries(&chars);
assert_eq!(bounds, vec![(0, 1), (1, 2), (2, 3)]);
}
#[test]
fn cluster_boundary_handles_empty_input() {
let bounds = cluster_boundaries(&[]);
assert!(bounds.is_empty());
}
#[test]
fn cluster_boundary_single_consonant_is_one_cluster() {
let chars = ['\u{0915}'];
let bounds = cluster_boundaries(&chars);
assert_eq!(bounds, vec![(0, 1)]);
}
#[test]
fn devanagari_feature_tags_are_in_canonical_order() {
let tags = devanagari_feature_tags();
assert_eq!(&tags[0], b"locl");
assert_eq!(&tags[1], b"ccmp");
assert_eq!(&tags[2], b"nukt");
assert_eq!(&tags[3], b"akhn");
assert_eq!(&tags[4], b"rphf");
assert_eq!(tags.last(), Some(b"haln"));
}
#[test]
fn empty_cluster_reorder_returns_empty() {
let (out, flags) = reorder_cluster(&[]);
assert!(out.is_empty());
assert_eq!(flags, ClusterFlags::default());
}
#[test]
fn single_consonant_cluster_does_not_reorder() {
let cluster = ['\u{0915}'];
let (out, flags) = reorder_cluster(&cluster);
assert_eq!(out, vec!['\u{0915}']);
assert!(!flags.pre_base_reordered);
assert!(!flags.has_reph);
}
#[test]
fn two_clusters_with_pre_base_matras_each_reorder_independently() {
let chars = ['\u{0915}', '\u{093F}', '\u{0915}', '\u{093F}'];
let bounds = cluster_boundaries(&chars);
assert_eq!(bounds, vec![(0, 2), (2, 4)]);
for (s, e) in bounds {
let (out, flags) = reorder_cluster(&chars[s..e]);
assert_eq!(out, vec!['\u{093F}', '\u{0915}']);
assert!(flags.pre_base_reordered);
}
}
#[test]
fn bengali_category_classifies_ka_as_consonant() {
assert_eq!(bengali_category('\u{0995}'), IndicCategory::Consonant);
}
#[test]
fn bengali_category_classifies_ra_as_consonant() {
assert_eq!(bengali_category('\u{09B0}'), IndicCategory::Consonant);
}
#[test]
fn bengali_category_classifies_halant_as_halant() {
assert_eq!(bengali_category('\u{09CD}'), IndicCategory::Halant);
}
#[test]
fn bengali_category_classifies_nukta_as_nukta() {
assert_eq!(bengali_category('\u{09BC}'), IndicCategory::Nukta);
}
#[test]
fn bengali_category_pre_base_matras_i_e_ai() {
assert_eq!(bengali_category('\u{09BF}'), IndicCategory::PreBaseMatra);
assert_eq!(bengali_category('\u{09C7}'), IndicCategory::PreBaseMatra);
assert_eq!(bengali_category('\u{09C8}'), IndicCategory::PreBaseMatra);
}
#[test]
fn bengali_category_classifies_aa_matra_as_matra() {
assert_eq!(bengali_category('\u{09BE}'), IndicCategory::Matra);
}
#[test]
fn bengali_category_classifies_anusvara_as_bindu() {
assert_eq!(bengali_category('\u{0982}'), IndicCategory::Bindu);
}
#[test]
fn bengali_category_classifies_independent_vowel_a_as_vowel() {
assert_eq!(bengali_category('\u{0985}'), IndicCategory::Vowel);
}
#[test]
fn bengali_category_returns_other_for_devanagari_codepoint() {
assert_eq!(bengali_category('\u{0915}'), IndicCategory::Other);
}
#[test]
fn bengali_pre_base_matra_i_reorders_before_base() {
let cluster = ['\u{0995}', '\u{09BF}'];
let (out, flags) = reorder_cluster_with(&cluster, &BENGALI_RULES);
assert_eq!(out, vec!['\u{09BF}', '\u{0995}']);
assert!(flags.pre_base_reordered);
assert!(!flags.has_reph);
}
#[test]
fn bengali_pre_base_matra_e_reorders_before_base() {
let cluster = ['\u{0995}', '\u{09C7}'];
let (out, flags) = reorder_cluster_with(&cluster, &BENGALI_RULES);
assert_eq!(out, vec!['\u{09C7}', '\u{0995}']);
assert!(flags.pre_base_reordered);
}
#[test]
fn bengali_pre_base_matra_ai_reorders_before_base() {
let cluster = ['\u{0995}', '\u{09C8}'];
let (out, flags) = reorder_cluster_with(&cluster, &BENGALI_RULES);
assert_eq!(out, vec!['\u{09C8}', '\u{0995}']);
assert!(flags.pre_base_reordered);
}
#[test]
fn bengali_reph_formation_marks_RA_for_superscript() {
let cluster = ['\u{09B0}', '\u{09CD}', '\u{0995}'];
let (out, flags) = reorder_cluster_with(&cluster, &BENGALI_RULES);
assert_eq!(out, vec!['\u{09B0}', '\u{09CD}', '\u{0995}']);
assert!(flags.has_reph);
}
#[test]
fn bengali_conjunct_keeps_in_one_cluster() {
let chars = ['\u{0995}', '\u{09CD}', '\u{09B7}'];
let bounds = cluster_boundaries_with(&chars, bengali_category);
assert_eq!(bounds, vec![(0, 3)]);
}
#[test]
fn tamil_category_classifies_ka_as_consonant() {
assert_eq!(tamil_category('\u{0B95}'), IndicCategory::Consonant);
}
#[test]
fn tamil_category_classifies_ra_as_consonant() {
assert_eq!(tamil_category('\u{0BB0}'), IndicCategory::Consonant);
}
#[test]
fn tamil_category_classifies_pulli_as_halant() {
assert_eq!(tamil_category('\u{0BCD}'), IndicCategory::Halant);
}
#[test]
fn tamil_category_pre_base_matras_e_ee_ai() {
assert_eq!(tamil_category('\u{0BC6}'), IndicCategory::PreBaseMatra);
assert_eq!(tamil_category('\u{0BC7}'), IndicCategory::PreBaseMatra);
assert_eq!(tamil_category('\u{0BC8}'), IndicCategory::PreBaseMatra);
}
#[test]
fn tamil_category_classifies_aa_matra_as_matra() {
assert_eq!(tamil_category('\u{0BBE}'), IndicCategory::Matra);
}
#[test]
fn tamil_category_classifies_anusvara_as_bindu() {
assert_eq!(tamil_category('\u{0B82}'), IndicCategory::Bindu);
}
#[test]
fn tamil_category_classifies_independent_vowel_a_as_vowel() {
assert_eq!(tamil_category('\u{0B85}'), IndicCategory::Vowel);
}
#[test]
fn tamil_category_returns_other_for_devanagari_codepoint() {
assert_eq!(tamil_category('\u{0915}'), IndicCategory::Other);
}
#[test]
fn tamil_pre_base_matra_e_reorders_before_base() {
let cluster = ['\u{0B95}', '\u{0BC6}'];
let (out, flags) = reorder_cluster_with(&cluster, &TAMIL_RULES);
assert_eq!(out, vec!['\u{0BC6}', '\u{0B95}']);
assert!(flags.pre_base_reordered);
}
#[test]
fn tamil_RA_plus_halant_does_NOT_set_reph_flag() {
let cluster = ['\u{0BB0}', '\u{0BCD}', '\u{0B95}'];
let (_out, flags) = reorder_cluster_with(&cluster, &TAMIL_RULES);
assert!(!flags.has_reph, "Tamil reph_enabled is false");
}
#[test]
fn tamil_cluster_boundary_keeps_pulli_chain_in_one_cluster() {
let chars = ['\u{0B95}', '\u{0BCD}', '\u{0B95}'];
let bounds = cluster_boundaries_with(&chars, tamil_category);
assert_eq!(bounds, vec![(0, 3)]);
}
#[test]
fn tamil_feature_tags_omit_rphf_and_cjct() {
let tags = tamil_feature_tags();
assert!(!tags.contains(b"rphf"), "Tamil has no reph feature");
assert!(!tags.contains(b"cjct"), "Tamil has no conjunct feature");
assert!(!tags.contains(b"vatu"), "Tamil has no vattu feature");
assert!(tags.contains(b"pref"), "Tamil emits the pref feature");
}
#[test]
fn bengali_feature_tags_match_devanagari_shape() {
assert_eq!(bengali_feature_tags(), devanagari_feature_tags());
}
#[test]
fn script_indic_tags_returns_modern_and_legacy_pair_for_devanagari() {
use super::super::arabic::Script;
let pair = script_indic_tags(Script::Devanagari);
assert_eq!(pair, Some((*b"dev2", *b"deva")));
}
#[test]
fn script_indic_tags_returns_pair_for_bengali_and_tamil() {
use super::super::arabic::Script;
assert_eq!(
script_indic_tags(Script::Bengali),
Some((*b"bng2", *b"beng"))
);
assert_eq!(script_indic_tags(Script::Tamil), Some((*b"tml2", *b"taml")));
}
#[test]
fn script_indic_tags_returns_none_for_arabic_or_other() {
use super::super::arabic::Script;
assert_eq!(script_indic_tags(Script::Arabic), None);
assert_eq!(script_indic_tags(Script::Other), None);
}
#[test]
fn gurmukhi_category_classifies_ka_as_consonant() {
assert_eq!(gurmukhi_category('\u{0A15}'), IndicCategory::Consonant);
}
#[test]
fn gurmukhi_category_classifies_halant_as_halant() {
assert_eq!(gurmukhi_category('\u{0A4D}'), IndicCategory::Halant);
}
#[test]
fn gurmukhi_category_classifies_pre_base_matra_i() {
assert_eq!(gurmukhi_category('\u{0A3F}'), IndicCategory::PreBaseMatra);
}
#[test]
fn gurmukhi_pre_base_matra_i_reorders_before_base() {
let cluster = ['\u{0A15}', '\u{0A3F}'];
let (out, flags) = reorder_cluster_with(&cluster, &GURMUKHI_RULES);
assert_eq!(out, vec!['\u{0A3F}', '\u{0A15}']);
assert!(flags.pre_base_reordered);
}
#[test]
fn gurmukhi_reph_marks_RA_for_superscript() {
let cluster = ['\u{0A30}', '\u{0A4D}', '\u{0A15}'];
let (_out, flags) = reorder_cluster_with(&cluster, &GURMUKHI_RULES);
assert!(flags.has_reph);
}
#[test]
fn gujarati_category_classifies_ka_as_consonant() {
assert_eq!(gujarati_category('\u{0A95}'), IndicCategory::Consonant);
}
#[test]
fn gujarati_category_classifies_halant_as_halant() {
assert_eq!(gujarati_category('\u{0ACD}'), IndicCategory::Halant);
}
#[test]
fn gujarati_category_classifies_pre_base_matra_i() {
assert_eq!(gujarati_category('\u{0ABF}'), IndicCategory::PreBaseMatra);
}
#[test]
fn gujarati_pre_base_matra_i_reorders_before_base() {
let cluster = ['\u{0A95}', '\u{0ABF}'];
let (out, flags) = reorder_cluster_with(&cluster, &GUJARATI_RULES);
assert_eq!(out, vec!['\u{0ABF}', '\u{0A95}']);
assert!(flags.pre_base_reordered);
}
#[test]
fn gujarati_reph_marks_RA_for_superscript() {
let cluster = ['\u{0AB0}', '\u{0ACD}', '\u{0A95}'];
let (_out, flags) = reorder_cluster_with(&cluster, &GUJARATI_RULES);
assert!(flags.has_reph);
}
#[test]
fn telugu_category_classifies_ka_as_consonant() {
assert_eq!(telugu_category('\u{0C15}'), IndicCategory::Consonant);
}
#[test]
fn telugu_category_classifies_halant_as_halant() {
assert_eq!(telugu_category('\u{0C4D}'), IndicCategory::Halant);
}
#[test]
fn telugu_pre_base_matras_e_ee_ai() {
assert_eq!(telugu_category('\u{0C46}'), IndicCategory::PreBaseMatra);
assert_eq!(telugu_category('\u{0C47}'), IndicCategory::PreBaseMatra);
assert_eq!(telugu_category('\u{0C48}'), IndicCategory::PreBaseMatra);
}
#[test]
fn telugu_pre_base_matra_e_reorders_before_base() {
let cluster = ['\u{0C15}', '\u{0C46}'];
let (out, flags) = reorder_cluster_with(&cluster, &TELUGU_RULES);
assert_eq!(out, vec!['\u{0C46}', '\u{0C15}']);
assert!(flags.pre_base_reordered);
}
#[test]
fn telugu_reph_marks_RA_for_superscript() {
let cluster = ['\u{0C30}', '\u{0C4D}', '\u{0C15}'];
let (_out, flags) = reorder_cluster_with(&cluster, &TELUGU_RULES);
assert!(flags.has_reph);
}
#[test]
fn kannada_category_classifies_ka_as_consonant() {
assert_eq!(kannada_category('\u{0C95}'), IndicCategory::Consonant);
}
/// U+0CCD must be categorised as the Kannada halant.
#[test]
fn kannada_category_classifies_halant_as_halant() {
    let category = kannada_category('\u{0CCD}');
    assert_eq!(category, IndicCategory::Halant);
}
/// U+0CC6, U+0CC7 and U+0CC8 must all be categorised as pre-base matras.
#[test]
fn kannada_pre_base_matras_e_ee_ai() {
    // One assertion per code point so a failure points at the offending line.
    let expected = IndicCategory::PreBaseMatra;
    assert_eq!(kannada_category('\u{0CC6}'), expected);
    assert_eq!(kannada_category('\u{0CC7}'), expected);
    assert_eq!(kannada_category('\u{0CC8}'), expected);
}
/// Reordering `[U+0C95, U+0CC6]` with the Kannada rules must yield the matra
/// first and set `pre_base_reordered`.
#[test]
fn kannada_pre_base_matra_e_reorders_before_base() {
    let input = ['\u{0C95}', '\u{0CC6}'];
    let expected = vec!['\u{0CC6}', '\u{0C95}'];
    let (reordered, flags) = reorder_cluster_with(&input, &KANNADA_RULES);
    assert_eq!(reordered, expected);
    assert!(flags.pre_base_reordered);
}
/// The sequence U+0CB0, U+0CCD, U+0C95 must set `has_reph` under the Kannada
/// rules; the reordered output itself is not inspected.
#[test]
fn kannada_reph_marks_RA_for_superscript() {
    let ra_halant_ka = ['\u{0CB0}', '\u{0CCD}', '\u{0C95}'];
    let (_, flags) = reorder_cluster_with(&ra_halant_ka, &KANNADA_RULES);
    assert!(flags.has_reph);
}
/// U+0D15 (Malayalam KA) must be categorised as a consonant.
#[test]
fn malayalam_category_classifies_ka_as_consonant() {
    let category = malayalam_category('\u{0D15}');
    assert_eq!(category, IndicCategory::Consonant);
}
/// U+0D4D must be categorised as the Malayalam halant.
#[test]
fn malayalam_category_classifies_halant_as_halant() {
    let category = malayalam_category('\u{0D4D}');
    assert_eq!(category, IndicCategory::Halant);
}
/// U+0D46, U+0D47 and U+0D48 must all be categorised as pre-base matras.
#[test]
fn malayalam_pre_base_matras_e_ee_ai() {
    // One assertion per code point so a failure points at the offending line.
    let expected = IndicCategory::PreBaseMatra;
    assert_eq!(malayalam_category('\u{0D46}'), expected);
    assert_eq!(malayalam_category('\u{0D47}'), expected);
    assert_eq!(malayalam_category('\u{0D48}'), expected);
}
/// Every code point in U+0D7A..=U+0D7F must be categorised as a consonant.
#[test]
fn malayalam_chillu_classified_as_consonant() {
    for chillu in '\u{0D7A}'..='\u{0D7F}' {
        // Numeric value is only needed for the failure message.
        let cp = u32::from(chillu);
        assert_eq!(
            malayalam_category(chillu),
            IndicCategory::Consonant,
            "chillu U+{cp:04X} should be Consonant"
        );
    }
}
/// Reordering `[U+0D15, U+0D46]` with the Malayalam rules must yield the matra
/// first and set `pre_base_reordered`.
#[test]
fn malayalam_pre_base_matra_e_reorders_before_base() {
    let input = ['\u{0D15}', '\u{0D46}'];
    let expected = vec!['\u{0D46}', '\u{0D15}'];
    let (reordered, flags) = reorder_cluster_with(&input, &MALAYALAM_RULES);
    assert_eq!(reordered, expected);
    assert!(flags.pre_base_reordered);
}
/// The sequence U+0D30, U+0D4D, U+0D15 must leave `has_reph` unset under the
/// Malayalam rules.
#[test]
fn malayalam_RA_plus_halant_does_NOT_set_reph_flag() {
    let ra_halant_ka = ['\u{0D30}', '\u{0D4D}', '\u{0D15}'];
    let (_, flags) = reorder_cluster_with(&ra_halant_ka, &MALAYALAM_RULES);
    assert!(!flags.has_reph);
}
/// U+0B15 (Oriya KA) must be categorised as a consonant.
#[test]
fn oriya_category_classifies_ka_as_consonant() {
    let category = oriya_category('\u{0B15}');
    assert_eq!(category, IndicCategory::Consonant);
}
/// U+0B4D must be categorised as the Oriya halant.
#[test]
fn oriya_category_classifies_halant_as_halant() {
    let category = oriya_category('\u{0B4D}');
    assert_eq!(category, IndicCategory::Halant);
}
/// U+0B47, U+0B48, U+0B4B and U+0B4C must all be categorised as pre-base
/// matras.
#[test]
fn oriya_pre_base_matras_e_ai_o_au() {
    // One assertion per code point so a failure points at the offending line.
    let expected = IndicCategory::PreBaseMatra;
    assert_eq!(oriya_category('\u{0B47}'), expected);
    assert_eq!(oriya_category('\u{0B48}'), expected);
    assert_eq!(oriya_category('\u{0B4B}'), expected);
    assert_eq!(oriya_category('\u{0B4C}'), expected);
}
/// Reordering `[U+0B15, U+0B47]` with the Oriya rules must yield the matra
/// first and set `pre_base_reordered`.
#[test]
fn oriya_pre_base_matra_e_reorders_before_base() {
    let input = ['\u{0B15}', '\u{0B47}'];
    let expected = vec!['\u{0B47}', '\u{0B15}'];
    let (reordered, flags) = reorder_cluster_with(&input, &ORIYA_RULES);
    assert_eq!(reordered, expected);
    assert!(flags.pre_base_reordered);
}
/// The sequence U+0B30, U+0B4D, U+0B15 must set `has_reph` under the Oriya
/// rules; the reordered output itself is not inspected.
#[test]
fn oriya_reph_marks_RA_for_superscript() {
    let ra_halant_ka = ['\u{0B30}', '\u{0B4D}', '\u{0B15}'];
    let (_, flags) = reorder_cluster_with(&ra_halant_ka, &ORIYA_RULES);
    assert!(flags.has_reph);
}
/// `script_indic_tags` must return the expected pair of four-byte tags for
/// each of the six scripts listed in the table below.
#[test]
fn script_indic_tags_returns_pair_for_all_round11_scripts() {
    use super::super::arabic::Script;
    // (script, expected tag pair), checked in the order listed.
    let expectations = [
        (Script::Gurmukhi, (*b"gur2", *b"guru")),
        (Script::Gujarati, (*b"gjr2", *b"gujr")),
        (Script::Telugu, (*b"tel2", *b"telu")),
        (Script::Kannada, (*b"knd2", *b"knda")),
        (Script::Malayalam, (*b"mlm2", *b"mlym")),
        (Script::Oriya, (*b"ory2", *b"orya")),
    ];
    for (script, tag_pair) in expectations {
        assert_eq!(script_indic_tags(script), Some(tag_pair));
    }
}
/// The Telugu feature list must contain `pref`, `pstf`, `abvf` and `rphf`.
#[test]
fn telugu_feature_tags_includes_pref_pstf_abvf_position_features() {
    let tags = telugu_feature_tags();
    // (required tag, failure message), checked in the order listed.
    let required: [(&[u8; 4], &str); 4] = [
        (b"pref", "Telugu emits pref"),
        (b"pstf", "Telugu emits pstf"),
        (b"abvf", "Telugu emits abvf"),
        (b"rphf", "Telugu emits rphf"),
    ];
    for (tag, why) in required {
        assert!(tags.contains(tag), "{why}");
    }
}
/// The Malayalam feature list must not contain `rphf` but must contain
/// `pref`, `pstf`, `abvf` and `blwf`.
#[test]
fn malayalam_feature_tags_omits_rphf_keeps_position_features() {
    let tags = malayalam_feature_tags();
    let has = |tag: &[u8; 4]| tags.contains(tag);
    assert!(!has(b"rphf"), "Malayalam has no rphf (chillu)");
    assert!(has(b"pref"));
    assert!(has(b"pstf"));
    assert!(has(b"abvf"));
    assert!(has(b"blwf"));
}
/// The Gujarati feature list must compare equal to the Devanagari one.
#[test]
fn gujarati_feature_tags_match_devanagari_shape() {
    let gujarati = gujarati_feature_tags();
    let devanagari = devanagari_feature_tags();
    assert_eq!(gujarati, devanagari);
}
/// The Gurmukhi feature list must compare equal to the Devanagari one.
#[test]
fn gurmukhi_feature_tags_match_devanagari_shape() {
    let gurmukhi = gurmukhi_feature_tags();
    let devanagari = devanagari_feature_tags();
    assert_eq!(gurmukhi, devanagari);
}
/// U+0D9A (Sinhala ka) must be categorised as a consonant.
#[test]
fn sinhala_category_classifies_ka_as_consonant() {
    let category = sinhala_category('\u{0D9A}');
    assert_eq!(category, IndicCategory::Consonant);
}
/// U+0DCA (al-lakuna) must be categorised as the Sinhala halant.
#[test]
fn sinhala_category_classifies_al_lakuna_as_halant() {
    let category = sinhala_category('\u{0DCA}');
    assert_eq!(category, IndicCategory::Halant);
}
/// U+0DD9, U+0DDA and U+0DDB must all be categorised as pre-base matras.
#[test]
fn sinhala_pre_base_matras_e_ee_ai() {
    // One assertion per code point so a failure points at the offending line.
    let expected = IndicCategory::PreBaseMatra;
    assert_eq!(sinhala_category('\u{0DD9}'), expected);
    assert_eq!(sinhala_category('\u{0DDA}'), expected);
    assert_eq!(sinhala_category('\u{0DDB}'), expected);
}
/// U+0DDC, U+0DDD and U+0DDE must all be categorised as pre-base matras.
#[test]
fn sinhala_pre_base_two_part_vowels_o_oo_au() {
    // One assertion per code point so a failure points at the offending line.
    let expected = IndicCategory::PreBaseMatra;
    assert_eq!(sinhala_category('\u{0DDC}'), expected);
    assert_eq!(sinhala_category('\u{0DDD}'), expected);
    assert_eq!(sinhala_category('\u{0DDE}'), expected);
}
/// U+0DCF must be categorised as an ordinary (non pre-base) matra.
#[test]
fn sinhala_aa_matra_classified_as_matra() {
    let category = sinhala_category('\u{0DCF}');
    assert_eq!(category, IndicCategory::Matra);
}
/// U+0D82 must be categorised as a bindu.
#[test]
fn sinhala_anusvara_classified_as_bindu() {
    let category = sinhala_category('\u{0D82}');
    assert_eq!(category, IndicCategory::Bindu);
}
/// U+0D85 must be categorised as an independent vowel.
#[test]
fn sinhala_independent_vowel_a_classified_as_vowel() {
    let category = sinhala_category('\u{0D85}');
    assert_eq!(category, IndicCategory::Vowel);
}
/// A Devanagari code point (U+0915) passed to the Sinhala classifier must
/// come back as `Other`.
#[test]
fn sinhala_returns_other_for_devanagari_codepoint() {
    let category = sinhala_category('\u{0915}');
    assert_eq!(category, IndicCategory::Other);
}
/// Reordering `[U+0D9A, U+0DD9]` with the Sinhala rules must yield the matra
/// first and set `pre_base_reordered`.
#[test]
fn sinhala_pre_base_matra_e_reorders_before_base() {
    let input = ['\u{0D9A}', '\u{0DD9}'];
    let expected = vec!['\u{0DD9}', '\u{0D9A}'];
    let (reordered, flags) = reorder_cluster_with(&input, &SINHALA_RULES);
    assert_eq!(reordered, expected);
    assert!(flags.pre_base_reordered);
}
/// The sequence U+0DBB, U+0DCA, U+0D9A must leave `has_reph` unset under the
/// Sinhala rules.
#[test]
fn sinhala_RA_plus_halant_does_NOT_set_reph_flag() {
    let ra_halant_ka = ['\u{0DBB}', '\u{0DCA}', '\u{0D9A}'];
    let (_, flags) = reorder_cluster_with(&ra_halant_ka, &SINHALA_RULES);
    assert!(!flags.has_reph, "Sinhala reph_enabled is false");
}
/// Consonant + al-lakuna + consonant (U+0D9A, U+0DCA, U+0DC2) must be reported
/// as a single cluster spanning all three characters.
#[test]
fn sinhala_conjunct_keeps_in_one_cluster() {
    let conjunct = ['\u{0D9A}', '\u{0DCA}', '\u{0DC2}'];
    let expected = vec![(0, 3)];
    let spans = cluster_boundaries_with(&conjunct, sinhala_category);
    assert_eq!(spans, expected);
}
/// The Sinhala feature list must not contain `rphf` but must contain `pref`,
/// `blwf`, `pstf` and `akhn`.
#[test]
fn sinhala_feature_tags_omit_rphf() {
    let tags = sinhala_feature_tags();
    let has = |tag: &[u8; 4]| tags.contains(tag);
    assert!(!has(b"rphf"), "Sinhala has no reph feature");
    assert!(has(b"pref"));
    assert!(has(b"blwf"));
    assert!(has(b"pstf"));
    assert!(has(b"akhn"));
}
/// U+1780 (Khmer KA) must be categorised as a consonant.
#[test]
fn khmer_category_classifies_ka_as_consonant() {
    let category = khmer_category('\u{1780}');
    assert_eq!(category, IndicCategory::Consonant);
}
/// U+17D2 (coeng) must be categorised as the Khmer halant.
#[test]
fn khmer_category_classifies_coeng_as_halant() {
    let category = khmer_category('\u{17D2}');
    assert_eq!(category, IndicCategory::Halant);
}
/// Every code point in U+17BE..=U+17C3 must be categorised as a pre-base
/// matra.
#[test]
fn khmer_pre_base_matras_oe_ya_ie_e_ae_ai() {
    for matra in '\u{17BE}'..='\u{17C3}' {
        // Numeric value is only needed for the failure message.
        let cp = u32::from(matra);
        assert_eq!(
            khmer_category(matra),
            IndicCategory::PreBaseMatra,
            "U+{cp:04X} should be PreBaseMatra"
        );
    }
}
/// U+17C4 and U+17C5 must both be categorised as pre-base matras.
#[test]
fn khmer_pre_base_two_part_vowels_oo_au() {
    let expected = IndicCategory::PreBaseMatra;
    assert_eq!(khmer_category('\u{17C4}'), expected);
    assert_eq!(khmer_category('\u{17C5}'), expected);
}
/// U+17C6 (nikahit) and U+17C7 (reahmuk) must both be categorised as bindu.
#[test]
fn khmer_nikahit_and_reahmuk_classified_as_bindu() {
    let expected = IndicCategory::Bindu;
    assert_eq!(khmer_category('\u{17C6}'), expected);
    assert_eq!(khmer_category('\u{17C7}'), expected);
}
/// U+17B6 must be categorised as an ordinary (non pre-base) matra.
#[test]
fn khmer_aa_classified_as_matra() {
    let category = khmer_category('\u{17B6}');
    assert_eq!(category, IndicCategory::Matra);
}
/// U+17A5 must be categorised as an independent vowel.
#[test]
fn khmer_independent_vowel_a_classified_as_vowel() {
    let category = khmer_category('\u{17A5}');
    assert_eq!(category, IndicCategory::Vowel);
}
/// A Devanagari code point (U+0915) passed to the Khmer classifier must come
/// back as `Other`.
#[test]
fn khmer_returns_other_for_devanagari_codepoint() {
    let category = khmer_category('\u{0915}');
    assert_eq!(category, IndicCategory::Other);
}
/// Reordering `[U+1780, U+17C1]` with the Khmer rules must yield the matra
/// first and set `pre_base_reordered`.
#[test]
fn khmer_pre_base_matra_e_reorders_before_base() {
    let input = ['\u{1780}', '\u{17C1}'];
    let expected = vec!['\u{17C1}', '\u{1780}'];
    let (reordered, flags) = reorder_cluster_with(&input, &KHMER_RULES);
    assert_eq!(reordered, expected);
    assert!(flags.pre_base_reordered);
}
/// Consonant + coeng + consonant (U+1780, U+17D2, U+1781) must be reported as
/// a single cluster spanning all three characters.
#[test]
fn khmer_coeng_chains_subjoined_consonant_into_one_cluster() {
    let stacked = ['\u{1780}', '\u{17D2}', '\u{1781}'];
    let expected = vec![(0, 3)];
    let spans = cluster_boundaries_with(&stacked, khmer_category);
    assert_eq!(spans, expected);
}
/// Three consonants joined by two coengs must be reported as a single cluster
/// spanning all five characters.
#[test]
fn khmer_three_consonant_subjoined_chain_in_one_cluster() {
    let stacked = ['\u{1780}', '\u{17D2}', '\u{1781}', '\u{17D2}', '\u{1782}'];
    let expected = vec![(0, 5)];
    let spans = cluster_boundaries_with(&stacked, khmer_category);
    assert_eq!(spans, expected);
}
/// The sequence U+179A, U+17D2, U+1780 must leave `has_reph` unset under the
/// Khmer rules.
#[test]
fn khmer_RA_plus_coeng_does_NOT_set_reph_flag() {
    let ra_coeng_ka = ['\u{179A}', '\u{17D2}', '\u{1780}'];
    let (_, flags) = reorder_cluster_with(&ra_coeng_ka, &KHMER_RULES);
    assert!(!flags.has_reph, "Khmer reph_enabled is false");
}
/// The Khmer feature list must not contain `rphf` but must contain `pref`,
/// `blwf`, `pstf`, `abvf` and `cfar`.
#[test]
fn khmer_feature_tags_omit_rphf_keep_pref_blwf_pstf() {
    let tags = khmer_feature_tags();
    let has = |tag: &[u8; 4]| tags.contains(tag);
    assert!(!has(b"rphf"), "Khmer has no reph feature");
    assert!(has(b"pref"));
    assert!(has(b"blwf"));
    assert!(has(b"pstf"));
    assert!(has(b"abvf"));
    assert!(has(b"cfar"), "Khmer emits cfar");
}
/// U+0E01 (Thai KO KAI) must be categorised as a consonant.
#[test]
fn thai_category_classifies_ko_kai_as_consonant() {
    let category = thai_category('\u{0E01}');
    assert_eq!(category, IndicCategory::Consonant);
}
/// No code point in U+0E01..=U+0E2E may be categorised as a halant.
#[test]
fn thai_category_has_no_halant() {
    for ch in '\u{0E01}'..='\u{0E2E}' {
        // Numeric value is only needed for the failure message.
        let cp = u32::from(ch);
        assert_ne!(
            thai_category(ch),
            IndicCategory::Halant,
            "U+{cp:04X} must not be Halant"
        );
    }
}
/// Every code point in U+0E40..=U+0E44 must be categorised as `Vowel`, not as
/// a pre-base matra.
#[test]
fn thai_pre_base_vowels_classified_as_vowel() {
    for leading_vowel in '\u{0E40}'..='\u{0E44}' {
        // Numeric value is only needed for the failure message.
        let cp = u32::from(leading_vowel);
        assert_eq!(
            thai_category(leading_vowel),
            IndicCategory::Vowel,
            "U+{cp:04X} should be Vowel (pre-base in storage order)"
        );
    }
}
/// U+0E31, U+0E34..=U+0E37 and U+0E47 must all be categorised as `Matra`.
///
/// Each assertion names the code point under test, matching the other
/// range-based tests in this module, so a failure identifies the offending
/// character rather than only the mismatched categories.
#[test]
fn thai_above_base_vowel_signs_classified_as_matra() {
    assert_eq!(
        thai_category('\u{0E31}'),
        IndicCategory::Matra,
        "U+0E31 should be Matra"
    );
    for ch in '\u{0E34}'..='\u{0E37}' {
        let cp = u32::from(ch);
        assert_eq!(
            thai_category(ch),
            IndicCategory::Matra,
            "U+{cp:04X} should be Matra"
        );
    }
    assert_eq!(
        thai_category('\u{0E47}'),
        IndicCategory::Matra,
        "U+0E47 should be Matra"
    );
}
/// Every code point in U+0E38..=U+0E3A must be categorised as `Matra`.
///
/// The assertion names the code point under test, matching the other
/// range-based tests in this module, so a failure identifies the offending
/// character rather than only the mismatched categories.
#[test]
fn thai_below_base_vowel_signs_classified_as_matra() {
    for ch in '\u{0E38}'..='\u{0E3A}' {
        let cp = u32::from(ch);
        assert_eq!(
            thai_category(ch),
            IndicCategory::Matra,
            "U+{cp:04X} should be Matra"
        );
    }
}
/// Every code point in U+0E48..=U+0E4B must be categorised as `Bindu`.
///
/// The assertion names the code point under test, matching the other
/// range-based tests in this module, so a failure identifies the offending
/// character rather than only the mismatched categories.
#[test]
fn thai_tone_marks_classified_as_bindu() {
    for ch in '\u{0E48}'..='\u{0E4B}' {
        let cp = u32::from(ch);
        assert_eq!(
            thai_category(ch),
            IndicCategory::Bindu,
            "U+{cp:04X} should be Bindu"
        );
    }
}
/// A Devanagari code point (U+0915) passed to the Thai classifier must come
/// back as `Other`.
#[test]
fn thai_returns_other_for_devanagari_codepoint() {
    let category = thai_category('\u{0915}');
    assert_eq!(category, IndicCategory::Other);
}
/// U+0E40 followed by U+0E01 must be split into two one-character clusters.
#[test]
fn thai_pre_base_vowel_starts_new_cluster_before_consonant() {
    let text = ['\u{0E40}', '\u{0E01}'];
    let expected = vec![(0, 1), (1, 2)];
    let spans = cluster_boundaries_with(&text, thai_category);
    assert_eq!(spans, expected);
}
/// U+0E01 followed by the tone mark U+0E48 must form a single cluster.
#[test]
fn thai_consonant_with_tone_mark_in_one_cluster() {
    let text = ['\u{0E01}', '\u{0E48}'];
    let expected = vec![(0, 2)];
    let spans = cluster_boundaries_with(&text, thai_category);
    assert_eq!(spans, expected);
}
/// U+0E01, U+0E34 and U+0E49 together must form a single three-character
/// cluster.
#[test]
fn thai_consonant_with_above_vowel_and_tone_mark_in_one_cluster() {
    let text = ['\u{0E01}', '\u{0E34}', '\u{0E49}'];
    let expected = vec![(0, 3)];
    let spans = cluster_boundaries_with(&text, thai_category);
    assert_eq!(spans, expected);
}
/// Three consecutive consonants (U+0E01, U+0E02, U+0E03) must be split into
/// three one-character clusters.
#[test]
fn thai_each_consonant_starts_new_cluster() {
    let text = ['\u{0E01}', '\u{0E02}', '\u{0E03}'];
    let expected = vec![(0, 1), (1, 2), (2, 3)];
    let spans = cluster_boundaries_with(&text, thai_category);
    assert_eq!(spans, expected);
}
/// Reordering `[U+0E01, U+0E32]` with the Thai rules must return the input
/// unchanged and leave both `pre_base_reordered` and `has_reph` unset.
#[test]
fn thai_no_pre_base_matra_reorder() {
    let input = ['\u{0E01}', '\u{0E32}'];
    let (reordered, flags) = reorder_cluster_with(&input, &THAI_RULES);
    // Expected output is the input order, unchanged.
    assert_eq!(reordered, input.to_vec());
    assert!(!flags.pre_base_reordered);
    assert!(!flags.has_reph);
}
/// A cluster starting with U+0E23 must leave `has_reph` unset under the Thai
/// rules.
#[test]
fn thai_RA_plus_anything_does_NOT_set_reph_flag() {
    let ra_then_ka = ['\u{0E23}', '\u{0E01}'];
    let (_, flags) = reorder_cluster_with(&ra_then_ka, &THAI_RULES);
    assert!(!flags.has_reph);
}
/// The Thai feature list must exclude `rphf`, `half`, `pref`, `blwf`, `pstf`
/// and `cjct`, and must include `pres`, `abvs`, `blws` and `psts`.
#[test]
fn thai_feature_tags_omit_halant_features() {
    let tags = thai_feature_tags();
    let has = |tag: &[u8; 4]| tags.contains(tag);

    // Tags that must be absent.
    assert!(!has(b"rphf"));
    assert!(!has(b"half"));
    assert!(!has(b"pref"));
    assert!(!has(b"blwf"));
    assert!(!has(b"pstf"));
    assert!(!has(b"cjct"));

    // Tags that must be present.
    assert!(has(b"pres"));
    assert!(has(b"abvs"));
    assert!(has(b"blws"));
    assert!(has(b"psts"));
}
/// `script_indic_tags` must return the expected pair of four-byte tags for
/// Sinhala, Khmer and Thai; each pair repeats the same tag twice.
#[test]
fn script_indic_tags_returns_pair_for_round12_scripts() {
    use super::super::arabic::Script;
    // (script, expected tag pair), checked in the order listed.
    let expectations = [
        (Script::Sinhala, (*b"sinh", *b"sinh")),
        (Script::Khmer, (*b"khmr", *b"khmr")),
        (Script::Thai, (*b"thai", *b"thai")),
    ];
    for (script, tag_pair) in expectations {
        assert_eq!(script_indic_tags(script), Some(tag_pair));
    }
}
/// `script_of` must map two sample code points from each of the Sinhala,
/// Khmer and Thai blocks to the matching `Script` value.
#[test]
fn script_of_recognises_round12_blocks() {
    use super::super::arabic::{script_of, Script};
    // (sample character, expected script), checked in the order listed.
    let samples = [
        ('\u{0D9A}', Script::Sinhala),
        ('\u{0DCA}', Script::Sinhala),
        ('\u{1780}', Script::Khmer),
        ('\u{17D2}', Script::Khmer),
        ('\u{0E01}', Script::Thai),
        ('\u{0E40}', Script::Thai),
    ];
    for (ch, script) in samples {
        assert_eq!(script_of(ch), script);
    }
}
}