#![allow(clippy::manual_range_contains)]
/// Cursive-joining class of a single code point, as returned by `joining_class`.
///
/// The variant notes below describe how `compute_forms` in this file treats
/// each class. "Previous" and "next" mean the neighbouring entries of the input
/// slice, i.e. logical (storage) order, not visual left/right.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JoiningClass {
/// Non-joining: never connects to either neighbour. Also the fallback class
/// for every code point `joining_class` has no entry for.
U,
/// Connects only to the next character. `compute_forms` handles it, but no
/// branch of `joining_class` in this file returns it.
L,
/// Connects only to the previous character (e.g. U+0627).
R,
/// Connects to both the previous and the next character (e.g. U+0628).
D,
/// Join-causing: `compute_forms` gives it the same two-sided behaviour as `D`.
/// Returned for U+200D and U+0640.
C,
/// Transparent mark: skipped when `compute_forms` looks for a neighbour, and
/// assigned the form of the closest preceding non-transparent character.
T,
}
/// Positional shape chosen for one character by `compute_forms`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JoiningForm {
    /// Connected to neither neighbour.
    Isol,
    /// Connected to the next character only.
    Init,
    /// Connected to both the previous and the next character.
    Medi,
    /// Connected to the previous character only.
    Fina,
}

impl JoiningForm {
    /// Returns the four-byte ASCII feature tag named after this form
    /// (`isol`, `init`, `medi` or `fina`).
    pub fn feature_tag(self) -> [u8; 4] {
        let tag: &[u8; 4] = match self {
            Self::Fina => b"fina",
            Self::Medi => b"medi",
            Self::Init => b"init",
            Self::Isol => b"isol",
        };
        *tag
    }
}
/// Scripts that `script_of` can distinguish; everything else is `Other`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Script {
    /// Code points in the Arabic ranges listed in `script_of`.
    Arabic,
    /// Code points in the Hebrew ranges listed in `script_of`.
    Hebrew,
    /// U+0900..=U+097F.
    Devanagari,
    /// U+0980..=U+09FF.
    Bengali,
    /// U+0B80..=U+0BFF.
    Tamil,
    /// Any code point not covered by the ranges above.
    Other,
}

/// Classifies `ch` into a [`Script`] purely by code-point range.
///
/// The Arabic ranges are the same ones `joining_class` treats as Arabic, so a
/// character that can take part in cursive joining is never reported as
/// `Script::Other`. Code points outside every listed range yield
/// `Script::Other`.
pub fn script_of(ch: char) -> Script {
    match ch as u32 {
        // Arabic, Arabic Supplement, Arabic Extended-B, Arabic Extended-A,
        // Arabic Presentation Forms-A and -B.
        0x0600..=0x06FF
        | 0x0750..=0x077F
        | 0x0870..=0x089F
        | 0x08A0..=0x08FF
        | 0xFB50..=0xFDFF
        | 0xFE70..=0xFEFF => Script::Arabic,
        // Hebrew block plus the Hebrew part of Alphabetic Presentation Forms.
        0x0590..=0x05FF | 0xFB1D..=0xFB4F => Script::Hebrew,
        0x0900..=0x097F => Script::Devanagari,
        0x0980..=0x09FF => Script::Bengali,
        0x0B80..=0x0BFF => Script::Tamil,
        _ => Script::Other,
    }
}
/// Returns the four-byte feature tags to request for a run of `script`.
///
/// Arabic gets the four positional-form tags, Hebrew gets `ccmp`, the three
/// Indic scripts delegate to the matching helper in `super::indic`, and
/// `Script::Other` gets an empty list.
pub fn feature_tags_for_run(script: Script) -> Vec<[u8; 4]> {
    match script {
        Script::Other => Vec::new(),
        Script::Hebrew => Vec::from([*b"ccmp"]),
        Script::Arabic => Vec::from([*b"isol", *b"init", *b"medi", *b"fina"]),
        Script::Tamil => super::indic::tamil_feature_tags(),
        Script::Bengali => super::indic::bengali_feature_tags(),
        Script::Devanagari => super::indic::devanagari_feature_tags(),
    }
}
pub fn joining_class(ch: char) -> JoiningClass {
let cp = ch as u32;
let in_arabic_block = (0x0600..=0x06FF).contains(&cp)
|| (0x0750..=0x077F).contains(&cp)
|| (0x0870..=0x089F).contains(&cp)
|| (0x08A0..=0x08FF).contains(&cp)
|| (0xFB50..=0xFDFF).contains(&cp)
|| (0xFE70..=0xFEFF).contains(&cp);
let in_syriac_block = (0x0700..=0x074F).contains(&cp);
let in_zwj_zwnj = cp == 0x200C || cp == 0x200D;
if !in_arabic_block && !in_syriac_block && !in_zwj_zwnj {
return JoiningClass::U;
}
if cp == 0x200D || cp == 0x0640 || cp == 0x07FA {
return JoiningClass::C;
}
if cp == 0x200C {
return JoiningClass::U;
}
if is_transparent_mark(cp) {
return JoiningClass::T;
}
match cp {
0x0621 => JoiningClass::U, 0x0622 => JoiningClass::R, 0x0623 => JoiningClass::R, 0x0624 => JoiningClass::R, 0x0625 => JoiningClass::R, 0x0626 => JoiningClass::D, 0x0627 => JoiningClass::R, 0x0628 => JoiningClass::D, 0x0629 => JoiningClass::R, 0x062A => JoiningClass::D, 0x062B => JoiningClass::D, 0x062C => JoiningClass::D, 0x062D => JoiningClass::D, 0x062E => JoiningClass::D, 0x062F => JoiningClass::R, 0x0630 => JoiningClass::R, 0x0631 => JoiningClass::R, 0x0632 => JoiningClass::R, 0x0633 => JoiningClass::D, 0x0634 => JoiningClass::D, 0x0635 => JoiningClass::D, 0x0636 => JoiningClass::D, 0x0637 => JoiningClass::D, 0x0638 => JoiningClass::D, 0x0639 => JoiningClass::D, 0x063A => JoiningClass::D, 0x063B..=0x063F => JoiningClass::D,
0x0641 => JoiningClass::D, 0x0642 => JoiningClass::D, 0x0643 => JoiningClass::D, 0x0644 => JoiningClass::D, 0x0645 => JoiningClass::D, 0x0646 => JoiningClass::D, 0x0647 => JoiningClass::D, 0x0648 => JoiningClass::R, 0x0649 => JoiningClass::D, 0x064A => JoiningClass::D, 0x066E..=0x066F => JoiningClass::D,
0x0671..=0x0673 => JoiningClass::R,
0x0674 => JoiningClass::U,
0x0675..=0x0677 => JoiningClass::R,
0x0678..=0x0687 => JoiningClass::D,
0x0688..=0x0699 => JoiningClass::R,
0x069A..=0x06A9 => JoiningClass::D,
0x06AA => JoiningClass::R,
0x06AB..=0x06BF => JoiningClass::D,
0x06C0 => JoiningClass::R,
0x06C1..=0x06C2 => JoiningClass::D,
0x06C3..=0x06CB => JoiningClass::R,
0x06CC => JoiningClass::D,
0x06CD => JoiningClass::R,
0x06CE => JoiningClass::D,
0x06CF => JoiningClass::R,
0x06D0..=0x06D1 => JoiningClass::D,
0x06D2..=0x06D3 => JoiningClass::R,
0x06D5 => JoiningClass::R,
0x0750..=0x077F => JoiningClass::D,
0x08A0..=0x08B4 => JoiningClass::D,
0x08B6..=0x08BD => JoiningClass::D,
0xFB50..=0xFDFF => JoiningClass::U,
0xFE70..=0xFEFF => JoiningClass::U,
_ => JoiningClass::U,
}
}
/// Reports whether `cp` is one of the Arabic or Syriac code points that
/// `joining_class` classifies as transparent (`T`).
///
/// Only the ranges listed here count; any other code point yields `false`.
fn is_transparent_mark(cp: u32) -> bool {
    matches!(
        cp,
        // Arabic block
        0x0610..=0x061A
            | 0x064B..=0x065F
            | 0x0670
            | 0x06D6..=0x06DC
            | 0x06DF..=0x06E4
            | 0x06E7..=0x06E8
            | 0x06EA..=0x06ED
            // Syriac block
            | 0x0711
            | 0x0730..=0x074A
            // Arabic Extended-A (U+08E2 is deliberately excluded)
            | 0x08D3..=0x08E1
            | 0x08E3..=0x08FF
    )
}
pub fn compute_forms(chars: &[char]) -> Vec<JoiningForm> {
let n = chars.len();
let mut forms = vec![JoiningForm::Isol; n];
if n == 0 {
return forms;
}
let classes: Vec<JoiningClass> = chars.iter().map(|&c| joining_class(c)).collect();
let prev_non_t = |i: usize| -> Option<usize> {
let mut j = i;
while j > 0 {
j -= 1;
if classes[j] != JoiningClass::T {
return Some(j);
}
}
None
};
let next_non_t = |i: usize| -> Option<usize> {
let mut j = i + 1;
while j < n {
if classes[j] != JoiningClass::T {
return Some(j);
}
j += 1;
}
None
};
for i in 0..n {
let cls = classes[i];
if cls == JoiningClass::T {
continue;
}
let left_can_join = matches!(
prev_non_t(i).map(|j| classes[j]),
Some(JoiningClass::D) | Some(JoiningClass::L) | Some(JoiningClass::C)
);
let right_can_join = matches!(
next_non_t(i).map(|j| classes[j]),
Some(JoiningClass::D) | Some(JoiningClass::R) | Some(JoiningClass::C)
);
let (this_left, this_right) = match cls {
JoiningClass::U => (false, false),
JoiningClass::R => (true, false),
JoiningClass::L => (false, true),
JoiningClass::D | JoiningClass::C => (true, true),
JoiningClass::T => unreachable!(),
};
let joins_left = left_can_join && this_left;
let joins_right = right_can_join && this_right;
forms[i] = match (joins_left, joins_right) {
(false, false) => JoiningForm::Isol,
(false, true) => JoiningForm::Init,
(true, true) => JoiningForm::Medi,
(true, false) => JoiningForm::Fina,
};
}
let mut last_form = JoiningForm::Isol;
for i in 0..n {
if classes[i] == JoiningClass::T {
forms[i] = last_form;
} else {
last_form = forms[i];
}
}
forms
}
#[cfg(test)]
// Test names embed the single-letter joining-class names (R, D, U) verbatim.
#[allow(non_snake_case)]
mod tests {
    use super::*;

    // --- joining_class ---

    #[test]
    fn joining_class_lookup_returns_R_for_alif_U_0627() {
        assert_eq!(joining_class('\u{0627}'), JoiningClass::R);
    }

    #[test]
    fn joining_class_lookup_returns_D_for_ba_U_0628() {
        assert_eq!(joining_class('\u{0628}'), JoiningClass::D);
    }

    // --- compute_forms ---

    #[test]
    fn dual_joining_letter_between_two_dual_joiners_picks_medi() {
        let chars = ['\u{0628}', '\u{0628}', '\u{0628}'];
        let forms = compute_forms(&chars);
        assert_eq!(forms[0], JoiningForm::Init);
        assert_eq!(forms[1], JoiningForm::Medi);
        assert_eq!(forms[2], JoiningForm::Fina);
    }

    #[test]
    fn dual_joining_letter_at_start_picks_init() {
        let chars = ['\u{0628}', '\u{062A}'];
        let forms = compute_forms(&chars);
        assert_eq!(forms[0], JoiningForm::Init);
        assert_eq!(forms[1], JoiningForm::Fina);
    }

    #[test]
    fn right_joining_letter_at_end_picks_fina() {
        let chars = ['\u{0628}', '\u{0627}'];
        let forms = compute_forms(&chars);
        assert_eq!(forms[0], JoiningForm::Init);
        assert_eq!(forms[1], JoiningForm::Fina);
    }

    #[test]
    fn transparent_combining_mark_does_not_break_chain() {
        let chars = ['\u{0628}', '\u{064E}', '\u{0628}'];
        let forms = compute_forms(&chars);
        assert_eq!(forms[0], JoiningForm::Init);
        // The mark inherits the form of the letter it follows.
        assert_eq!(forms[1], JoiningForm::Init);
        assert_eq!(forms[2], JoiningForm::Fina);
    }

    #[test]
    fn alef_after_lam_in_la_word_picks_fina() {
        let chars = ['\u{0644}', '\u{0627}'];
        let forms = compute_forms(&chars);
        assert_eq!(forms[0], JoiningForm::Init);
        assert_eq!(forms[1], JoiningForm::Fina);
    }

    #[test]
    fn isolated_letter_with_no_neighbours_picks_isol() {
        let forms = compute_forms(&['\u{0628}']);
        assert_eq!(forms[0], JoiningForm::Isol);
    }

    #[test]
    fn right_joiner_followed_by_dual_joiner_breaks_chain() {
        let forms = compute_forms(&['\u{0627}', '\u{0628}', '\u{0628}']);
        assert_eq!(forms[0], JoiningForm::Isol);
        assert_eq!(forms[1], JoiningForm::Init);
        assert_eq!(forms[2], JoiningForm::Fina);
    }

    #[test]
    fn space_between_letters_breaks_chain() {
        let chars = ['\u{0628}', ' ', '\u{0628}'];
        let forms = compute_forms(&chars);
        assert_eq!(forms[0], JoiningForm::Isol);
        assert_eq!(forms[1], JoiningForm::Isol);
        assert_eq!(forms[2], JoiningForm::Isol);
    }

    #[test]
    fn zwj_extends_chain_across_non_joiner() {
        // Two consecutive ZWJs sit between the letters; both take part in the chain.
        let chars = ['\u{0628}', '\u{200D}', '\u{200D}', '\u{0628}'];
        let forms = compute_forms(&chars);
        assert_eq!(forms[0], JoiningForm::Init);
        assert_eq!(forms[1], JoiningForm::Medi);
        assert_eq!(forms[2], JoiningForm::Medi);
        assert_eq!(forms[3], JoiningForm::Fina);
    }

    #[test]
    fn zwnj_breaks_chain() {
        let chars = ['\u{0628}', '\u{200C}', '\u{0628}'];
        let forms = compute_forms(&chars);
        assert_eq!(forms[0], JoiningForm::Isol);
        assert_eq!(forms[1], JoiningForm::Isol);
        assert_eq!(forms[2], JoiningForm::Isol);
    }

    // --- script_of ---

    #[test]
    fn script_of_arabic_alef_is_arabic() {
        assert_eq!(script_of('\u{0627}'), Script::Arabic);
    }

    #[test]
    fn script_of_hebrew_alef_is_hebrew() {
        assert_eq!(script_of('\u{05D0}'), Script::Hebrew);
    }

    #[test]
    fn script_of_latin_a_is_other() {
        assert_eq!(script_of('A'), Script::Other);
    }

    // --- feature tags ---

    #[test]
    fn feature_tags_for_arabic_includes_four_joining_features() {
        let tags = feature_tags_for_run(Script::Arabic);
        assert!(tags.contains(b"isol"));
        assert!(tags.contains(b"init"));
        assert!(tags.contains(b"medi"));
        assert!(tags.contains(b"fina"));
    }

    #[test]
    fn feature_tags_for_other_is_empty() {
        assert!(feature_tags_for_run(Script::Other).is_empty());
    }

    #[test]
    fn feature_tag_round_trips_per_form() {
        assert_eq!(JoiningForm::Isol.feature_tag(), *b"isol");
        assert_eq!(JoiningForm::Init.feature_tag(), *b"init");
        assert_eq!(JoiningForm::Medi.feature_tag(), *b"medi");
        assert_eq!(JoiningForm::Fina.feature_tag(), *b"fina");
    }

    // --- whole-run behaviour ---

    #[test]
    fn empty_run_returns_empty() {
        assert!(compute_forms(&[]).is_empty());
    }

    #[test]
    fn arabic_word_alsalam_picks_expected_forms() {
        // alef, lam, seen, lam, alef, meem: each alef ends a joined group.
        let chars: Vec<char> = "السلام".chars().collect();
        let forms = compute_forms(&chars);
        assert_eq!(forms.len(), 6);
        assert_eq!(forms[0], JoiningForm::Isol);
        assert_eq!(forms[1], JoiningForm::Init);
        assert_eq!(forms[2], JoiningForm::Medi);
        assert_eq!(forms[3], JoiningForm::Medi);
        assert_eq!(forms[4], JoiningForm::Fina);
        assert_eq!(forms[5], JoiningForm::Isol);
    }
}