use crate::text::{BoundaryContext, CharacterInfo};
/// Unicode block families that this module treats as right-to-left text.
///
/// Each variant corresponds to one code-point range recognised by
/// `detect_rtl_script`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RTLScript {
    /// Arabic block (U+0600–U+06FF).
    Arabic,
    /// Arabic Supplement block (U+0750–U+077F).
    ArabicSupplement,
    /// Arabic Extended-A block (U+08A0–U+08FF).
    ArabicExtendedA,
    /// Hebrew block (U+0590–U+05FF).
    Hebrew,
    /// Arabic Presentation Forms-A block (U+FB50–U+FDFF).
    PresentationFormsA,
    /// Arabic Presentation Forms-B block (U+FE70–U+FEFF).
    PresentationFormsB,
}
/// Classifies a code point into one of the right-to-left blocks tracked by
/// [`RTLScript`].
///
/// Returns `None` for every code point outside the ranges listed in the
/// `match` below. The ranges are disjoint, so arm order does not matter.
pub fn detect_rtl_script(code: u32) -> Option<RTLScript> {
    let script = match code {
        0x0590..=0x05FF => RTLScript::Hebrew,
        0x0600..=0x06FF => RTLScript::Arabic,
        0x0750..=0x077F => RTLScript::ArabicSupplement,
        0x08A0..=0x08FF => RTLScript::ArabicExtendedA,
        0xFB50..=0xFDFF => RTLScript::PresentationFormsA,
        0xFE70..=0xFEFF => RTLScript::PresentationFormsB,
        _ => return None,
    };
    Some(script)
}
/// Reports whether `code` falls in any block recognised by
/// `detect_rtl_script`.
#[inline]
pub fn is_rtl_text(code: u32) -> bool {
    let script = detect_rtl_script(code);
    script.is_some()
}
/// Reports whether `code` is an Arabic combining mark (Unicode category Mn)
/// inside the Arabic block.
///
/// Covered ranges:
/// - U+0610–U+061A: honorific signs and small vowel marks
/// - U+064B–U+065F: tanwin, short vowels, shadda, sukun and the later
///   additions up to wavy hamza below
/// - U+0670: superscript (dagger) alef
/// - U+06D6–U+06DC, U+06DF–U+06E4, U+06E7–U+06E8, U+06EA–U+06ED: Quranic
///   annotation marks
///
/// The non-combining signs interleaved with the Quranic marks (U+06DD,
/// U+06DE, U+06E5, U+06E6, U+06E9) are deliberately excluded.
pub fn is_arabic_diacritic(code: u32) -> bool {
    matches!(
        code,
        0x0610..=0x061A
            | 0x064B..=0x065F
            | 0x0670
            | 0x06D6..=0x06DC
            | 0x06DF..=0x06E4
            | 0x06E7..=0x06E8
            | 0x06EA..=0x06ED
    )
}
/// Reports whether `code` is a base Arabic-script letter (Unicode category
/// Lo) in the Arabic, Arabic Supplement or Arabic Extended-A blocks.
///
/// Covered ranges:
/// - U+0620–U+063F and U+0641–U+064A: the core alphabet (tatweel U+0640 is a
///   modifier, not a letter, and is skipped)
/// - U+066E–U+066F, U+0671–U+06D3, U+06D5: extended letters used by Persian,
///   Urdu, Kurdish and others (U+06D4 is the Arabic full stop and is skipped)
/// - U+06EE–U+06EF, U+06FA–U+06FC, U+06FF: late additions to the Arabic block
/// - U+0750–U+077F: Arabic Supplement
/// - U+08A0–U+08B4, U+08B6–U+08BD: Arabic Extended-A letters
///
/// Combining marks, digits and punctuation are not letters and return `false`.
pub fn is_arabic_letter(code: u32) -> bool {
    matches!(
        code,
        0x0620..=0x063F
            | 0x0641..=0x064A
            | 0x066E..=0x066F
            | 0x0671..=0x06D3
            | 0x06D5
            | 0x06EE..=0x06EF
            | 0x06FA..=0x06FC
            | 0x06FF
            | 0x0750..=0x077F
            | 0x08A0..=0x08B4
            | 0x08B6..=0x08BD
    )
}
/// Reports whether `code` is one of the Hebrew points (niqqud) and related
/// marks handled by this module.
///
/// Matches U+05B0–U+05BD (sheva through meteg, including dagesh), U+05BF
/// (rafe), U+05C1/U+05C2 (shin and sin dots), U+05C4/U+05C5 (upper and lower
/// dots) and U+05C7 (qamats qatan). The punctuation interleaved with them
/// (U+05BE, U+05C0, U+05C3, U+05C6) and the cantillation marks below U+05B0
/// are not matched.
pub fn is_hebrew_diacritic(code: u32) -> bool {
    matches!(
        code,
        0x05B0..=0x05BD | 0x05BF | 0x05C1 | 0x05C2 | 0x05C4 | 0x05C5 | 0x05C7
    )
}
/// Reports whether `code` is a letter of the Hebrew alphabet, alef (U+05D0)
/// through tav (U+05EA) inclusive.
pub fn is_hebrew_letter(code: u32) -> bool {
    (0x05D0..=0x05EA).contains(&code)
}
/// Reports whether `code` is Hebrew geresh (U+05F3) or gershayim (U+05F4),
/// the only Hebrew punctuation marks this module recognises.
pub fn is_hebrew_punctuation(code: u32) -> bool {
    code == 0x05F3 || code == 0x05F4
}
#[inline]
pub fn is_rtl_diacritic(code: u32) -> bool {
is_arabic_diacritic(code) || is_hebrew_diacritic(code)
}
/// Maps an Arabic presentation form (isolated, final, initial or medial
/// glyph variant) back to the base letter it represents.
///
/// Handles alef wasla from Presentation Forms-A (U+FB50–U+FB51) and every
/// basic-letter form in Presentation Forms-B (U+FE80–U+FEF4). Each target is
/// the letter named in the form's Unicode compatibility decomposition, so
/// distinctions such as alef-with-madda (U+0622) versus plain alef (U+0627)
/// are preserved.
///
/// Any other code point is returned unchanged. That includes the lam-alef
/// ligatures (U+FEF5–U+FEFC), which expand to two letters and therefore
/// cannot be expressed as a single `u32`.
pub fn normalize_arabic_contextual_form(code: u32) -> u32 {
    match code {
        0xFB50..=0xFB51 => 0x0671, // alef wasla
        0xFE80 => 0x0621,          // hamza
        0xFE81..=0xFE82 => 0x0622, // alef with madda above
        0xFE83..=0xFE84 => 0x0623, // alef with hamza above
        0xFE85..=0xFE86 => 0x0624, // waw with hamza above
        0xFE87..=0xFE88 => 0x0625, // alef with hamza below
        0xFE89..=0xFE8C => 0x0626, // yeh with hamza above
        0xFE8D..=0xFE8E => 0x0627, // alef
        0xFE8F..=0xFE92 => 0x0628, // beh
        0xFE93..=0xFE94 => 0x0629, // teh marbuta
        0xFE95..=0xFE98 => 0x062A, // teh
        0xFE99..=0xFE9C => 0x062B, // theh
        0xFE9D..=0xFEA0 => 0x062C, // jeem
        0xFEA1..=0xFEA4 => 0x062D, // hah
        0xFEA5..=0xFEA8 => 0x062E, // khah
        0xFEA9..=0xFEAA => 0x062F, // dal
        0xFEAB..=0xFEAC => 0x0630, // thal
        0xFEAD..=0xFEAE => 0x0631, // reh
        0xFEAF..=0xFEB0 => 0x0632, // zain
        0xFEB1..=0xFEB4 => 0x0633, // seen
        0xFEB5..=0xFEB8 => 0x0634, // sheen
        0xFEB9..=0xFEBC => 0x0635, // sad
        0xFEBD..=0xFEC0 => 0x0636, // dad
        0xFEC1..=0xFEC4 => 0x0637, // tah
        0xFEC5..=0xFEC8 => 0x0638, // zah
        0xFEC9..=0xFECC => 0x0639, // ain
        0xFECD..=0xFED0 => 0x063A, // ghain
        0xFED1..=0xFED4 => 0x0641, // feh
        0xFED5..=0xFED8 => 0x0642, // qaf
        0xFED9..=0xFEDC => 0x0643, // kaf
        0xFEDD..=0xFEE0 => 0x0644, // lam
        0xFEE1..=0xFEE4 => 0x0645, // meem
        0xFEE5..=0xFEE8 => 0x0646, // noon
        0xFEE9..=0xFEEC => 0x0647, // heh
        0xFEED..=0xFEEE => 0x0648, // waw
        0xFEEF..=0xFEF0 => 0x0649, // alef maksura
        0xFEF1..=0xFEF4 => 0x064A, // yeh
        _ => code,
    }
}
/// Reports whether `code` is one of the eight lam-alef ligature presentation
/// forms, U+FEF5 through U+FEFC inclusive.
pub fn is_lam_alef_ligature(code: u32) -> bool {
    (0xFEF5..=0xFEFC).contains(&code)
}
/// Splits a lam-alef ligature into its two component letters.
///
/// Returns `Some((lam, alef))`, where `lam` is always U+0644 and `alef` is
/// the alef variant the ligature carries (with madda, hamza above, hamza
/// below, or plain). Each variant has an isolated and a final form, and both
/// decompose identically. Returns `None` for any non-ligature code point.
pub fn decompose_lam_alef(code: u32) -> Option<(u32, u32)> {
    const LAM: u32 = 0x0644;

    let alef = match code {
        0xFEF5..=0xFEF6 => 0x0622, // alef with madda above
        0xFEF7..=0xFEF8 => 0x0623, // alef with hamza above
        0xFEF9..=0xFEFA => 0x0625, // alef with hamza below
        0xFEFB..=0xFEFC => 0x0627, // plain alef
        _ => return None,
    };
    Some((LAM, alef))
}
/// Reports whether `code` is an Extended Arabic-Indic digit, U+06F0 through
/// U+06F9 inclusive.
///
/// Only this range is matched; the Arabic-Indic digits U+0660–U+0669 are not.
pub fn is_eastern_arabic_digit(code: u32) -> bool {
    (0x06F0..=0x06F9).contains(&code)
}
/// Reports whether `code` is a decimal digit that can appear in Arabic-script
/// text.
///
/// Three digit sets are accepted:
/// - ASCII digits U+0030–U+0039
/// - Arabic-Indic digits U+0660–U+0669
/// - Extended Arabic-Indic digits U+06F0–U+06F9
pub fn is_arabic_number(code: u32) -> bool {
    matches!(
        code,
        0x0030..=0x0039 | 0x0660..=0x0669 | 0x06F0..=0x06F9
    )
}
/// Decides whether a word boundary belongs between two adjacent characters
/// when right-to-left text is involved.
///
/// Returns `Some(true)` to split, `Some(false)` to keep the characters
/// together, and `None` when neither character is RTL and no earlier rule
/// applied (presumably so the caller can fall back to its generic rules).
///
/// Rules are evaluated in order and the first match wins:
/// 1. A space (U+0020) on either side: split.
/// 2. Both characters are digits per `is_arabic_number`: keep together.
/// 3. Neither character is RTL per `is_rtl_text`: `None`.
/// 4. Tatweel (U+0640) on either side: keep together.
/// 5. The current character is an RTL combining mark: keep together.
/// 6. `prev_char.tj_offset` is below -50: split.
/// 7. Exactly one of the two characters is RTL: split.
/// 8. Both are RTL: split only if the current character is Arabic or Hebrew
///    punctuation.
///
/// `_context` is accepted for interface compatibility and is not consulted.
///
/// NOTE(review): rules 1 and 2 run before the RTL check, so they also decide
/// for purely non-RTL pairs — confirm this is intended.
pub fn should_split_at_rtl_boundary(
    prev_char: &CharacterInfo,
    curr_char: &CharacterInfo,
    _context: Option<&BoundaryContext>,
) -> Option<bool> {
    const SPACE: u32 = 0x0020;
    const TATWEEL: u32 = 0x0640;

    let prev_code = prev_char.code;
    let curr_code = curr_char.code;

    // Rule 1: whitespace always delimits words.
    if prev_code == SPACE || curr_code == SPACE {
        return Some(true);
    }

    // Rule 2: never break inside a run of digits.
    if is_arabic_number(prev_code) && is_arabic_number(curr_code) {
        return Some(false);
    }

    // Rule 3: nothing RTL-specific to decide.
    let prev_is_rtl = is_rtl_text(prev_code);
    let curr_is_rtl = is_rtl_text(curr_code);
    if !prev_is_rtl && !curr_is_rtl {
        return None;
    }

    // Rule 4: tatweel is a joining elongation stroke.
    if prev_code == TATWEEL || curr_code == TATWEEL {
        return Some(false);
    }

    // Rule 5: a combining mark stays attached to what precedes it. This also
    // covers mark-after-mark sequences, so no separate check is needed.
    if is_rtl_diacritic(curr_code) {
        return Some(false);
    }

    // Rule 6: a strongly negative offset on the previous character forces a
    // boundary. NOTE(review): presumably a PDF TJ-array adjustment — confirm
    // the units behind the -50 threshold.
    if matches!(prev_char.tj_offset, Some(offset) if offset < -50) {
        return Some(true);
    }

    // Rule 7: direction change. Digit pairs already returned under rule 2,
    // so no digit exemption is needed here.
    if prev_is_rtl != curr_is_rtl {
        return Some(true);
    }

    // Rule 8: rules 3 and 7 together guarantee both characters are RTL here,
    // so every remaining non-punctuation case (letter next to letter, or any
    // other RTL pair) resolves to "keep together".
    Some(is_arabic_punctuation(curr_code) || is_hebrew_punctuation(curr_code))
}
/// Reports whether `code` is one of the Arabic punctuation marks this module
/// splits on: comma (U+060C), semicolon (U+061B), question mark (U+061F),
/// percent sign (U+066A), decimal separator (U+066B), thousands separator
/// (U+066C) or five-pointed star (U+066D).
fn is_arabic_punctuation(code: u32) -> bool {
    matches!(code, 0x060C | 0x061B | 0x061F | 0x066A..=0x066D)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// One Arabic, one Hebrew and one Latin code point resolve to the
    /// expected script classification.
    #[test]
    fn test_basic_script_detection() {
        let cases = [
            (0x0627, Some(RTLScript::Arabic)), // alef
            (0x05D0, Some(RTLScript::Hebrew)), // alef
            (0x0041, None),                    // 'A'
        ];
        for &(code, expected) in cases.iter() {
            assert_eq!(detect_rtl_script(code), expected);
        }
    }

    /// Combining marks are recognised and a base letter is not.
    #[test]
    fn test_basic_diacritic_detection() {
        // Fatha.
        assert!(is_arabic_diacritic(0x064E));
        // Dagesh.
        assert!(is_hebrew_diacritic(0x05BC));
        // Alef is a letter, not a mark.
        assert!(!is_arabic_diacritic(0x0627));
    }

    /// Base letters are recognised and a combining mark is not.
    #[test]
    fn test_basic_letter_detection() {
        // Beh.
        assert!(is_arabic_letter(0x0628));
        // Bet.
        assert!(is_hebrew_letter(0x05D1));
        // Fathatan is a mark, not a letter.
        assert!(!is_arabic_letter(0x064B));
    }

    /// The final-form lam-alef ligature is detected and decomposes to lam
    /// followed by plain alef.
    #[test]
    fn test_lam_alef_basic() {
        let ligature = 0xFEFC;
        assert!(is_lam_alef_ligature(ligature));
        assert_eq!(decompose_lam_alef(ligature), Some((0x0644, 0x0627)));
    }
}