use crate::text::CharacterInfo;
/// Writing systems that this module singles out for script-specific handling.
///
/// Each variant corresponds to one code-point range recognised by
/// [`detect_complex_script`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ComplexScript {
    /// Code points `0x0900..=0x097F`.
    Devanagari,
    /// Code points `0x0980..=0x09FF`.
    Bengali,
    /// Code points `0x0A00..=0x0A7F`.
    Gurmukhi,
    /// Code points `0x0A80..=0x0AFF`.
    Gujarati,
    /// Code points `0x0B00..=0x0B7F`.
    Oriya,
    /// Code points `0x0B80..=0x0BFF`.
    Tamil,
    /// Code points `0x0C00..=0x0C7F`.
    Telugu,
    /// Code points `0x0C80..=0x0CFF`.
    Kannada,
    /// Code points `0x0D00..=0x0D7F`.
    Malayalam,
    /// Code points `0x0D80..=0x0DFF`.
    Sinhala,
    /// Code points `0x0E00..=0x0E7F`.
    Thai,
    /// Code points `0x0E80..=0x0EFF`.
    Lao,
    /// Code points `0x1780..=0x17FF`.
    Khmer,
    /// Code points `0x1000..=0x109F`.
    Burmese,
    /// Code points `0x1800..=0x18AF`.
    Mongolian,
}
/// Maps a code point to the [`ComplexScript`] whose range contains it.
///
/// The ranges are disjoint, so at most one arm can match. Returns `None` for
/// any code point outside all of them.
pub fn detect_complex_script(code: u32) -> Option<ComplexScript> {
    // Arms are listed in ascending code-point order.
    let script = match code {
        0x0900..=0x097F => ComplexScript::Devanagari,
        0x0980..=0x09FF => ComplexScript::Bengali,
        0x0A00..=0x0A7F => ComplexScript::Gurmukhi,
        0x0A80..=0x0AFF => ComplexScript::Gujarati,
        0x0B00..=0x0B7F => ComplexScript::Oriya,
        0x0B80..=0x0BFF => ComplexScript::Tamil,
        0x0C00..=0x0C7F => ComplexScript::Telugu,
        0x0C80..=0x0CFF => ComplexScript::Kannada,
        0x0D00..=0x0D7F => ComplexScript::Malayalam,
        0x0D80..=0x0DFF => ComplexScript::Sinhala,
        0x0E00..=0x0E7F => ComplexScript::Thai,
        0x0E80..=0x0EFF => ComplexScript::Lao,
        0x1000..=0x109F => ComplexScript::Burmese,
        0x1780..=0x17FF => ComplexScript::Khmer,
        0x1800..=0x18AF => ComplexScript::Mongolian,
        _ => return None,
    };
    Some(script)
}
/// Reports whether `code` lies in any range recognised by
/// [`detect_complex_script`].
#[inline]
pub fn is_complex_script(code: u32) -> bool {
    let detected = detect_complex_script(code);
    detected.is_some()
}
/// Reports whether `code` is a Devanagari combining sign, i.e. a mark that
/// attaches to a preceding base letter.
///
/// U+0950 (OM) sits between the vowel signs and the accent marks but is a
/// free-standing letter, so it is deliberately excluded.
pub fn is_devanagari_diacritic(code: u32) -> bool {
    matches!(
        code,
        // Inverted candrabindu, candrabindu, anusvara, visarga.
        0x0900..=0x0903
            // Vowel signs OE / OOE and the nukta.
            | 0x093A..=0x093C
            // Dependent vowel signs, the virama (U+094D), and vowel signs
            // prishthamatra E / AW. Stops before U+0950 (OM).
            | 0x093E..=0x094F
            // Accent marks and vowel signs candra long E / UE / UUE.
            | 0x0951..=0x0957
            // Vowel signs vocalic L / LL.
            | 0x0962..=0x0963
    )
}
/// Reports whether `code` is the Devanagari virama (U+094D).
pub fn is_devanagari_virama(code: u32) -> bool {
    const VIRAMA: u32 = 0x094D;
    code == VIRAMA
}
/// Reports whether `code` lies in the Devanagari consonant run
/// U+0915 (KA) through U+0939 (HA), inclusive.
pub fn is_devanagari_consonant(code: u32) -> bool {
    (0x0915..=0x0939).contains(&code)
}
/// Reports whether `code` is one of the Devanagari dependent vowel signs in
/// U+093E through U+094C, inclusive.
pub fn is_devanagari_matra(code: u32) -> bool {
    const FIRST_MATRA: u32 = 0x093E;
    const LAST_MATRA: u32 = 0x094C;
    (FIRST_MATRA..=LAST_MATRA).contains(&code)
}
/// Reports whether `code` is the Devanagari anusvara (U+0902) or visarga
/// (U+0903).
pub fn is_devanagari_anusvar_visarga(code: u32) -> bool {
    const ANUSVARA: u32 = 0x0902;
    const VISARGA: u32 = 0x0903;
    code == ANUSVARA || code == VISARGA
}
/// Reports whether `code` is the Devanagari nukta (U+093C).
pub fn is_devanagari_nukta(code: u32) -> bool {
    matches!(code, 0x093C)
}
/// Applies the Devanagari-specific rules to the position between `prev_char`
/// and `curr_char`.
///
/// Returns `Some(false)` when one of the rules below fires and `None` when
/// none applies.
///
/// NOTE(review): `Some(false)` presumably means "no boundary here" and `None`
/// "no opinion, let the caller decide" — confirm against the call site.
///
/// Rules:
/// * a virama in `prev_char` ties it to whatever follows;
/// * any combining sign in `curr_char` (matra, nukta, anusvara/visarga,
///   candrabindu, virama, accent, ...) stays with the character before it,
///   matching how `handle_indic_boundary` treats the other Indic scripts.
pub fn handle_devanagari_boundary(
    prev_char: &CharacterInfo,
    curr_char: &CharacterInfo,
) -> Option<bool> {
    // U+0950 (OM) is a free-standing letter, never a combining sign, so it is
    // kept out of the "attach to the previous character" rule explicitly.
    const OM: u32 = 0x0950;

    let prev_code = prev_char.code;
    let curr_code = curr_char.code;

    if is_devanagari_virama(prev_code) {
        return Some(false);
    }

    if curr_code != OM && is_devanagari_diacritic(curr_code) {
        return Some(false);
    }

    None
}
/// Reports whether `code` is one of the four Thai tone marks,
/// U+0E48 through U+0E4B inclusive.
pub fn is_thai_tone_mark(code: u32) -> bool {
    (0x0E48..=0x0E4B).contains(&code)
}
/// Reports whether `code` is a Thai vowel sign written above or below its
/// consonant.
///
/// Covers U+0E31 (mai han-akat) and the contiguous run U+0E34 through U+0E3A
/// (sara i ... sara uu, phinthu). The run includes U+0E38 (sara u), which sits
/// between sara uee (U+0E37) and sara uu (U+0E39).
pub fn is_thai_vowel_modifier(code: u32) -> bool {
    matches!(code, 0x0E31 | 0x0E34..=0x0E3A)
}
/// Reports whether `code` is a digit for the purposes of Thai text: either an
/// ASCII digit (U+0030–U+0039) or a Thai digit (U+0E50–U+0E59).
pub fn is_thai_digit(code: u32) -> bool {
    let ascii_digit = (0x0030..=0x0039).contains(&code);
    let native_digit = (0x0E50..=0x0E59).contains(&code);
    ascii_digit || native_digit
}
/// Reports whether `code` is one of the Thai signs U+0E2F (paiyannoi),
/// U+0E46 (maiyamok) or U+0E4F (fongman).
pub fn is_thai_major_punctuation(code: u32) -> bool {
    const MAJOR_PUNCTUATION: [u32; 3] = [0x0E2F, 0x0E46, 0x0E4F];
    MAJOR_PUNCTUATION.contains(&code)
}
/// Applies the Thai-specific rules to the position between `prev_char` and
/// `curr_char`.
///
/// * `Some(false)` — `curr_char` is a tone mark or vowel modifier, or both
///   characters are digits.
/// * `Some(true)` — `curr_char` is major punctuation.
/// * `None` — no rule applies.
///
/// NOTE(review): `Some(false)` / `Some(true)` presumably mean "no boundary" /
/// "boundary" — confirm against the call site.
pub fn handle_thai_boundary(prev_char: &CharacterInfo, curr_char: &CharacterInfo) -> Option<bool> {
    let (before, after) = (prev_char.code, curr_char.code);

    // Marks that sit on the previous character, checked before the digit rule.
    let attaches_to_previous = is_thai_tone_mark(after) || is_thai_vowel_modifier(after);
    let inside_digit_run = || is_thai_digit(before) && is_thai_digit(after);

    if attaches_to_previous || inside_digit_run() {
        Some(false)
    } else if is_thai_major_punctuation(after) {
        Some(true)
    } else {
        None
    }
}
/// Reports whether `code` is the Khmer sign coeng (U+17D2).
pub fn is_khmer_coeng(code: u32) -> bool {
    const COENG: u32 = 0x17D2;
    code == COENG
}
/// Reports whether `code` is a Khmer vowel mark: the inherent vowels
/// (U+17B4–U+17B5), the dependent vowel signs (U+17B6–U+17C5) or the nikahit
/// (U+17C6).
///
/// The range is contiguous and includes U+17B6 (vowel sign AA), which sits
/// between the inherent vowels and vowel sign I (U+17B7).
pub fn is_khmer_vowel_inherent(code: u32) -> bool {
    (0x17B4..=0x17C6).contains(&code)
}
/// Reports whether `code` lies in U+17C9 through U+17CC, inclusive — the Khmer
/// signs this module treats as tone marks.
pub fn is_khmer_tone_mark(code: u32) -> bool {
    (0x17C9..=0x17CC).contains(&code)
}
/// Applies the Khmer-specific rules to the position between `prev_char` and
/// `curr_char`.
///
/// Returns `Some(false)` when one of the rules below fires and `None` when
/// none applies.
///
/// NOTE(review): `Some(false)` presumably means "no boundary here" and `None`
/// "no opinion, let the caller decide" — confirm against the call site.
///
/// Rules:
/// * a coeng in `prev_char` ties it to the consonant that follows;
/// * a coeng, vowel mark or tone mark in `curr_char` stays with the character
///   before it. The coeng is itself a combining sign (see
///   `is_complex_script_mark`), so it must not be split from its consonant.
pub fn handle_khmer_boundary(prev_char: &CharacterInfo, curr_char: &CharacterInfo) -> Option<bool> {
    let prev_code = prev_char.code;
    let curr_code = curr_char.code;

    if is_khmer_coeng(prev_code) {
        return Some(false);
    }

    let curr_is_mark = is_khmer_coeng(curr_code)
        || is_khmer_vowel_inherent(curr_code)
        || is_khmer_tone_mark(curr_code);
    if curr_is_mark {
        return Some(false);
    }

    None
}
/// Reports whether `code` is a combining sign (candrabindu/anusvara/visarga,
/// nukta, dependent vowel sign, virama, length mark) in one of the
/// non-Devanagari Indic scripts this module recognises: Bengali, Gurmukhi,
/// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam and Sinhala.
///
/// Devanagari has its own table in `is_devanagari_diacritic`. Some ranges
/// span a few unassigned code points inside the vowel-sign runs; those never
/// occur in valid text.
pub fn is_indic_diacritic(code: u32) -> bool {
    matches!(
        code,
        // Bengali
        0x0981..=0x0983 | 0x09BC | 0x09BE..=0x09CD | 0x09D7 | 0x09E2..=0x09E3
            // Gurmukhi (incl. udaat, tippi, addak, yakash)
            | 0x0A01..=0x0A03 | 0x0A3C | 0x0A3E..=0x0A4D | 0x0A51 | 0x0A70..=0x0A71 | 0x0A75
            // Gujarati
            | 0x0A81..=0x0A83 | 0x0ABC | 0x0ABE..=0x0ACD | 0x0AE2..=0x0AE3
            // Oriya
            | 0x0B01..=0x0B03 | 0x0B3C | 0x0B3E..=0x0B4D | 0x0B56..=0x0B57 | 0x0B62..=0x0B63
            // Tamil
            | 0x0B82..=0x0B83 | 0x0BBE..=0x0BCD | 0x0BD7
            // Telugu
            | 0x0C01..=0x0C03 | 0x0C3E..=0x0C4D | 0x0C55..=0x0C56 | 0x0C62..=0x0C63
            // Kannada
            | 0x0C81..=0x0C83 | 0x0CBC | 0x0CBE..=0x0CCD | 0x0CD5..=0x0CD6 | 0x0CE2..=0x0CE3
            // Malayalam
            | 0x0D01..=0x0D03 | 0x0D3E..=0x0D4D | 0x0D57 | 0x0D62..=0x0D63
            // Sinhala (al-lakuna is the virama)
            | 0x0D82..=0x0D83 | 0x0DCA | 0x0DCF..=0x0DDF | 0x0DF2..=0x0DF3
    )
}
/// Applies the generic Indic rule to the position between `prev_char` and
/// `curr_char`: a combining sign in `curr_char` stays with the character
/// before it.
///
/// Returns `Some(false)` when `curr_char` is an Indic diacritic and `None`
/// otherwise. `prev_char` does not influence the result; it is accepted so
/// the signature matches the other script handlers.
///
/// NOTE(review): `Some(false)` presumably means "no boundary here" and `None`
/// "no opinion, let the caller decide" — confirm against the call site.
pub fn handle_indic_boundary(prev_char: &CharacterInfo, curr_char: &CharacterInfo) -> Option<bool> {
    // The former "both sides are diacritics" check could never fire: it was
    // only reached after `curr_char` had already been found not to be one.
    let _ = prev_char;

    if is_indic_diacritic(curr_char.code) {
        Some(false)
    } else {
        None
    }
}
pub fn is_complex_script_mark(code: u32) -> bool {
is_devanagari_diacritic(code)
|| is_indic_diacritic(code)
|| is_thai_tone_mark(code)
|| is_thai_vowel_modifier(code)
|| is_khmer_vowel_inherent(code)
|| is_khmer_tone_mark(code)
|| is_khmer_coeng(code)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every code point in `codes` is detected as `expected`.
    fn assert_detected(codes: &[u32], expected: Option<ComplexScript>) {
        for &code in codes {
            assert_eq!(
                detect_complex_script(code),
                expected,
                "code point {:#06X}",
                code
            );
        }
    }

    #[test]
    fn test_detect_devanagari_range() {
        assert_detected(&[0x0915, 0x0928, 0x0971], Some(ComplexScript::Devanagari));
    }

    #[test]
    fn test_detect_thai_range() {
        assert_detected(&[0x0E01, 0x0E3F], Some(ComplexScript::Thai));
    }

    #[test]
    fn test_detect_khmer_range() {
        assert_detected(&[0x1780, 0x17FF], Some(ComplexScript::Khmer));
    }

    #[test]
    fn test_detect_tamil_range() {
        assert_detected(&[0x0B85, 0x0BBF], Some(ComplexScript::Tamil));
    }

    #[test]
    fn test_detect_non_complex_script() {
        // Latin capital A and the ASCII space.
        assert_detected(&[0x0041, 0x0020], None);
    }

    #[test]
    fn test_devanagari_virama_detection() {
        assert!(is_devanagari_virama(0x094D));
    }

    #[test]
    fn test_devanagari_matra_detection() {
        assert!(is_devanagari_matra(0x093F));
        assert!(is_devanagari_matra(0x0940));
        assert!(is_devanagari_matra(0x0947));
        // A consonant is not a matra.
        assert!(!is_devanagari_matra(0x0915));
    }

    #[test]
    fn test_devanagari_consonant_detection() {
        assert!(is_devanagari_consonant(0x0915));
        assert!(is_devanagari_consonant(0x0928));
        // An independent vowel is not a consonant.
        assert!(!is_devanagari_consonant(0x0905));
    }

    #[test]
    fn test_thai_tone_mark_detection() {
        assert!(is_thai_tone_mark(0x0E48));
        assert!(is_thai_tone_mark(0x0E49));
        assert!(!is_thai_tone_mark(0x0E01));
    }

    #[test]
    fn test_thai_vowel_modifier_detection() {
        assert!(is_thai_vowel_modifier(0x0E31));
        assert!(is_thai_vowel_modifier(0x0E34));
        assert!(!is_thai_vowel_modifier(0x0E01));
    }

    #[test]
    fn test_thai_digit_detection() {
        assert!(is_thai_digit(0x0E50));
        // ASCII digits count as well.
        assert!(is_thai_digit(0x0031));
        assert!(!is_thai_digit(0x0E01));
    }

    #[test]
    fn test_khmer_coeng_detection() {
        assert!(is_khmer_coeng(0x17D2));
        assert!(!is_khmer_coeng(0x1780));
    }

    #[test]
    fn test_khmer_vowel_detection() {
        assert!(is_khmer_vowel_inherent(0x17BE));
        assert!(is_khmer_vowel_inherent(0x17C6));
        assert!(!is_khmer_vowel_inherent(0x1780));
    }

    #[test]
    fn test_indic_diacritic_detection() {
        // The virama of Bengali, Tamil, Telugu, Kannada and Malayalam.
        assert!(is_indic_diacritic(0x09CD));
        assert!(is_indic_diacritic(0x0BCD));
        assert!(is_indic_diacritic(0x0C4D));
        assert!(is_indic_diacritic(0x0CCD));
        assert!(is_indic_diacritic(0x0D4D));
    }
}