use unicode_script::{Script, UnicodeScript};
use unicode_width::UnicodeWidthChar;
use crate::Punctuation;
mod private {
pub trait Sealed {}
impl Sealed for char {}
}
pub trait CharExt: private::Sealed {
fn is_cjk(&self) -> bool;
fn is_english_lingual(&self) -> bool;
fn is_emoji(&self) -> bool;
fn is_punctuation(&self) -> bool;
fn is_vowel(&self) -> bool;
fn normalized(&self) -> Self;
}
impl CharExt for char {
fn is_english_lingual(&self) -> bool {
!self.is_whitespace()
&& !self.is_numeric()
&& !self.is_emoji()
&& matches!(self.width(), Some(1..))
&& !self.is_punctuation()
&& self.is_alphabetic()
&& !self.is_cjk()
&& self.script() == Script::Latin
}
fn normalized(&self) -> Self {
match self {
'\u{2018}' | '\u{2019}' | '\u{02BC}' | '\u{FF07}' => '\'',
'\u{201C}' | '\u{201D}' | '\u{FF02}' => '"',
'\u{2013}' | '\u{2014}' | '\u{2212}' | '\u{FF0D}' => '-',
_ => *self,
}
}
fn is_emoji(&self) -> bool {
let Some(block) = unicode_blocks::find_unicode_block(*self) else {
return false;
};
let blocks = [
unicode_blocks::SPECIALS,
unicode_blocks::EMOTICONS,
unicode_blocks::MISCELLANEOUS_SYMBOLS,
unicode_blocks::VARIATION_SELECTORS,
unicode_blocks::SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS,
];
blocks.contains(&block)
}
fn is_cjk(&self) -> bool {
let Some(block) = unicode_blocks::find_unicode_block(*self) else {
return false;
};
let blocks = [
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS,
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G,
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H,
unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I,
unicode_blocks::HANGUL_JAMO,
unicode_blocks::HANGUL_SYLLABLES,
unicode_blocks::HANGUL_JAMO_EXTENDED_A,
unicode_blocks::HANGUL_JAMO_EXTENDED_B,
unicode_blocks::HANGUL_COMPATIBILITY_JAMO,
unicode_blocks::CJK_SYMBOLS_AND_PUNCTUATION,
unicode_blocks::CJK_STROKES,
unicode_blocks::CJK_COMPATIBILITY,
unicode_blocks::CJK_COMPATIBILITY_FORMS,
unicode_blocks::CJK_COMPATIBILITY_IDEOGRAPHS,
unicode_blocks::CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
unicode_blocks::CJK_RADICALS_SUPPLEMENT,
unicode_blocks::ENCLOSED_CJK_LETTERS_AND_MONTHS,
unicode_blocks::HIRAGANA,
];
blocks.contains(&block)
}
fn is_punctuation(&self) -> bool {
Punctuation::from_char(*self).is_some()
}
fn is_vowel(&self) -> bool {
matches!(self.to_ascii_lowercase(), 'a' | 'e' | 'i' | 'o' | 'u')
}
}
#[cfg(test)]
mod tests {
use super::CharExt;
#[test]
fn cjk_is_not_english_lingual() {
assert!(!'δΈ–'.is_english_lingual())
}
}