harper-core 2.0.0

The language checker for developers.
Documentation
use unicode_script::{Script, UnicodeScript};
use unicode_width::UnicodeWidthChar;

use crate::Punctuation;

mod private {
    pub trait Sealed {}

    impl Sealed for char {}
}

pub trait CharExt: private::Sealed {
    fn is_cjk(&self) -> bool;
    /// Whether a character can be a component of an English word.
    fn is_english_lingual(&self) -> bool;
    fn is_emoji(&self) -> bool;
    fn is_punctuation(&self) -> bool;
    /// Whether the character is an (English) vowel.
    ///
    /// Checks whether the character is in the set (A, E, I, O, U); case-insensitive.
    fn is_vowel(&self) -> bool;
    fn normalized(&self) -> Self;
}

impl CharExt for char {
    fn is_english_lingual(&self) -> bool {
        !self.is_whitespace()
            && !self.is_numeric()
            && !self.is_emoji()
            && matches!(self.width(), Some(1..))
            && !self.is_punctuation()
            && self.is_alphabetic()
            && !self.is_cjk()
            && self.script() == Script::Latin
    }

    fn normalized(&self) -> Self {
        match self {
            '\u{2018}' | '\u{2019}' | '\u{02BC}' | '\u{FF07}' => '\'',
            '\u{201C}' | '\u{201D}' | '\u{FF02}' => '"',
            '\u{2013}' | '\u{2014}' | '\u{2212}' | '\u{FF0D}' => '-',
            _ => *self,
        }
    }

    fn is_emoji(&self) -> bool {
        let Some(block) = unicode_blocks::find_unicode_block(*self) else {
            return false;
        };

        let blocks = [
            unicode_blocks::SPECIALS,
            unicode_blocks::EMOTICONS,
            unicode_blocks::MISCELLANEOUS_SYMBOLS,
            unicode_blocks::VARIATION_SELECTORS,
            unicode_blocks::SUPPLEMENTAL_SYMBOLS_AND_PICTOGRAPHS,
        ];

        blocks.contains(&block)
    }

    fn is_cjk(&self) -> bool {
        let Some(block) = unicode_blocks::find_unicode_block(*self) else {
            return false;
        };

        let blocks = [
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS,
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_G,
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_H,
            unicode_blocks::CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I,
            unicode_blocks::HANGUL_JAMO,
            unicode_blocks::HANGUL_SYLLABLES,
            unicode_blocks::HANGUL_JAMO_EXTENDED_A,
            unicode_blocks::HANGUL_JAMO_EXTENDED_B,
            unicode_blocks::HANGUL_COMPATIBILITY_JAMO,
            unicode_blocks::CJK_SYMBOLS_AND_PUNCTUATION,
            unicode_blocks::CJK_STROKES,
            unicode_blocks::CJK_COMPATIBILITY,
            unicode_blocks::CJK_COMPATIBILITY_FORMS,
            unicode_blocks::CJK_COMPATIBILITY_IDEOGRAPHS,
            unicode_blocks::CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
            unicode_blocks::CJK_RADICALS_SUPPLEMENT,
            unicode_blocks::ENCLOSED_CJK_LETTERS_AND_MONTHS,
            unicode_blocks::HIRAGANA,
        ];

        blocks.contains(&block)
    }

    fn is_punctuation(&self) -> bool {
        Punctuation::from_char(*self).is_some()
    }

    fn is_vowel(&self) -> bool {
        matches!(self.to_ascii_lowercase(), 'a' | 'e' | 'i' | 'o' | 'u')
    }
}

#[cfg(test)]
mod tests {
    use super::CharExt;

    #[test]
    fn cjk_is_not_english_lingual() {
        assert!(!'δΈ–'.is_english_lingual())
    }
}