use std::collections::HashMap;
use ahash::AHashSet;
use once_cell::sync::Lazy;
use strum::IntoEnumIterator;
use strum_macros::EnumIter;
use crate::language::Language;
/// The alphabets whose characters can be recognised through
/// `Alphabet::matches` and `Alphabet::matches_char`.
///
/// Every variant is backed by a static `CharSet` that is created from the
/// script name spelled like the variant (e.g. `Alphabet::Arabic` uses the
/// set built from `"Arabic"`).
///
/// `EnumIter` provides `Alphabet::iter()`, while `Eq` and `Hash` allow the
/// variants to serve as `HashMap` keys, as done by
/// `Alphabet::all_supporting_single_language`.
#[derive(EnumIter, Eq, PartialEq, Hash)]
pub(crate) enum Alphabet {
Arabic,
Armenian,
Bengali,
Cyrillic,
Devanagari,
Georgian,
Greek,
Gujarati,
Gurmukhi,
Han,
Hangul,
Hebrew,
Hiragana,
Katakana,
Latin,
Tamil,
Telugu,
Thai,
}
impl Alphabet {
pub fn matches(&self, text: &str) -> bool {
self.char_set().is_match(text)
}
pub fn matches_char(&self, ch: char) -> bool {
self.char_set().is_char_match(ch)
}
pub fn all_supporting_single_language() -> HashMap<Alphabet, Language> {
let mut alphabets = HashMap::new();
for alphabet in Alphabet::iter() {
let supported_languages = alphabet.supported_languages();
if supported_languages.len() == 1 {
alphabets.insert(alphabet, supported_languages[0]);
}
}
alphabets
}
fn supported_languages(&self) -> Vec<Language> {
let mut languages = vec![];
for language in Language::iter() {
if language.alphabets().contains(self) {
languages.push(language);
}
}
languages
}
fn char_set(&self) -> &Lazy<CharSet> {
match self {
Alphabet::Arabic => &ARABIC,
Alphabet::Armenian => &ARMENIAN,
Alphabet::Bengali => &BENGALI,
Alphabet::Cyrillic => &CYRILLIC,
Alphabet::Devanagari => &DEVANAGARI,
Alphabet::Georgian => &GEORGIAN,
Alphabet::Greek => &GREEK,
Alphabet::Gujarati => &GUJARATI,
Alphabet::Gurmukhi => &GURMUKHI,
Alphabet::Han => &HAN,
Alphabet::Hangul => &HANGUL,
Alphabet::Hebrew => &HEBREW,
Alphabet::Hiragana => &HIRAGANA,
Alphabet::Katakana => &KATAKANA,
Alphabet::Latin => &LATIN,
Alphabet::Tamil => &TAMIL,
Alphabet::Telugu => &TELUGU,
Alphabet::Thai => &THAI,
}
}
}
/// A set of characters offering membership tests for single characters
/// (`is_char_match`) and for whole strings (`is_match`).
pub(crate) struct CharSet {
// Every member character is stored individually (the constructors expand
// each code point range), so a membership test is a plain set lookup.
characters: AHashSet<char>,
}
impl CharSet {
    /// Builds a set holding every character of all the named script tables.
    ///
    /// Each entry of `char_classes` is looked up by exact name in
    /// `crate::script::BY_NAME`; every inclusive `(start, end)` range of the
    /// matching table is expanded into individual characters. An empty
    /// `char_classes` slice produces an empty set.
    ///
    /// # Panics
    ///
    /// Panics if a name has no entry in `crate::script::BY_NAME`. The names
    /// are expected to be compile-time constants, so a miss is a programming
    /// error; the panic message names the offending entry.
    pub fn from_char_classes(char_classes: &[&str]) -> Self {
        let mut characters = AHashSet::new();
        for char_class in char_classes {
            let entry = crate::script::BY_NAME
                .iter()
                .find(|(name, _)| *name == *char_class)
                .unwrap_or_else(|| {
                    panic!(
                        "no script table named {:?} in crate::script::BY_NAME",
                        char_class
                    )
                });
            for &(start, end) in entry.1 {
                // Ranges are inclusive on both ends.
                characters.extend(start..=end);
            }
        }
        CharSet { characters }
    }

    /// Builds a set from a single named script table.
    ///
    /// Convenience wrapper around [`CharSet::from_char_classes`]; panics
    /// under the same condition.
    pub fn from_char_class(char_class: &str) -> Self {
        Self::from_char_classes(&[char_class])
    }

    /// Returns `true` if every character of `text` is contained in the set.
    ///
    /// An empty `text` has no character that could fail the test, so the
    /// result is `true`.
    pub fn is_match(&self, text: &str) -> bool {
        text.chars().all(|ch| self.is_char_match(ch))
    }

    /// Returns `true` if `ch` is contained in the set.
    pub fn is_char_match(&self, ch: char) -> bool {
        self.characters.contains(&ch)
    }
}
// One character set per `Alphabet` variant, handed out by `Alphabet::char_set`.
// Each set is wrapped in `Lazy`, so the closure calling
// `CharSet::from_char_class` is not run when the static is declared.
// The string passed to `from_char_class` must be a name known to
// `CharSet::from_char_classes`, which panics on an unknown name.
static ARABIC: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Arabic"));
static ARMENIAN: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Armenian"));
static BENGALI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Bengali"));
static CYRILLIC: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Cyrillic"));
static DEVANAGARI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Devanagari"));
static GEORGIAN: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Georgian"));
static GREEK: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Greek"));
static GUJARATI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Gujarati"));
static GURMUKHI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Gurmukhi"));
static HAN: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Han"));
static HANGUL: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Hangul"));
static HEBREW: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Hebrew"));
static HIRAGANA: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Hiragana"));
static KATAKANA: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Katakana"));
static LATIN: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Latin"));
static TAMIL: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Tamil"));
static TELUGU: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Telugu"));
static THAI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Thai"));