/*
 * Copyright © 2020-present Peter M. Stahl pemistahl@gmail.com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either expressed or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

use std::collections::HashMap;

use ahash::AHashSet;
use once_cell::sync::Lazy;
use strum::IntoEnumIterator;
use strum_macros::EnumIter;

use crate::language::Language;

/// Writing systems whose characters can be tested with [`Alphabet::matches`]
/// and [`Alphabet::matches_char`].
///
/// Every variant is backed by a static `CharSet` that is built from the
/// character class of the same name (e.g. `Alphabet::Greek` uses the
/// `"Greek"` class).
///
/// The enum is a plain fieldless tag, so it is cheap to copy and derives
/// `Debug` to be usable in assertions and log output.
#[derive(Clone, Copy, Debug, EnumIter, Eq, Hash, PartialEq)]
pub(crate) enum Alphabet {
    Arabic,
    Armenian,
    Bengali,
    Cyrillic,
    Devanagari,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Han,
    Hangul,
    Hebrew,
    Hiragana,
    Katakana,
    Latin,
    Tamil,
    Telugu,
    Thai,
}

impl Alphabet {
    pub fn matches(&self, text: &str) -> bool {
        self.char_set().is_match(text)
    }

    pub fn matches_char(&self, ch: char) -> bool {
        self.char_set().is_char_match(ch)
    }

    pub fn all_supporting_single_language() -> HashMap<Alphabet, Language> {
        let mut alphabets = HashMap::new();
        for alphabet in Alphabet::iter() {
            let supported_languages = alphabet.supported_languages();
            if supported_languages.len() == 1 {
                alphabets.insert(alphabet, supported_languages[0]);
            }
        }
        alphabets
    }

    fn supported_languages(&self) -> Vec<Language> {
        let mut languages = vec![];
        for language in Language::iter() {
            if language.alphabets().contains(self) {
                languages.push(language);
            }
        }
        languages
    }

    fn char_set(&self) -> &Lazy<CharSet> {
        match self {
            Alphabet::Arabic => &ARABIC,
            Alphabet::Armenian => &ARMENIAN,
            Alphabet::Bengali => &BENGALI,
            Alphabet::Cyrillic => &CYRILLIC,
            Alphabet::Devanagari => &DEVANAGARI,
            Alphabet::Georgian => &GEORGIAN,
            Alphabet::Greek => &GREEK,
            Alphabet::Gujarati => &GUJARATI,
            Alphabet::Gurmukhi => &GURMUKHI,
            Alphabet::Han => &HAN,
            Alphabet::Hangul => &HANGUL,
            Alphabet::Hebrew => &HEBREW,
            Alphabet::Hiragana => &HIRAGANA,
            Alphabet::Katakana => &KATAKANA,
            Alphabet::Latin => &LATIN,
            Alphabet::Tamil => &TAMIL,
            Alphabet::Telugu => &TELUGU,
            Alphabet::Thai => &THAI,
        }
    }
}

/// A set of individual `char`s built from one or more named character
/// classes (see `CharSet::from_char_classes`).
pub(crate) struct CharSet {
    /// Every character contained in the classes this set was built from.
    characters: AHashSet<char>,
}

impl CharSet {
    /// Builds a set containing every character of all the named character
    /// classes.
    ///
    /// Each name is looked up in `crate::script::BY_NAME`; the matching entry
    /// is a table of inclusive `(start, end)` character ranges, all of which
    /// are expanded into the set.
    ///
    /// # Panics
    ///
    /// Panics if a name in `char_classes` has no entry in
    /// `crate::script::BY_NAME`. The panic message names the offending class.
    pub fn from_char_classes(char_classes: &[&str]) -> Self {
        let mut characters = AHashSet::new();

        for &char_class in char_classes {
            let table = crate::script::BY_NAME
                .iter()
                .find(|(name, _)| *name == char_class)
                // A missing class is a programming error; say which one it was
                // instead of failing with an anonymous `unwrap` panic.
                .unwrap_or_else(|| panic!("unknown character class: {}", char_class))
                .1;

            for &(start, end) in table {
                characters.extend(start..=end);
            }
        }

        CharSet { characters }
    }

    /// Builds a set from a single named character class.
    ///
    /// # Panics
    ///
    /// Panics under the same condition as [`CharSet::from_char_classes`].
    pub fn from_char_class(char_class: &str) -> Self {
        Self::from_char_classes(&[char_class])
    }

    /// Returns `true` if every character of `text` is contained in this set.
    ///
    /// An empty `text` yields `true`, since there is no character that fails
    /// the check.
    pub fn is_match(&self, text: &str) -> bool {
        text.chars().all(|ch| self.is_char_match(ch))
    }

    /// Returns `true` if `ch` is contained in this set.
    pub fn is_char_match(&self, ch: char) -> bool {
        self.characters.contains(&ch)
    }
}

static ARABIC: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Arabic"));
static ARMENIAN: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Armenian"));
static BENGALI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Bengali"));
static CYRILLIC: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Cyrillic"));
static DEVANAGARI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Devanagari"));
static GEORGIAN: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Georgian"));
static GREEK: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Greek"));
static GUJARATI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Gujarati"));
static GURMUKHI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Gurmukhi"));
static HAN: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Han"));
static HANGUL: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Hangul"));
static HEBREW: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Hebrew"));
static HIRAGANA: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Hiragana"));
static KATAKANA: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Katakana"));
static LATIN: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Latin"));
static TAMIL: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Tamil"));
static TELUGU: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Telugu"));
static THAI: Lazy<CharSet> = Lazy::new(|| CharSet::from_char_class("Thai"));