// whatlang/scripts/detect.rs

1use std::cmp::Reverse;
2
3use super::chars;
4use super::script::Script;
5use crate::utils::is_stop_char;
6
/// One row of the detection table used by `raw_detect_script`: the script,
/// the predicate that tells whether a character belongs to that script, and
/// the number of characters attributed to the script so far.
type ScriptCounter = (Script, fn(char) -> bool, usize);
8
9/// Detect only a script by a given text.
10/// Works much faster than a complete detection with `detect`.
11///
12/// # Example
13/// ```
14/// use whatlang::{detect_script, Script};
15/// let script = detect_script("Благодаря Эсперанто вы обрётете друзей по всему миру!").unwrap();
16/// assert_eq!(script, Script::Cyrillic);
17/// ```
18pub fn detect_script(text: &str) -> Option<Script> {
19    let raw_info = raw_detect_script(text);
20    raw_info.main_script()
21}
22
/// Raw outcome of script detection: a character count per script.
#[derive(Debug)]
pub struct RawScriptInfo {
    /// `(script, count)` pairs. When produced by `raw_detect_script` there is
    /// one pair for every script in its table, ordered from the highest count
    /// to the lowest.
    pub counters: Vec<(Script, usize)>,
}
27
28impl RawScriptInfo {
29    fn new(mut counters: Vec<(Script, usize)>) -> Self {
30        counters.sort_unstable_by_key(|(_, score)| Reverse(*score));
31        Self { counters }
32    }
33
34    pub(crate) fn main_script(&self) -> Option<Script> {
35        // expect - is safe because self.counters is never expected to be empty
36        // See raw_detect_script().
37        let pair = self.counters.first().expect("counters must not be empty");
38        if pair.1 > 0 { Some(pair.0) } else { None }
39    }
40
41    pub(crate) fn count(&self, script: Script) -> usize {
42        // expect - is safe because self.counters always have all scripts
43        // See raw_detect_script().
44        self.counters
45            .iter()
46            .find(|(s, _count)| *s == script)
47            .expect("count() failed because script is not found")
48            .1
49    }
50}
51
52pub fn raw_detect_script(text: &str) -> RawScriptInfo {
53    let mut script_counters: [ScriptCounter; 25] = [
54        (Script::Latin, chars::is_latin, 0),
55        (Script::Cyrillic, chars::is_cyrillic, 0),
56        (Script::Arabic, chars::is_arabic, 0),
57        (Script::Mandarin, chars::is_mandarin, 0),
58        (Script::Devanagari, chars::is_devanagari, 0),
59        (Script::Hebrew, chars::is_hebrew, 0),
60        (Script::Ethiopic, chars::is_ethiopic, 0),
61        (Script::Georgian, chars::is_georgian, 0),
62        (Script::Bengali, chars::is_bengali, 0),
63        (Script::Hangul, chars::is_hangul, 0),
64        (Script::Hiragana, chars::is_hiragana, 0),
65        (Script::Katakana, chars::is_katakana, 0),
66        (Script::Greek, chars::is_greek, 0),
67        (Script::Kannada, chars::is_kannada, 0),
68        (Script::Tamil, chars::is_tamil, 0),
69        (Script::Thai, chars::is_thai, 0),
70        (Script::Gujarati, chars::is_gujarati, 0),
71        (Script::Gurmukhi, chars::is_gurmukhi, 0),
72        (Script::Telugu, chars::is_telugu, 0),
73        (Script::Malayalam, chars::is_malayalam, 0),
74        (Script::Oriya, chars::is_oriya, 0),
75        (Script::Myanmar, chars::is_myanmar, 0),
76        (Script::Sinhala, chars::is_sinhala, 0),
77        (Script::Khmer, chars::is_khmer, 0),
78        (Script::Armenian, chars::is_armenian, 0),
79    ];
80
81    for ch in text.chars() {
82        if is_stop_char(ch) {
83            continue;
84        }
85
86        // For performance reasons, we need to mutate script_counters by calling
87        // `swap` function, it would not be possible to do using normal iterator.
88        for i in 0..script_counters.len() {
89            let found = {
90                let (_script, check_fn, ref mut count) = script_counters[i];
91                if check_fn(ch) {
92                    *count += 1;
93                    true
94                } else {
95                    false
96                }
97            };
98            // Have to let borrow of count fall out of scope before doing swapping, or we could
99            // do this above.
100            if found {
101                // If script was found, move it closer to the front.
102                // If the text contains largely 1 or 2 scripts, this will
103                // cause these scripts to be eventually checked first.
104                if i > 0 {
105                    script_counters.swap(i - 1, i);
106                }
107                break;
108            }
109        }
110    }
111
112    let counters: Vec<(Script, usize)> = script_counters
113        .iter()
114        .map(|&(script, _, count)| (script, count))
115        .collect();
116
117    RawScriptInfo::new(counters)
118}
119
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `detect_script` over each `(text, expected)` pair and checks the result.
    fn check_detection(cases: &[(&str, Option<Script>)]) {
        for &(text, expected) in cases {
            assert_eq!(
                detect_script(text),
                expected,
                "unexpected script for {:?}",
                text
            );
        }
    }

    #[test]
    fn test_script_name() {
        let expected_names = [
            (Script::Cyrillic, "Cyrillic"),
            (Script::Katakana, "Katakana"),
        ];
        for &(script, name) in expected_names.iter() {
            assert_eq!(script.name(), name);
        }
    }

    #[test]
    fn test_detect_script() {
        // Digits and punctuation only: no script is reported.
        check_detection(&[("1234567890-,;!", None)]);

        // One script
        check_detection(&[
            ("Hello!", Some(Script::Latin)),
            ("Привет всем!", Some(Script::Cyrillic)),
            ("ქართული ენა მსოფლიო ", Some(Script::Georgian)),
            ("県見夜上温国阪題富販", Some(Script::Mandarin)),
            (" ككل حوالي 1.6، ومعظم الناس ", Some(Script::Arabic)),
            (
                "हिमालयी वन चिड़िया (जूथेरा सालिमअली) चिड़िया की एक प्रजाति है",
                Some(Script::Devanagari),
            ),
            ("היסטוריה והתפתחות של האלפבית העברי", Some(Script::Hebrew)),
            ("የኢትዮጵያ ፌዴራላዊ ዴሞክራሲያዊሪፐብሊክ", Some(Script::Ethiopic)),
        ]);

        // Mixed scripts
        check_detection(&[
            (
                "Привет! Текст на русском with some English.",
                Some(Script::Cyrillic),
            ),
            ("Russian word любовь means love.", Some(Script::Latin)),
        ]);
    }
}