1use std::cmp::Reverse;
2
3use super::chars;
4use super::script::Script;
5use crate::utils::is_stop_char;
6
/// Working entry used while counting characters: the script, the predicate
/// that tests whether a character belongs to that script, and the number of
/// characters matched so far.
type ScriptCounter = (Script, fn(char) -> bool, usize);
8
9pub fn detect_script(text: &str) -> Option<Script> {
19 let raw_info = raw_detect_script(text);
20 raw_info.main_script()
21}
22
23#[derive(Debug)]
24pub struct RawScriptInfo {
25 pub counters: Vec<(Script, usize)>,
26}
27
28impl RawScriptInfo {
29 fn new(mut counters: Vec<(Script, usize)>) -> Self {
30 counters.sort_unstable_by_key(|(_, score)| Reverse(*score));
31 Self { counters }
32 }
33
34 pub(crate) fn main_script(&self) -> Option<Script> {
35 let pair = self.counters.first().expect("counters must not be empty");
38 if pair.1 > 0 { Some(pair.0) } else { None }
39 }
40
41 pub(crate) fn count(&self, script: Script) -> usize {
42 self.counters
45 .iter()
46 .find(|(s, _count)| *s == script)
47 .expect("count() failed because script is not found")
48 .1
49 }
50}
51
52pub fn raw_detect_script(text: &str) -> RawScriptInfo {
53 let mut script_counters: [ScriptCounter; 25] = [
54 (Script::Latin, chars::is_latin, 0),
55 (Script::Cyrillic, chars::is_cyrillic, 0),
56 (Script::Arabic, chars::is_arabic, 0),
57 (Script::Mandarin, chars::is_mandarin, 0),
58 (Script::Devanagari, chars::is_devanagari, 0),
59 (Script::Hebrew, chars::is_hebrew, 0),
60 (Script::Ethiopic, chars::is_ethiopic, 0),
61 (Script::Georgian, chars::is_georgian, 0),
62 (Script::Bengali, chars::is_bengali, 0),
63 (Script::Hangul, chars::is_hangul, 0),
64 (Script::Hiragana, chars::is_hiragana, 0),
65 (Script::Katakana, chars::is_katakana, 0),
66 (Script::Greek, chars::is_greek, 0),
67 (Script::Kannada, chars::is_kannada, 0),
68 (Script::Tamil, chars::is_tamil, 0),
69 (Script::Thai, chars::is_thai, 0),
70 (Script::Gujarati, chars::is_gujarati, 0),
71 (Script::Gurmukhi, chars::is_gurmukhi, 0),
72 (Script::Telugu, chars::is_telugu, 0),
73 (Script::Malayalam, chars::is_malayalam, 0),
74 (Script::Oriya, chars::is_oriya, 0),
75 (Script::Myanmar, chars::is_myanmar, 0),
76 (Script::Sinhala, chars::is_sinhala, 0),
77 (Script::Khmer, chars::is_khmer, 0),
78 (Script::Armenian, chars::is_armenian, 0),
79 ];
80
81 for ch in text.chars() {
82 if is_stop_char(ch) {
83 continue;
84 }
85
86 for i in 0..script_counters.len() {
89 let found = {
90 let (_script, check_fn, ref mut count) = script_counters[i];
91 if check_fn(ch) {
92 *count += 1;
93 true
94 } else {
95 false
96 }
97 };
98 if found {
101 if i > 0 {
105 script_counters.swap(i - 1, i);
106 }
107 break;
108 }
109 }
110 }
111
112 let counters: Vec<(Script, usize)> = script_counters
113 .iter()
114 .map(|&(script, _, count)| (script, count))
115 .collect();
116
117 RawScriptInfo::new(counters)
118}
119
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `detect_script` on each input, in order, and compares the result
    /// with the expected script.
    fn assert_scripts(cases: &[(&str, Option<Script>)]) {
        for &(input, expected) in cases {
            assert_eq!(detect_script(input), expected, "input: {:?}", input);
        }
    }

    #[test]
    fn test_script_name() {
        for &(script, expected_name) in &[
            (Script::Cyrillic, "Cyrillic"),
            (Script::Katakana, "Katakana"),
        ] {
            assert_eq!(script.name(), expected_name);
        }
    }

    #[test]
    fn test_detect_script() {
        // Text with no script characters, followed by single-script samples.
        assert_scripts(&[
            ("1234567890-,;!", None),
            ("Hello!", Some(Script::Latin)),
            ("Привет всем!", Some(Script::Cyrillic)),
            ("ქართული ენა მსოფლიო ", Some(Script::Georgian)),
            ("県見夜上温国阪題富販", Some(Script::Mandarin)),
            (" ككل حوالي 1.6، ومعظم الناس ", Some(Script::Arabic)),
            (
                "हिमालयी वन चिड़िया (जूथेरा सालिमअली) चिड़िया की एक प्रजाति है",
                Some(Script::Devanagari),
            ),
            ("היסטוריה והתפתחות של האלפבית העברי", Some(Script::Hebrew)),
            ("የኢትዮጵያ ፌዴራላዊ ዴሞክራሲያዊሪፐብሊክ", Some(Script::Ethiopic)),
        ]);

        // Mixed scripts: the script with more characters wins.
        assert_scripts(&[
            (
                "Привет! Текст на русском with some English.",
                Some(Script::Cyrillic),
            ),
            ("Russian word любовь means love.", Some(Script::Latin)),
        ]);
    }
}