use std::cmp::Reverse;
use super::chars;
use super::script::Script;
use crate::utils::is_stop_char;
type ScriptCounter = (Script, fn(char) -> bool, usize);
pub fn detect_script(text: &str) -> Option<Script> {
let raw_info = raw_detect_script(text);
raw_info.main_script()
}
#[derive(Debug)]
pub struct RawScriptInfo {
pub counters: Vec<(Script, usize)>,
}
impl RawScriptInfo {
fn new(mut counters: Vec<(Script, usize)>) -> Self {
counters.sort_unstable_by_key(|(_, score)| Reverse(*score));
Self { counters }
}
pub(crate) fn main_script(&self) -> Option<Script> {
let pair = self.counters.first().expect("counters must not be empty");
if pair.1 > 0 {
Some(pair.0)
} else {
None
}
}
pub(crate) fn count(&self, script: Script) -> usize {
self.counters
.iter()
.find(|(s, _count)| *s == script)
.expect("count() failed because script is not found")
.1
}
}
pub fn raw_detect_script(text: &str) -> RawScriptInfo {
let mut script_counters: [ScriptCounter; 25] = [
(Script::Latin, chars::is_latin, 0),
(Script::Cyrillic, chars::is_cyrillic, 0),
(Script::Arabic, chars::is_arabic, 0),
(Script::Mandarin, chars::is_mandarin, 0),
(Script::Devanagari, chars::is_devanagari, 0),
(Script::Hebrew, chars::is_hebrew, 0),
(Script::Ethiopic, chars::is_ethiopic, 0),
(Script::Georgian, chars::is_georgian, 0),
(Script::Bengali, chars::is_bengali, 0),
(Script::Hangul, chars::is_hangul, 0),
(Script::Hiragana, chars::is_hiragana, 0),
(Script::Katakana, chars::is_katakana, 0),
(Script::Greek, chars::is_greek, 0),
(Script::Kannada, chars::is_kannada, 0),
(Script::Tamil, chars::is_tamil, 0),
(Script::Thai, chars::is_thai, 0),
(Script::Gujarati, chars::is_gujarati, 0),
(Script::Gurmukhi, chars::is_gurmukhi, 0),
(Script::Telugu, chars::is_telugu, 0),
(Script::Malayalam, chars::is_malayalam, 0),
(Script::Oriya, chars::is_oriya, 0),
(Script::Myanmar, chars::is_myanmar, 0),
(Script::Sinhala, chars::is_sinhala, 0),
(Script::Khmer, chars::is_khmer, 0),
(Script::Armenian, chars::is_armenian, 0),
];
for ch in text.chars() {
if is_stop_char(ch) {
continue;
}
for i in 0..script_counters.len() {
let found = {
let (_script, check_fn, ref mut count) = script_counters[i];
if check_fn(ch) {
*count += 1;
true
} else {
false
}
};
if found {
if i > 0 {
script_counters.swap(i - 1, i);
}
break;
}
}
}
let counters: Vec<(Script, usize)> = script_counters
.iter()
.map(|&(script, _, count)| (script, count))
.collect();
RawScriptInfo::new(counters)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_script_name() {
assert_eq!(Script::Cyrillic.name(), "Cyrillic");
assert_eq!(Script::Katakana.name(), "Katakana");
}
#[test]
fn test_detect_script() {
assert_eq!(detect_script("1234567890-,;!"), None);
assert_eq!(detect_script("Hello!"), Some(Script::Latin));
assert_eq!(detect_script("Привет всем!"), Some(Script::Cyrillic));
assert_eq!(
detect_script("ქართული ენა მსოფლიო "),
Some(Script::Georgian)
);
assert_eq!(
detect_script("県見夜上温国阪題富販"),
Some(Script::Mandarin)
);
assert_eq!(
detect_script(" ككل حوالي 1.6، ومعظم الناس "),
Some(Script::Arabic)
);
assert_eq!(
detect_script("हिमालयी वन चिड़िया (जूथेरा सालिमअली) चिड़िया की एक प्रजाति है"),
Some(Script::Devanagari)
);
assert_eq!(
detect_script("היסטוריה והתפתחות של האלפבית העברי"),
Some(Script::Hebrew)
);
assert_eq!(
detect_script("የኢትዮጵያ ፌዴራላዊ ዴሞክራሲያዊሪፐብሊክ"),
Some(Script::Ethiopic)
);
assert_eq!(
detect_script("Привет! Текст на русском with some English."),
Some(Script::Cyrillic)
);
assert_eq!(
detect_script("Russian word любовь means love."),
Some(Script::Latin)
);
}
}